From 3f6187a94797d3c4a7db00563a885e4e613b51cf Mon Sep 17 00:00:00 2001 From: nagachika Date: Mon, 17 Jul 2023 18:04:41 +0900 Subject: [PATCH] merge revision(s) 1bc8838d60ef3fc6812d3b64ed87caaf0ae943d9: [Backport #19750] Handle unterminated unicode escapes in regexps This fixes an infinite loop possible after ec3542229b29ec93062e9d90e877ea29d3c19472. For \u{} escapes in regexps, skip validation in the parser, and rely on the regexp code to handle validation. This is necessary so that invalid unicode escapes in comments in extended regexps are allowed. Fixes [Bug #19750] Co-authored-by: Nobuyoshi Nakada --- parse.y | 97 ++++++++++++++++++++++++++++++++----------------- test/ruby/test_parse.rb | 16 ++++++++ 2 files changed, 79 insertions(+), 34 deletions(-) --- parse.y | 101 ++++++++++++++++++++++++++-------------- test/ruby/test_parse.rb | 16 +++++++ version.h | 2 +- 3 files changed, 82 insertions(+), 37 deletions(-) diff --git a/parse.y b/parse.y index bd3251e4e1..5642a55f67 100644 --- a/parse.y +++ b/parse.y @@ -7260,6 +7260,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp, return TRUE; } +static int tokadd_mbchar(struct parser_params *p, int c); + /* return value is for ?\u3042 */ static void tokadd_utf8(struct parser_params *p, rb_encoding **encp, @@ -7277,44 +7279,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp, if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); } if (peek(p, open_brace)) { /* handle \u{...} form */ - const char *second = NULL; - int c, last = nextc(p); - if (p->lex.pcur >= p->lex.pend) goto unterminated; - while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend); - while (c != close_brace) { - if (c == term) goto unterminated; - if (second == multiple_codepoints) - second = p->lex.pcur; - if (regexp_literal) tokadd(p, last); - if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) { - break; - } - while (ISSPACE(c = *p->lex.pcur)) { - if (++p->lex.pcur >= p->lex.pend) goto unterminated; - last = c; - } - if (term == -1 && !second) - second = multiple_codepoints; - } + if (regexp_literal && p->lex.strterm->u.literal.u1.func == str_regexp) { + /* + * Skip parsing validation code and copy bytes as-is until term or + * closing brace, in order to correctly handle extended regexps where + * invalid unicode escapes are allowed in comments. The regexp parser + * does its own validation and will catch any issues. + */ + int c = *p->lex.pcur; + tokadd(p, c); + for (c = *++p->lex.pcur; p->lex.pcur < p->lex.pend; c = *++p->lex.pcur) { + if (c == close_brace) { + tokadd(p, c); + ++p->lex.pcur; + break; + } + else if (c == term) { + break; + } + if (c == '\\' && p->lex.pcur + 1 < p->lex.pend) { + tokadd(p, c); + c = *++p->lex.pcur; + } + tokadd_mbchar(p, c); + } + } + else { + const char *second = NULL; + int c, last = nextc(p); + if (p->lex.pcur >= p->lex.pend) goto unterminated; + while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend); + while (c != close_brace) { + if (c == term) goto unterminated; + if (second == multiple_codepoints) + second = p->lex.pcur; + if (regexp_literal) tokadd(p, last); + if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) { + break; + } + while (ISSPACE(c = *p->lex.pcur)) { + if (++p->lex.pcur >= p->lex.pend) goto unterminated; + last = c; + } + if (term == -1 && !second) + second = multiple_codepoints; + } - if (c != close_brace) { - unterminated: - token_flush(p); - yyerror0("unterminated Unicode escape"); - return; - } - if (second && second != multiple_codepoints) { - const char *pcur = p->lex.pcur; - p->lex.pcur = second; - dispatch_scan_event(p, tSTRING_CONTENT); - token_flush(p); - p->lex.pcur = pcur; - yyerror0(multiple_codepoints); - token_flush(p); - } + if (c != close_brace) { + unterminated: + token_flush(p); + yyerror0("unterminated Unicode escape"); + return; + } + if (second && second != multiple_codepoints) { + const char *pcur = p->lex.pcur; + p->lex.pcur = second; + dispatch_scan_event(p, tSTRING_CONTENT); + token_flush(p); + p->lex.pcur = pcur; + yyerror0(multiple_codepoints); + token_flush(p); + } - if (regexp_literal) tokadd(p, close_brace); - nextc(p); + if (regexp_literal) tokadd(p, close_brace); + nextc(p); + } } else { /* handle \uxxxx form */ if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) { diff --git a/test/ruby/test_parse.rb b/test/ruby/test_parse.rb index 4488ea620e..a22f11aeae 100644 --- a/test/ruby/test_parse.rb +++ b/test/ruby/test_parse.rb @@ -1041,6 +1041,22 @@ x = __ENCODING__ assert_syntax_error(" 0b\n", /\^/) end + def test_unclosed_unicode_escape_at_eol_bug_19750 + assert_separately([], "#{<<-"begin;"}\n#{<<~'end;'}") + begin; + assert_syntax_error("/\\u", /too short escape sequence/) + assert_syntax_error("/\\u{", /unterminated regexp meets end of file/) + assert_syntax_error("/\\u{\\n", /invalid Unicode list/) + assert_syntax_error("/a#\\u{\\n/", /invalid Unicode list/) + re = eval("/a#\\u{\n$/x") + assert_match(re, 'a') + assert_not_match(re, 'a#') + re = eval("/a#\\u\n$/x") + assert_match(re, 'a') + assert_not_match(re, 'a#') + end; + end + def test_error_def_in_argument assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}") begin; diff --git a/version.h b/version.h index 403b4b020f..5c14532dd5 100644 --- a/version.h +++ b/version.h @@ -11,7 +11,7 @@ # define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR #define RUBY_VERSION_TEENY 2 #define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR -#define RUBY_PATCHLEVEL 92 +#define RUBY_PATCHLEVEL 93 #include "ruby/version.h" #include "ruby/internal/abi.h"