From ec1eda7b6270fc433682c2e705381bb7959c7195 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Sun, 18 Feb 2024 16:36:16 -0500 Subject: [PATCH] [ruby/prism] Account for encoding in regexp named captures https://github.com/ruby/prism/commit/17dc6b6281 --- prism/regexp.c | 19 ++++++++- test/prism/fixtures/regex_char_width.txt | 3 ++ test/prism/parser_test.rb | 1 + test/prism/ruby_parser_test.rb | 1 + test/prism/snapshots/regex_char_width.txt | 50 +++++++++++++++++++++++ 5 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 test/prism/fixtures/regex_char_width.txt create mode 100644 test/prism/snapshots/regex_char_width.txt diff --git a/prism/regexp.c b/prism/regexp.c index ba498ecc83..6e0fdd295c 100644 --- a/prism/regexp.c +++ b/prism/regexp.c @@ -565,21 +565,36 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) { */ static bool pm_regexp_parse_item(pm_regexp_parser_t *parser) { - switch (*parser->cursor++) { + switch (*parser->cursor) { case '^': case '$': + parser->cursor++; return true; case '\\': + parser->cursor++; if (!pm_regexp_char_is_eof(parser)) { parser->cursor++; } return pm_regexp_parse_quantifier(parser); case '(': + parser->cursor++; return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser); case '[': + parser->cursor++; return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser); - default: + default: { + size_t width; + if (!parser->encoding_changed) { + width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); + } else { + width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); + } + + if (width == 0) return false; // TODO: add appropriate error + parser->cursor += width; + return pm_regexp_parse_quantifier(parser); + } } } diff --git a/test/prism/fixtures/regex_char_width.txt b/test/prism/fixtures/regex_char_width.txt new file mode 100644 index 0000000000..7096b71584 --- /dev/null +++ 
b/test/prism/fixtures/regex_char_width.txt @@ -0,0 +1,3 @@ +# encoding: sjis +/Ⅷ(?<a>.)Ⅹ(?<b>.)/ =~ 'ⅧaⅩb' +[a, b] diff --git a/test/prism/parser_test.rb b/test/prism/parser_test.rb index 26cc2f5b97..ad06af4359 100644 --- a/test/prism/parser_test.rb +++ b/test/prism/parser_test.rb @@ -55,6 +55,7 @@ module Prism dos_endings.txt heredocs_with_ignored_newlines.txt regex.txt + regex_char_width.txt spanning_heredoc.txt spanning_heredoc_newlines.txt tilde_heredocs.txt diff --git a/test/prism/ruby_parser_test.rb b/test/prism/ruby_parser_test.rb index f89aa4c23e..a71d05e78c 100644 --- a/test/prism/ruby_parser_test.rb +++ b/test/prism/ruby_parser_test.rb @@ -30,6 +30,7 @@ module Prism todos = %w[ heredocs_nested.txt newline_terminated.txt + regex_char_width.txt seattlerb/bug169.txt seattlerb/dstr_evstr.txt seattlerb/heredoc_squiggly_interp.txt diff --git a/test/prism/snapshots/regex_char_width.txt b/test/prism/snapshots/regex_char_width.txt new file mode 100644 index 0000000000..6bf2169b2f --- /dev/null +++ b/test/prism/snapshots/regex_char_width.txt @@ -0,0 +1,50 @@ +@ ProgramNode (location: (2,0)-(3,6)) +├── locals: [:a, :b] +└── statements: + @ StatementsNode (location: (2,0)-(3,6)) + └── body: (length: 2) + ├── @ MatchWriteNode (location: (2,0)-(2,36)) + │ ├── call: + │ │ @ CallNode (location: (2,0)-(2,36)) + │ │ ├── flags: ∅ + │ │ ├── receiver: + │ │ │ @ RegularExpressionNode (location: (2,0)-(2,22)) + │ │ │ ├── flags: ∅ + │ │ │ ├── opening_loc: (2,0)-(2,1) = "/" + │ │ │ ├── content_loc: (2,1)-(2,21) = "\x{E285}\xA7(?<a>.)\x{E285}\xA9(?<b>.)" + │ │ │ ├── closing_loc: (2,21)-(2,22) = "/" + │ │ │ └── unescaped: "\x{E285}\xA7(?<a>.)\x{E285}\xA9(?<b>.)" + │ │ ├── call_operator_loc: ∅ + │ │ ├── name: :=~ + │ │ ├── message_loc: (2,23)-(2,25) = "=~" + │ │ ├── opening_loc: ∅ + │ │ ├── arguments: + │ │ │ @ ArgumentsNode (location: (2,26)-(2,36)) + │ │ │ ├── flags: ∅ + │ │ │ └── arguments: (length: 1) + │ │ │ └── @ StringNode (location: (2,26)-(2,36)) + │ │ │ ├── flags: ∅ + │ │ │ ├── opening_loc: 
(2,26)-(2,27) = "'" + │ │ │ ├── content_loc: (2,27)-(2,35) = "\x{E285}\xA7a\x{E285}\xA9b" + │ │ │ ├── closing_loc: (2,35)-(2,36) = "'" + │ │ │ └── unescaped: "\x{E285}\xA7a\x{E285}\xA9b" + │ │ ├── closing_loc: ∅ + │ │ └── block: ∅ + │ └── targets: (length: 2) + │ ├── @ LocalVariableTargetNode (location: (2,7)-(2,8)) + │ │ ├── name: :a + │ │ └── depth: 0 + │ └── @ LocalVariableTargetNode (location: (2,17)-(2,18)) + │ ├── name: :b + │ └── depth: 0 + └── @ ArrayNode (location: (3,0)-(3,6)) + ├── flags: ∅ + ├── elements: (length: 2) + │ ├── @ LocalVariableReadNode (location: (3,1)-(3,2)) + │ │ ├── name: :a + │ │ └── depth: 0 + │ └── @ LocalVariableReadNode (location: (3,4)-(3,5)) + │ ├── name: :b + │ └── depth: 0 + ├── opening_loc: (3,0)-(3,1) = "[" + └── closing_loc: (3,5)-(3,6) = "]"