ruby/ext/ripper/eventids2.c
yui-knk d8601621ed Enhance keep_tokens option for RubyVM::AbstractSyntaxTree parsing methods
Implementations of the Language Server Protocol (LSP) sometimes need token information.
For example, both `m(1)` and `m(1, )` have the same AST structure apart from node locations,
so it is impossible to detect the presence of `,` from the AST alone. However, in the latter case,
it might be better to suggest a list of variables for the second argument.
Token information is important for such cases.

This commit adds the following option and methods:

* Add `keep_tokens` option for `RubyVM::AbstractSyntaxTree.parse`, `.parse_file` and `.of`
* Add `RubyVM::AbstractSyntaxTree::Node#tokens` which returns the tokens for the node, including tokens for its descendant nodes.
* Add `RubyVM::AbstractSyntaxTree::Node#all_tokens` which returns all tokens for the input script regardless of the receiver node.

[Feature #19070]

Impacts on memory usage and performance are below:

Memory usage:

```
$ cat test.rb
root = RubyVM::AbstractSyntaxTree.parse_file(File.expand_path('../test/ruby/test_keyword.rb', __FILE__), keep_tokens: true)

$ /usr/bin/time -f %Mkb /usr/local/bin/ruby -v
ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux]
11408kb

# keep_tokens :false
$ /usr/bin/time -f %Mkb /usr/local/bin/ruby test.rb
17508kb

# keep_tokens :true
$ /usr/bin/time -f %Mkb /usr/local/bin/ruby test.rb
30960kb
```

Performance:

```
$ cat ../ast_keep_tokens.yml
prelude: |
  src = <<~SRC
    module M
      class C
        def m1(a, b)
          1 + a + b
        end
      end
    end
  SRC
benchmark:
  without_keep_tokens: |
    RubyVM::AbstractSyntaxTree.parse(src, keep_tokens: false)
  with_keep_tokens: |
    RubyVM::AbstractSyntaxTree.parse(src, keep_tokens: true)

$ make benchmark COMPARE_RUBY="./ruby" ARGS=../ast_keep_tokens.yml
/home/kaneko.y/.rbenv/shims/ruby --disable=gems -rrubygems -I../benchmark/lib ../benchmark/benchmark-driver/exe/benchmark-driver \
            --executables="compare-ruby::./ruby -I.ext/common --disable-gem" \
            --executables="built-ruby::./miniruby -I../lib -I. -I.ext/common  ../tool/runruby.rb --extout=.ext  -- --disable-gems --disable-gem" \
            --output=markdown --output-compare -v ../ast_keep_tokens.yml
compare-ruby: ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux]
built-ruby: ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux]
warming up..

|                     |compare-ruby|built-ruby|
|:--------------------|-----------:|---------:|
|without_keep_tokens  |     21.659k|   21.303k|
|                     |       1.02x|         -|
|with_keep_tokens     |      6.220k|    5.691k|
|                     |       1.09x|         -|
```
2022-11-21 09:01:34 +09:00

294 lines
8.1 KiB
C

/*
 * Table of IDs, one member per scanner event name.
 *
 * ripper_init_eventids2() stores into each ripper_id_<name> member the ID
 * obtained for the string "on_<name>".  ripper_token2eventid() does not
 * access the members by name: it records each member's offsetof() byte
 * offset in a lookup table and reads the ID back through that offset, so
 * every member must stay a plain ID.
 */
typedef struct {
ID ripper_id_backref;
ID ripper_id_backtick;
ID ripper_id_comma;
ID ripper_id_const;
ID ripper_id_cvar;
ID ripper_id_embexpr_beg;
ID ripper_id_embexpr_end;
ID ripper_id_embvar;
ID ripper_id_float;
ID ripper_id_gvar;
ID ripper_id_ident;
ID ripper_id_imaginary;
ID ripper_id_int;
ID ripper_id_ivar;
ID ripper_id_kw;
ID ripper_id_lbrace;
ID ripper_id_lbracket;
ID ripper_id_lparen;
ID ripper_id_nl;
ID ripper_id_op;
ID ripper_id_period;
ID ripper_id_rbrace;
ID ripper_id_rbracket;
ID ripper_id_rparen;
ID ripper_id_semicolon;
ID ripper_id_symbeg;
ID ripper_id_tstring_beg;
ID ripper_id_tstring_content;
ID ripper_id_tstring_end;
ID ripper_id_words_beg;
ID ripper_id_qwords_beg;
ID ripper_id_qsymbols_beg;
ID ripper_id_symbols_beg;
ID ripper_id_words_sep;
ID ripper_id_rational;
ID ripper_id_regexp_beg;
ID ripper_id_regexp_end;
ID ripper_id_label;
ID ripper_id_label_end;
ID ripper_id_tlambda;
ID ripper_id_tlambeg;
ID ripper_id_ignored_nl;
ID ripper_id_comment;
ID ripper_id_embdoc_beg;
ID ripper_id_embdoc;
ID ripper_id_embdoc_end;
ID ripper_id_sp;
ID ripper_id_heredoc_beg;
ID ripper_id_heredoc_end;
ID ripper_id___end__;
/* Also used by ripper_token2eventid() as the fallback for unmapped tokens below 128. */
ID ripper_id_CHAR;
} ripper_scanner_ids_t;
/* The single, file-local instance of the table; filled in by ripper_init_eventids2(). */
static ripper_scanner_ids_t ripper_scanner_ids;
#include "eventids2table.c"
/*
 * Populate every member of ripper_scanner_ids.
 *
 * For each event name, ripper_id_<name> is assigned the result of
 * rb_intern_const("on_<name>"); e.g. ripper_id_comma receives the ID for
 * "on_comma".  Takes no arguments and returns nothing.
 */
static void
ripper_init_eventids2(void)
{
/*
 * set_id2(name): token-pastes `name` onto "ripper_id_" to select the member
 * and stringizes it after "on_" to build the string passed to
 * rb_intern_const().  The macro is not #undef'd in this function.
 */
#define set_id2(name) ripper_scanner_ids.ripper_id_##name = rb_intern_const("on_"#name)
/* One invocation per struct member, in the same order as the declaration. */
set_id2(backref);
set_id2(backtick);
set_id2(comma);
set_id2(const);
set_id2(cvar);
set_id2(embexpr_beg);
set_id2(embexpr_end);
set_id2(embvar);
set_id2(float);
set_id2(gvar);
set_id2(ident);
set_id2(imaginary);
set_id2(int);
set_id2(ivar);
set_id2(kw);
set_id2(lbrace);
set_id2(lbracket);
set_id2(lparen);
set_id2(nl);
set_id2(op);
set_id2(period);
set_id2(rbrace);
set_id2(rbracket);
set_id2(rparen);
set_id2(semicolon);
set_id2(symbeg);
set_id2(tstring_beg);
set_id2(tstring_content);
set_id2(tstring_end);
set_id2(words_beg);
set_id2(qwords_beg);
set_id2(qsymbols_beg);
set_id2(symbols_beg);
set_id2(words_sep);
set_id2(rational);
set_id2(regexp_beg);
set_id2(regexp_end);
set_id2(label);
set_id2(label_end);
set_id2(tlambda);
set_id2(tlambeg);
set_id2(ignored_nl);
set_id2(comment);
set_id2(embdoc_beg);
set_id2(embdoc);
set_id2(embdoc_end);
set_id2(sp);
set_id2(heredoc_beg);
set_id2(heredoc_end);
set_id2(__end__);
set_id2(CHAR);
}
/*
 * Compile-time check that the k__END__ token value is below SHRT_MAX.
 * NOTE(review): presumably k__END__ is the largest token value used as an
 * index into the lookup table in ripper_token2eventid() -- confirm.
 */
STATIC_ASSERT(k__END___range, k__END__ < SHRT_MAX);
/*
 * Compile-time check that the ID table occupies fewer than SHRT_MAX bytes.
 * ripper_token2eventid() stores each member's byte offset (plus one) in an
 * `unsigned short` array, so every offset has to fit in that type.
 */
STATIC_ASSERT(ripper_scanner_ids_size, sizeof(ripper_scanner_ids) < SHRT_MAX);
/*
 * Map a token value to the ID stored in the matching member of
 * ripper_scanner_ids.
 *
 * tok: the token value; it is used as an index into a static lookup table.
 *
 * Returns the ID of the member the table associates with `tok`.  A token
 * with no table entry whose value is below 128 yields ripper_id_CHAR.
 * Any other token is reported with rb_raise(rb_eRuntimeError, ...), after
 * which the function is marked as not returning.
 */
static ID
ripper_token2eventid(enum yytokentype tok)
{
    /*
     * Each table entry is the byte offset of a ripper_scanner_ids_t member
     * plus one.  The +1 bias lets 0 -- the implicit value of every index
     * that has no designated initializer -- mean "no mapping", even for the
     * member that sits at offset 0.
     */
#define O(member) ((int)offsetof(ripper_scanner_ids_t, ripper_id_##member)+1)
    static const unsigned short offsets[] = {
        [' '] = O(words_sep),
        ['!'] = O(op),
        ['%'] = O(op),
        ['&'] = O(op),
        ['*'] = O(op),
        ['+'] = O(op),
        ['-'] = O(op),
        ['/'] = O(op),
        ['<'] = O(op),
        ['='] = O(op),
        ['>'] = O(op),
        ['?'] = O(op),
        ['^'] = O(op),
        ['|'] = O(op),
        ['~'] = O(op),
        [':'] = O(op),
        [','] = O(comma),
        ['.'] = O(period),
        [';'] = O(semicolon),
        ['`'] = O(backtick),
        ['\n'] = O(nl),
        [keyword_alias] = O(kw),
        [keyword_and] = O(kw),
        [keyword_begin] = O(kw),
        [keyword_break] = O(kw),
        [keyword_case] = O(kw),
        [keyword_class] = O(kw),
        [keyword_def] = O(kw),
        [keyword_defined] = O(kw),
        [keyword_do] = O(kw),
        [keyword_do_block] = O(kw),
        [keyword_do_cond] = O(kw),
        [keyword_else] = O(kw),
        [keyword_elsif] = O(kw),
        [keyword_end] = O(kw),
        [keyword_ensure] = O(kw),
        [keyword_false] = O(kw),
        [keyword_for] = O(kw),
        [keyword_if] = O(kw),
        [modifier_if] = O(kw),
        [keyword_in] = O(kw),
        [keyword_module] = O(kw),
        [keyword_next] = O(kw),
        [keyword_nil] = O(kw),
        [keyword_not] = O(kw),
        [keyword_or] = O(kw),
        [keyword_redo] = O(kw),
        [keyword_rescue] = O(kw),
        [modifier_rescue] = O(kw),
        [keyword_retry] = O(kw),
        [keyword_return] = O(kw),
        [keyword_self] = O(kw),
        [keyword_super] = O(kw),
        [keyword_then] = O(kw),
        [keyword_true] = O(kw),
        [keyword_undef] = O(kw),
        [keyword_unless] = O(kw),
        [modifier_unless] = O(kw),
        [keyword_until] = O(kw),
        [modifier_until] = O(kw),
        [keyword_when] = O(kw),
        [keyword_while] = O(kw),
        [modifier_while] = O(kw),
        [keyword_yield] = O(kw),
        [keyword__FILE__] = O(kw),
        [keyword__LINE__] = O(kw),
        [keyword__ENCODING__] = O(kw),
        [keyword_BEGIN] = O(kw),
        [keyword_END] = O(kw),
        [keyword_do_LAMBDA] = O(kw),
        [tAMPER] = O(op),
        [tANDOP] = O(op),
        [tAREF] = O(op),
        [tASET] = O(op),
        [tASSOC] = O(op),
        [tBACK_REF] = O(backref),
        [tCHAR] = O(CHAR),
        [tCMP] = O(op),
        [tCOLON2] = O(op),
        [tCOLON3] = O(op),
        [tCONSTANT] = O(const),
        [tCVAR] = O(cvar),
        [tDOT2] = O(op),
        [tDOT3] = O(op),
        [tBDOT2] = O(op),
        [tBDOT3] = O(op),
        [tEQ] = O(op),
        [tEQQ] = O(op),
        [tFID] = O(ident),
        [tFLOAT] = O(float),
        [tGEQ] = O(op),
        [tGVAR] = O(gvar),
        [tIDENTIFIER] = O(ident),
        [tIMAGINARY] = O(imaginary),
        [tINTEGER] = O(int),
        [tIVAR] = O(ivar),
        [tLBRACE] = O(lbrace),
        [tLBRACE_ARG] = O(lbrace),
        ['{'] = O(lbrace),
        ['}'] = O(rbrace),
        [tLBRACK] = O(lbracket),
        ['['] = O(lbracket),
        [']'] = O(rbracket),
        [tLEQ] = O(op),
        [tLPAREN] = O(lparen),
        [tLPAREN_ARG] = O(lparen),
        ['('] = O(lparen),
        [')'] = O(rparen),
        [tLSHFT] = O(op),
        [tMATCH] = O(op),
        [tNEQ] = O(op),
        [tNMATCH] = O(op),
        [tNTH_REF] = O(backref),
        [tOP_ASGN] = O(op),
        [tOROP] = O(op),
        [tPOW] = O(op),
        [tQWORDS_BEG] = O(qwords_beg),
        [tQSYMBOLS_BEG] = O(qsymbols_beg),
        [tSYMBOLS_BEG] = O(symbols_beg),
        [tRATIONAL] = O(rational),
        [tREGEXP_BEG] = O(regexp_beg),
        [tREGEXP_END] = O(regexp_end),
        [tRPAREN] = O(rparen),
        [tRSHFT] = O(op),
        [tSTAR] = O(op),
        [tDSTAR] = O(op),
        [tANDDOT] = O(op),
        [tSTRING_BEG] = O(tstring_beg),
        [tSTRING_CONTENT] = O(tstring_content),
        [tSTRING_DBEG] = O(embexpr_beg),
        [tSTRING_DEND] = O(embexpr_end),
        [tSTRING_DVAR] = O(embvar),
        [tSTRING_END] = O(tstring_end),
        [tSYMBEG] = O(symbeg),
        [tUMINUS] = O(op),
        [tUMINUS_NUM] = O(op),
        [tUPLUS] = O(op),
        [tWORDS_BEG] = O(words_beg),
        [tXSTRING_BEG] = O(backtick),
        [tLABEL] = O(label),
        [tLABEL_END] = O(label_end),
        [tLAMBDA] = O(tlambda),
        [tLAMBEG] = O(tlambeg),
        /* ripper specific tokens */
        [tIGNORED_NL] = O(ignored_nl),
        [tCOMMENT] = O(comment),
        [tEMBDOC_BEG] = O(embdoc_beg),
        [tEMBDOC] = O(embdoc),
        [tEMBDOC_END] = O(embdoc_end),
        [tSP] = O(sp),
        [tHEREDOC_BEG] = O(heredoc_beg),
        [tHEREDOC_END] = O(heredoc_end),
        [k__END__] = O(__end__),
    };
#undef O
    int i = (int)tok;
    if (i >= 0 && i < numberof(offsets)) {
        int off = offsets[i];
        if (off > 0) {
            /*
             * Remove the +1 bias *before* adding to the base pointer.
             * Writing `base - 1 + off` would first form a pointer one byte
             * in front of ripper_scanner_ids, which is undefined behaviour
             * even though the final address is valid.
             */
            return *(const ID *)((const char *)&ripper_scanner_ids + (off - 1));
        }
    }
    /* 128..256 are used as operator tokens */
    if (tok < 128) {
        return ripper_scanner_ids.ripper_id_CHAR;
    }
    rb_raise(rb_eRuntimeError, "[Ripper FATAL] unknown token %d", tok);
    UNREACHABLE_RETURN(0);
}