[ruby/prism] Optimize context_terminator with a lookup table

483aa89234
2025-08-15 05:29:10 +02:00 · 2025-05-06 08:57:30 -04:00 · 2025-05-06 08:57:30 -04:00 · 18e37ac430
commit 18e37ac430
parent 3ef8d833ab
2 changed files with 96 additions and 112 deletions
--- a/prism/config.yml
+++ b/prism/config.yml
@ -322,13 +322,42 @@ warnings:
  - UNUSED_LOCAL_VARIABLE
  - VOID_STATEMENT
 tokens:
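+  # The order of the tokens at the beginning is important: their values are
+  # used as bit positions in a 32-bit lookup table (context_terminators in
+  # prism.c), so every token in this leading group must have a value below 32.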
  - name: EOF
    value: 1
    comment: final token in the file
-  - name: MISSING
-    comment: "a token that was expected but not found"
-  - name: NOT_PROVIDED
-    comment: "a token that was not present but it is okay"
+  - name: BRACE_RIGHT
+    comment: "}"
+  - name: COMMA
+    comment: ","
+  - name: EMBEXPR_END
+    comment: "}"
+  - name: KEYWORD_DO
+    comment: "do"
+  - name: KEYWORD_ELSE
+    comment: "else"
+  - name: KEYWORD_ELSIF
+    comment: "elsif"
+  - name: KEYWORD_END
+    comment: "end"
+  - name: KEYWORD_ENSURE
+    comment: "ensure"
+  - name: KEYWORD_IN
+    comment: "in"
+  - name: KEYWORD_RESCUE
+    comment: "rescue"
+  - name: KEYWORD_THEN
+    comment: "then"
+  - name: KEYWORD_WHEN
+    comment: "when"
+  - name: NEWLINE
+    comment: "a newline character outside of other tokens"
+  - name: PARENTHESIS_RIGHT
+    comment: ")"
+  - name: SEMICOLON
+    comment: ";"
+  # Tokens from here on are not used for lookup, and can be in any order.
  - name: AMPERSAND
    comment: "&"
  - name: AMPERSAND_AMPERSAND
@ -351,8 +380,6 @@ tokens:
    comment: "!~"
  - name: BRACE_LEFT
    comment: "{"
-  - name: BRACE_RIGHT
-    comment: "}"
  - name: BRACKET_LEFT
    comment: "["
  - name: BRACKET_LEFT_ARRAY
@ -375,8 +402,6 @@ tokens:
    comment: ":"
  - name: COLON_COLON
    comment: "::"
-  - name: COMMA
-    comment: ","
  - name: COMMENT
    comment: "a comment"
  - name: CONSTANT
@ -395,8 +420,6 @@ tokens:
    comment: "a line inside of embedded documentation"
  - name: EMBEXPR_BEGIN
    comment: "#{"
-  - name: EMBEXPR_END
-    comment: "}"
  - name: EMBVAR
    comment: "#"
  - name: EQUAL
@ -463,20 +486,10 @@ tokens:
    comment: "def"
  - name: KEYWORD_DEFINED
    comment: "defined?"
-  - name: KEYWORD_DO
-    comment: "do"
  - name: KEYWORD_DO_LOOP
    comment: "do keyword for a predicate in a while, until, or for loop"
-  - name: KEYWORD_ELSE
-    comment: "else"
-  - name: KEYWORD_ELSIF
-    comment: "elsif"
-  - name: KEYWORD_END
-    comment: "end"
  - name: KEYWORD_END_UPCASE
    comment: "END"
-  - name: KEYWORD_ENSURE
-    comment: "ensure"
  - name: KEYWORD_FALSE
    comment: "false"
  - name: KEYWORD_FOR
@ -485,8 +498,6 @@ tokens:
    comment: "if"
  - name: KEYWORD_IF_MODIFIER
    comment: "if in the modifier form"
-  - name: KEYWORD_IN
-    comment: "in"
  - name: KEYWORD_MODULE
    comment: "module"
  - name: KEYWORD_NEXT
@ -499,8 +510,6 @@ tokens:
    comment: "or"
  - name: KEYWORD_REDO
    comment: "redo"
-  - name: KEYWORD_RESCUE
-    comment: "rescue"
  - name: KEYWORD_RESCUE_MODIFIER
    comment: "rescue in the modifier form"
  - name: KEYWORD_RETRY
@ -511,8 +520,6 @@ tokens:
    comment: "self"
  - name: KEYWORD_SUPER
    comment: "super"
-  - name: KEYWORD_THEN
-    comment: "then"
  - name: KEYWORD_TRUE
    comment: "true"
  - name: KEYWORD_UNDEF
@ -525,8 +532,6 @@ tokens:
    comment: "until"
  - name: KEYWORD_UNTIL_MODIFIER
    comment: "until in the modifier form"
-  - name: KEYWORD_WHEN
-    comment: "when"
  - name: KEYWORD_WHILE
    comment: "while"
  - name: KEYWORD_WHILE_MODIFIER
@ -563,16 +568,12 @@ tokens:
    comment: "-="
  - name: MINUS_GREATER
    comment: "->"
-  - name: NEWLINE
-    comment: "a newline character outside of other tokens"
  - name: NUMBERED_REFERENCE
    comment: "a numbered reference to a capture group in the previous regular expression match"
  - name: PARENTHESIS_LEFT
    comment: "("
  - name: PARENTHESIS_LEFT_PARENTHESES
    comment: "( for a parentheses node"
-  - name: PARENTHESIS_RIGHT
-    comment: ")"
  - name: PERCENT
    comment: "%"
  - name: PERCENT_EQUAL
@ -605,8 +606,6 @@ tokens:
    comment: "the beginning of a regular expression"
  - name: REGEXP_END
    comment: "the end of a regular expression"
-  - name: SEMICOLON
-    comment: ";"
  - name: SLASH
    comment: "/"
  - name: SLASH_EQUAL
@ -651,6 +650,10 @@ tokens:
    comment: "a separator between words in a list"
  - name: __END__
    comment: "marker for the point in the file at which the parser should stop"
+  - name: MISSING
+    comment: "a token that was expected but not found"
+  - name: NOT_PROVIDED
+    comment: "a token that was not present but it is okay"
 flags:
  - name: ArgumentsNodeFlags
    values:
--- a/prism/prism.c
+++ b/prism/prism.c
@ -8586,85 +8586,66 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
 /* Context manipulations                                                      */
 /******************************************************************************/

-static bool
-context_terminator(pm_context_t context, pm_token_t *token) {
-    switch (context) {
-        case PM_CONTEXT_MAIN:
-        case PM_CONTEXT_DEF_PARAMS:
-        case PM_CONTEXT_DEFINED:
-        case PM_CONTEXT_MULTI_TARGET:
-        case PM_CONTEXT_TERNARY:
-        case PM_CONTEXT_RESCUE_MODIFIER:
-            return token->type == PM_TOKEN_EOF;
-        case PM_CONTEXT_DEFAULT_PARAMS:
-            return token->type == PM_TOKEN_COMMA || token->type == PM_TOKEN_PARENTHESIS_RIGHT;
-        case PM_CONTEXT_PREEXE:
-        case PM_CONTEXT_POSTEXE:
-            return token->type == PM_TOKEN_BRACE_RIGHT;
-        case PM_CONTEXT_MODULE:
-        case PM_CONTEXT_CLASS:
-        case PM_CONTEXT_SCLASS:
-        case PM_CONTEXT_LAMBDA_DO_END:
-        case PM_CONTEXT_DEF:
-        case PM_CONTEXT_BLOCK_KEYWORDS:
-            return token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_RESCUE || token->type == PM_TOKEN_KEYWORD_ENSURE;
-        case PM_CONTEXT_WHILE:
-        case PM_CONTEXT_UNTIL:
-        case PM_CONTEXT_ELSE:
-        case PM_CONTEXT_FOR:
-        case PM_CONTEXT_BEGIN_ENSURE:
-        case PM_CONTEXT_BLOCK_ENSURE:
-        case PM_CONTEXT_CLASS_ENSURE:
-        case PM_CONTEXT_DEF_ENSURE:
-        case PM_CONTEXT_LAMBDA_ENSURE:
-        case PM_CONTEXT_MODULE_ENSURE:
-        case PM_CONTEXT_SCLASS_ENSURE:
-            return token->type == PM_TOKEN_KEYWORD_END;
-        case PM_CONTEXT_LOOP_PREDICATE:
-            return token->type == PM_TOKEN_KEYWORD_DO || token->type == PM_TOKEN_KEYWORD_THEN;
-        case PM_CONTEXT_FOR_INDEX:
-            return token->type == PM_TOKEN_KEYWORD_IN;
-        case PM_CONTEXT_CASE_WHEN:
-            return token->type == PM_TOKEN_KEYWORD_WHEN || token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_ELSE;
-        case PM_CONTEXT_CASE_IN:
-            return token->type == PM_TOKEN_KEYWORD_IN || token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_ELSE;
-        case PM_CONTEXT_IF:
-        case PM_CONTEXT_ELSIF:
-            return token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_ELSIF || token->type == PM_TOKEN_KEYWORD_END;
-        case PM_CONTEXT_UNLESS:
-            return token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_END;
-        case PM_CONTEXT_EMBEXPR:
-            return token->type == PM_TOKEN_EMBEXPR_END;
-        case PM_CONTEXT_BLOCK_BRACES:
-            return token->type == PM_TOKEN_BRACE_RIGHT;
-        case PM_CONTEXT_PARENS:
-            return token->type == PM_TOKEN_PARENTHESIS_RIGHT;
-        case PM_CONTEXT_BEGIN:
-        case PM_CONTEXT_BEGIN_RESCUE:
-        case PM_CONTEXT_BLOCK_RESCUE:
-        case PM_CONTEXT_CLASS_RESCUE:
-        case PM_CONTEXT_DEF_RESCUE:
-        case PM_CONTEXT_LAMBDA_RESCUE:
-        case PM_CONTEXT_MODULE_RESCUE:
-        case PM_CONTEXT_SCLASS_RESCUE:
-            return token->type == PM_TOKEN_KEYWORD_ENSURE || token->type == PM_TOKEN_KEYWORD_RESCUE || token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_END;
-        case PM_CONTEXT_BEGIN_ELSE:
-        case PM_CONTEXT_BLOCK_ELSE:
-        case PM_CONTEXT_CLASS_ELSE:
-        case PM_CONTEXT_DEF_ELSE:
-        case PM_CONTEXT_LAMBDA_ELSE:
-        case PM_CONTEXT_MODULE_ELSE:
-        case PM_CONTEXT_SCLASS_ELSE:
-            return token->type == PM_TOKEN_KEYWORD_ENSURE || token->type == PM_TOKEN_KEYWORD_END;
-        case PM_CONTEXT_LAMBDA_BRACES:
-            return token->type == PM_TOKEN_BRACE_RIGHT;
-        case PM_CONTEXT_PREDICATE:
-            return token->type == PM_TOKEN_KEYWORD_THEN || token->type == PM_TOKEN_NEWLINE || token->type == PM_TOKEN_SEMICOLON;
-        case PM_CONTEXT_NONE:
-            return false;
-    }
+/**
+ * Lookup table of the tokens that terminate each context. The table is indexed
+ * by pm_context_t, and each entry is a bitmask in which bit N is set when the
+ * token type whose numeric value is N terminates that context.
+ *
+ * Only token types with a value below 32 can be represented here, which is why
+ * the tokens used in this table are listed first in config.yml. The masks are
+ * built from unsigned literals so that every bit of the 32-bit entry,
+ * including bit 31, can be set without overflowing a signed int.
+ */
+static const uint32_t context_terminators[] = {
+    [PM_CONTEXT_NONE] = 0,
+    [PM_CONTEXT_BEGIN] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_BEGIN_ENSURE] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_BEGIN_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_BEGIN_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_BLOCK_BRACES] = (1U << PM_TOKEN_BRACE_RIGHT),
+    [PM_CONTEXT_BLOCK_KEYWORDS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE),
+    [PM_CONTEXT_BLOCK_ENSURE] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_BLOCK_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_BLOCK_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_CASE_WHEN] = (1U << PM_TOKEN_KEYWORD_WHEN) | (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_ELSE),
+    [PM_CONTEXT_CASE_IN] = (1U << PM_TOKEN_KEYWORD_IN) | (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_ELSE),
+    [PM_CONTEXT_CLASS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE),
+    [PM_CONTEXT_CLASS_ENSURE] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_CLASS_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_CLASS_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_DEF] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE),
+    [PM_CONTEXT_DEF_ENSURE] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_DEF_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_DEF_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_DEF_PARAMS] = (1U << PM_TOKEN_EOF),
+    [PM_CONTEXT_DEFINED] = (1U << PM_TOKEN_EOF),
+    [PM_CONTEXT_DEFAULT_PARAMS] = (1U << PM_TOKEN_COMMA) | (1U << PM_TOKEN_PARENTHESIS_RIGHT),
+    [PM_CONTEXT_ELSE] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_ELSIF] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_ELSIF) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_EMBEXPR] = (1U << PM_TOKEN_EMBEXPR_END),
+    [PM_CONTEXT_FOR] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_FOR_INDEX] = (1U << PM_TOKEN_KEYWORD_IN),
+    [PM_CONTEXT_IF] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_ELSIF) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_LAMBDA_BRACES] = (1U << PM_TOKEN_BRACE_RIGHT),
+    [PM_CONTEXT_LAMBDA_DO_END] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE),
+    [PM_CONTEXT_LAMBDA_ENSURE] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_LAMBDA_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_LAMBDA_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_LOOP_PREDICATE] = (1U << PM_TOKEN_KEYWORD_DO) | (1U << PM_TOKEN_KEYWORD_THEN),
+    [PM_CONTEXT_MAIN] = (1U << PM_TOKEN_EOF),
+    [PM_CONTEXT_MODULE] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE),
+    [PM_CONTEXT_MODULE_ENSURE] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_MODULE_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_MODULE_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_MULTI_TARGET] = (1U << PM_TOKEN_EOF),
+    [PM_CONTEXT_PARENS] = (1U << PM_TOKEN_PARENTHESIS_RIGHT),
+    [PM_CONTEXT_POSTEXE] = (1U << PM_TOKEN_BRACE_RIGHT),
+    [PM_CONTEXT_PREDICATE] = (1U << PM_TOKEN_KEYWORD_THEN) | (1U << PM_TOKEN_NEWLINE) | (1U << PM_TOKEN_SEMICOLON),
+    [PM_CONTEXT_PREEXE] = (1U << PM_TOKEN_BRACE_RIGHT),
+    [PM_CONTEXT_RESCUE_MODIFIER] = (1U << PM_TOKEN_EOF),
+    [PM_CONTEXT_SCLASS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE),
+    [PM_CONTEXT_SCLASS_ENSURE] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_SCLASS_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_SCLASS_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_TERNARY] = (1U << PM_TOKEN_EOF),
+    [PM_CONTEXT_UNLESS] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_UNTIL] = (1U << PM_TOKEN_KEYWORD_END),
+    [PM_CONTEXT_WHILE] = (1U << PM_TOKEN_KEYWORD_END),
+};

-    return false;
+/**
+ * Returns true if the given token terminates the given context, according to
+ * the context_terminators lookup table.
+ *
+ * Token types with a value of 32 or more cannot be represented in the 32-bit
+ * masks and therefore never terminate a context. The shift is performed on an
+ * unsigned literal because the guard admits a token type of 31, and shifting a
+ * signed 1 left by 31 bits is undefined behavior.
+ */
+static inline bool
+context_terminator(pm_context_t context, pm_token_t *token) {
+    return token->type < 32 && (context_terminators[context] & (1U << token->type));
 }

 /**