From 4127f413a61199118055d38f14d20757b4f44fab Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Sun, 9 Apr 2023 20:33:42 -0700 Subject: [PATCH 1/3] Tailcall threaded VM using musttail This adds a compile-time option, OPT_TAILCALL_THREADED_CODE, to use musttail for threading in the VM loop. This works best with the latest LLVM, though the latest GCC also has musttail. This also attempts to use __attribute__((preserve_none)) when available (LLVM 19 or so?). This changes the calling convention of the instruction methods so that all general-purpose registers are caller-saved, and what would normally be callee-saved registers are used to pass arguments. This significantly reduces pushing and popping when we end up calling non-tailcall methods from our VM instructions (which is pretty common for us). Aside from performance, a thing I really like about this is that profilers (like Linux perf) are able to show the VM instructions we're spending time inside. It also allows using objdump to print out the source for individual instructions, which is much easier to read. 
There have been various popular posts about using this technique: * https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html * https://sillycross.github.io/2022/11/22/2022-11-22/ * https://github.com/wasm3/wasm3/blob/main/docs/Interpreter.md#m3-massey-meta-machine * https://blog.nelhage.com/post/cpython-tail-call/ --- compile.c | 4 ++-- insns.def | 4 ++-- iseq.c | 4 ++-- vm.c | 4 ++-- vm_core.h | 4 ++-- vm_exec.c | 27 +++++++++++++++++++++++++-- vm_exec.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- vm_opts.h | 14 ++++++++------ 8 files changed, 98 insertions(+), 19 deletions(-) diff --git a/compile.c b/compile.c index 7eb953203c..c020168245 100644 --- a/compile.c +++ b/compile.c @@ -978,7 +978,7 @@ rb_iseq_compile_node(rb_iseq_t *iseq, const NODE *node) static int rb_iseq_translate_threaded_code(rb_iseq_t *iseq) { -#if OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE +#if OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE const void * const *table = rb_vm_get_insns_address_table(); unsigned int i; VALUE *encoded = (VALUE *)ISEQ_BODY(iseq)->iseq_encoded; @@ -1009,7 +1009,7 @@ rb_iseq_original_iseq(const rb_iseq_t *iseq) /* cold path */ original_code = ISEQ_ORIGINAL_ISEQ_ALLOC(iseq, ISEQ_BODY(iseq)->iseq_size); MEMCPY(original_code, ISEQ_BODY(iseq)->iseq_encoded, VALUE, ISEQ_BODY(iseq)->iseq_size); -#if OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE +#if OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE { unsigned int i; diff --git a/insns.def b/insns.def index aaa8ec8f5d..1dd7d83ad1 100644 --- a/insns.def +++ b/insns.def @@ -1163,7 +1163,7 @@ leave } if (vm_pop_frame(ec, GET_CFP(), GET_EP())) { -#if OPT_CALL_THREADED_CODE +#if OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE rb_ec_thread_ptr(ec)->retval = val; return 0; #else @@ -1697,7 +1697,7 @@ opt_invokebuiltin_delegate_leave /* leave fastpath */ /* TracePoint/return fallbacks this insn to 
opt_invokebuiltin_delegate */ if (vm_pop_frame(ec, GET_CFP(), GET_EP())) { -#if OPT_CALL_THREADED_CODE +#if OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE rb_ec_thread_ptr(ec)->retval = val; return 0; #else diff --git a/iseq.c b/iseq.c index f35769198b..df3f49ed4f 100644 --- a/iseq.c +++ b/iseq.c @@ -3773,7 +3773,7 @@ rb_free_encoded_insn_data(void) void rb_vm_encoded_insn_data_table_init(void) { -#if OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE +#if OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE const void * const *table = rb_vm_get_insns_address_table(); #define INSN_CODE(insn) ((VALUE)table[insn]) #else @@ -3858,7 +3858,7 @@ rb_vm_insn_addr2opcode(const void *addr) int rb_vm_insn_decode(const VALUE encoded) { -#if OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE +#if OPT_DIRECT_THREADED_CODE || OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE int insn = rb_vm_insn_addr2insn((void *)encoded); #else int insn = (int)encoded; diff --git a/vm.c b/vm.c index d4792b9916..61dbb27f30 100644 --- a/vm.c +++ b/vm.c @@ -3711,7 +3711,7 @@ th_init(rb_thread_t *th, VALUE self, rb_vm_t *vm) th->ec->storage = Qnil; -#if OPT_CALL_THREADED_CODE +#if OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE th->retval = Qundef; #endif th->name = Qnil; @@ -4232,7 +4232,7 @@ Init_VM(void) rb_ary_push(opts, rb_str_new2("direct threaded code")); #elif OPT_TOKEN_THREADED_CODE rb_ary_push(opts, rb_str_new2("token threaded code")); -#elif OPT_CALL_THREADED_CODE +#elif OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE rb_ary_push(opts, rb_str_new2("call threaded code")); #endif diff --git a/vm_core.h b/vm_core.h index 5671a5982a..c581989839 100644 --- a/vm_core.h +++ b/vm_core.h @@ -204,7 +204,7 @@ void *rb_register_sigaltstack(void *); #endif /* call threaded code */ -#if OPT_CALL_THREADED_CODE +#if OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE #if OPT_DIRECT_THREADED_CODE #undef OPT_DIRECT_THREADED_CODE #endif /* 
OPT_DIRECT_THREADED_CODE */ @@ -1161,7 +1161,7 @@ typedef struct rb_thread_struct { VALUE value; /* temporary place of retval on OPT_CALL_THREADED_CODE */ -#if OPT_CALL_THREADED_CODE +#if OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE VALUE retval; #endif diff --git a/vm_exec.c b/vm_exec.c index 947d4dc421..0549895051 100644 --- a/vm_exec.c +++ b/vm_exec.c @@ -40,7 +40,7 @@ static void vm_analysis_insn(int insn); #endif /* #define DECL_SC_REG(r, reg) VALUE reg_##r */ -#if !OPT_CALL_THREADED_CODE +#if !OPT_CALL_THREADED_CODE && !OPT_TAILCALL_THREADED_CODE static VALUE vm_exec_core(rb_execution_context_t *ec) { @@ -115,7 +115,23 @@ rb_vm_get_insns_address_table(void) return (const void **)vm_exec_core(0); } -#else /* OPT_CALL_THREADED_CODE */ +#else /* OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE */ + +#if OPT_TAILCALL_THREADED_CODE +#undef RESTORE_REGS +#define RESTORE_REGS() \ +{ \ + VM_REG_CFP = ec->cfp; \ + reg_pc = reg_cfp->pc; \ +} + +#undef VM_REG_PC +#define VM_REG_PC reg_pc +#undef GET_PC +#define GET_PC() (reg_pc) +#undef SET_PC +#define SET_PC(x) (reg_cfp->pc = VM_REG_PC = (x)) +#endif #include "vm.inc" #include "vmtc.inc" @@ -132,6 +148,12 @@ vm_exec_core(rb_execution_context_t *ec) register rb_control_frame_t *reg_cfp = ec->cfp; rb_thread_t *th; +#if OPT_TAILCALL_THREADED_CODE + const VALUE *reg_pc = reg_cfp->pc; + reg_cfp = ((rb_insn_tailcall_func_t *) (*GET_PC()))(INSN_FUNC_ARGS); + + RUBY_ASSERT_ALWAYS(reg_cfp == 0); +#else while (1) { reg_cfp = ((rb_insn_func_t) (*GET_PC()))(ec, reg_cfp); @@ -139,6 +161,7 @@ vm_exec_core(rb_execution_context_t *ec) break; } } +#endif if (!UNDEF_P((th = rb_ec_thread_ptr(ec))->retval)) { VALUE ret = th->retval; diff --git a/vm_exec.h b/vm_exec.h index c3b7d4e488..ef6cd87b60 100644 --- a/vm_exec.h +++ b/vm_exec.h @@ -57,6 +57,60 @@ error ! 
#define START_OF_ORIGINAL_INSN(x) /* ignore */ #define DISPATCH_ORIGINAL_INSN(x) return LABEL(x)(ec, reg_cfp); +/************************************************/ +#elif OPT_TAILCALL_THREADED_CODE + +// https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html +// https://sillycross.github.io/2022/11/22/2022-11-22/ +// https://github.com/wasm3/wasm3/blob/main/docs/Interpreter.md#m3-massey-meta-machine + +// TODO: move elsewhere +#define MUSTTAIL __attribute__((musttail)) + +#define LABEL(x) insn_func_##x +#define ELABEL(x) +#define LABEL_PTR(x) &LABEL(x) + +#if defined __has_attribute +#if __has_attribute (preserve_none) +#define HAS_PRESERVE_NONE 1 +#endif +#endif +#ifndef HAS_PRESERVE_NONE +#define HAS_PRESERVE_NONE 0 +#endif + +#if HAS_PRESERVE_NONE +#define INSN_FUNC_CONV __attribute__((preserve_none)) +#else +#define INSN_FUNC_CONV +#endif + +#define INSN_FUNC_RET rb_control_frame_t * +#define INSN_FUNC_PARAMS rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, const VALUE *reg_pc +#define INSN_FUNC_ARGS ec, reg_cfp, reg_pc + +typedef INSN_FUNC_CONV INSN_FUNC_RET rb_insn_tailcall_func_t(INSN_FUNC_PARAMS); + +#define INSN_FUNC_ATTRIBUTES \ + __attribute__((no_stack_protector)) + +#define INSN_ENTRY(insn) \ + static INSN_FUNC_CONV INSN_FUNC_RET \ + FUNC_FASTCALL(LABEL(insn))(INSN_FUNC_PARAMS) INSN_FUNC_ATTRIBUTES { + +#define TC_DISPATCH(insn) \ + MUSTTAIL return (*(rb_insn_tailcall_func_t *)GET_CURRENT_INSN())(INSN_FUNC_ARGS); + +//#define END_INSN(insn) return reg_cfp;} +#define END_INSN(insn) TC_DISPATCH(__NEXT_INSN__);} + +//#define NEXT_INSN() return reg_cfp; +#define NEXT_INSN() TC_DISPATCH(__NEXT_INSN__) + +#define START_OF_ORIGINAL_INSN(x) /* ignore */ +#define DISPATCH_ORIGINAL_INSN(x) MUSTTAIL return LABEL(x)(INSN_FUNC_ARGS); + /************************************************/ #elif OPT_TOKEN_THREADED_CODE || OPT_DIRECT_THREADED_CODE /* threaded code with gcc */ @@ -156,7 +210,7 @@ default: \ #define VM_SP_CNT(ec, sp) ((sp) - 
(ec)->vm_stack) -#if OPT_CALL_THREADED_CODE +#if OPT_CALL_THREADED_CODE || OPT_TAILCALL_THREADED_CODE #define THROW_EXCEPTION(exc) do { \ ec->errinfo = (VALUE)(exc); \ return 0; \ diff --git a/vm_opts.h b/vm_opts.h index ce47745b11..5f3dd4dd8a 100644 --- a/vm_opts.h +++ b/vm_opts.h @@ -32,14 +32,16 @@ * 0: direct (using labeled goto using GCC special) * 1: token (switch/case) * 2: call (function call for each insn dispatch) + * 3: call continuation (musttail attribute) */ -#ifndef OPT_THREADED_CODE -#define OPT_THREADED_CODE 0 -#endif +//#ifndef OPT_THREADED_CODE +#define OPT_THREADED_CODE 3 +//#endif -#define OPT_DIRECT_THREADED_CODE (OPT_THREADED_CODE == 0) -#define OPT_TOKEN_THREADED_CODE (OPT_THREADED_CODE == 1) -#define OPT_CALL_THREADED_CODE (OPT_THREADED_CODE == 2) +#define OPT_DIRECT_THREADED_CODE (OPT_THREADED_CODE == 0) +#define OPT_TOKEN_THREADED_CODE (OPT_THREADED_CODE == 1) +#define OPT_CALL_THREADED_CODE (OPT_THREADED_CODE == 2) +#define OPT_TAILCALL_THREADED_CODE (OPT_THREADED_CODE == 3) /* VM running option */ #define OPT_CHECKED_RUN 1 From 79da4ea17639a288732aa19cdc079d26dde15107 Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Mon, 26 May 2025 20:21:55 -0700 Subject: [PATCH 2/3] Move attributes next to convention --- vm_exec.h | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/vm_exec.h b/vm_exec.h index ef6cd87b60..34d3403453 100644 --- a/vm_exec.h +++ b/vm_exec.h @@ -60,12 +60,8 @@ error ! 
/************************************************/ #elif OPT_TAILCALL_THREADED_CODE -// https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html -// https://sillycross.github.io/2022/11/22/2022-11-22/ -// https://github.com/wasm3/wasm3/blob/main/docs/Interpreter.md#m3-massey-meta-machine - -// TODO: move elsewhere -#define MUSTTAIL __attribute__((musttail)) +/* Same as __attribute__((musttail)), but slightly wider support (GCC 15) */ +#define MUSTTAIL [[clang::musttail]] #define LABEL(x) insn_func_##x #define ELABEL(x) @@ -73,15 +69,12 @@ error ! #if defined __has_attribute #if __has_attribute (preserve_none) -#define HAS_PRESERVE_NONE 1 +#define ATTR_PRESERVE_NONE __attribute__((preserve_none)) #endif #endif -#ifndef HAS_PRESERVE_NONE -#define HAS_PRESERVE_NONE 0 -#endif -#if HAS_PRESERVE_NONE -#define INSN_FUNC_CONV __attribute__((preserve_none)) +#ifdef ATTR_PRESERVE_NONE +#define INSN_FUNC_CONV ATTR_PRESERVE_NONE #else #define INSN_FUNC_CONV #endif @@ -96,8 +89,8 @@ typedef INSN_FUNC_CONV INSN_FUNC_RET rb_insn_tailcall_func_t(INSN_FUNC_PARAMS); __attribute__((no_stack_protector)) #define INSN_ENTRY(insn) \ - static INSN_FUNC_CONV INSN_FUNC_RET \ - FUNC_FASTCALL(LABEL(insn))(INSN_FUNC_PARAMS) INSN_FUNC_ATTRIBUTES { + static INSN_FUNC_CONV INSN_FUNC_ATTRIBUTES INSN_FUNC_RET \ + FUNC_FASTCALL(LABEL(insn))(INSN_FUNC_PARAMS) { #define TC_DISPATCH(insn) \ MUSTTAIL return (*(rb_insn_tailcall_func_t *)GET_CURRENT_INSN())(INSN_FUNC_ARGS); From 078b17da923ace8dc5ffffe6a6e0988f82442f7d Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Mon, 26 May 2025 20:50:32 -0700 Subject: [PATCH 3/3] Use tailcall interpreter if musttail is available --- vm_exec.h | 14 +++++++++----- vm_opts.h | 10 ++++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/vm_exec.h b/vm_exec.h index 34d3403453..7480a807da 100644 --- a/vm_exec.h +++ b/vm_exec.h @@ -11,6 +11,8 @@ **********************************************************************/ +#include 
"ruby/internal/has/attribute.h" + typedef long OFFSET; typedef unsigned long lindex_t; typedef VALUE GENTRY; @@ -60,18 +62,20 @@ error ! /************************************************/ #elif OPT_TAILCALL_THREADED_CODE -/* Same as __attribute__((musttail)), but slightly wider support (GCC 15) */ -#define MUSTTAIL [[clang::musttail]] +#if !RBIMPL_HAS_ATTRIBUTE(musttail) +#error support for musttail attribute is required for tailcall threading +#endif + +/* Declares that the function call MUST be tailcall optimized */ +#define MUSTTAIL __attribute__((musttail)) #define LABEL(x) insn_func_##x #define ELABEL(x) #define LABEL_PTR(x) &LABEL(x) -#if defined __has_attribute -#if __has_attribute (preserve_none) +#if RBIMPL_HAS_ATTRIBUTE(preserve_none) #define ATTR_PRESERVE_NONE __attribute__((preserve_none)) #endif -#endif #ifdef ATTR_PRESERVE_NONE #define INSN_FUNC_CONV ATTR_PRESERVE_NONE diff --git a/vm_opts.h b/vm_opts.h index 5f3dd4dd8a..56d08bb6c8 100644 --- a/vm_opts.h +++ b/vm_opts.h @@ -10,6 +10,8 @@ **********************************************************************/ +#include "ruby/internal/has/attribute.h" + /* Compile options. * You can change these options at runtime by VM::CompileOption. * Following definitions are default values. @@ -34,9 +36,13 @@ * 2: call (function call for each insn dispatch) * 3: call continuation (musttail attribute) */ -//#ifndef OPT_THREADED_CODE +#ifndef OPT_THREADED_CODE +#if RBIMPL_HAS_ATTRIBUTE(musttail) #define OPT_THREADED_CODE 3 -//#endif +#else +#define OPT_THREADED_CODE 0 +#endif +#endif #define OPT_DIRECT_THREADED_CODE (OPT_THREADED_CODE == 0) #define OPT_TOKEN_THREADED_CODE (OPT_THREADED_CODE == 1)