From a58a4a6ca7e2e3a20f9d8b5a514487a593dbfb18 Mon Sep 17 00:00:00 2001 From: Daniel Colson Date: Fri, 4 Jul 2025 12:10:31 -0400 Subject: [PATCH] ZJIT: Compile toregexp `toregexp` is fairly similar to `concatstrings`, so this commit extracts a helper for pushing and popping operands on the native stack. There's probably opportunity to move some of this into lir (e.g. Alan suggested a push_many that could use STP on ARM to push 2 at a time), but I might save that for another day. --- internal/re.h | 5 ++ re.c | 5 -- test/ruby/test_zjit.rb | 8 +++ zjit/bindgen/src/main.rs | 7 +++ zjit/src/codegen.rs | 74 ++++++++++++++++++---------- zjit/src/cruby_bindings.inc.rs | 5 ++ zjit/src/hir.rs | 89 ++++++++++++++++++++++++++++++++++ 7 files changed, 163 insertions(+), 30 deletions(-) diff --git a/internal/re.h b/internal/re.h index 2788f8b42a..593e5c464f 100644 --- a/internal/re.h +++ b/internal/re.h @@ -25,4 +25,9 @@ int rb_match_count(VALUE match); VALUE rb_reg_new_ary(VALUE ary, int options); VALUE rb_reg_last_defined(VALUE match); +#define ARG_REG_OPTION_MASK \ + (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND) +#define ARG_ENCODING_FIXED 16 +#define ARG_ENCODING_NONE 32 + #endif /* INTERNAL_RE_H */ diff --git a/re.c b/re.c index 9348622eea..13d7f0ef9e 100644 --- a/re.c +++ b/re.c @@ -290,11 +290,6 @@ rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc) #define KCODE_FIXED FL_USER4 -#define ARG_REG_OPTION_MASK \ - (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND) -#define ARG_ENCODING_FIXED 16 -#define ARG_ENCODING_NONE 32 - static int char_to_option(int c) { diff --git a/test/ruby/test_zjit.rb b/test/ruby/test_zjit.rb index 58fc9ba639..4da3cc3a72 100644 --- a/test/ruby/test_zjit.rb +++ b/test/ruby/test_zjit.rb @@ -1621,6 +1621,14 @@ class TestZJIT < Test::Unit::TestCase }, insns: [:concatstrings] end + def test_regexp_interpolation + assert_compiles '/123/', %q{ + def test = /#{1}#{2}#{3}/ + + test + }, insns: [:toregexp] + end + private # Assert that every method call in `test_script` can be compiled by ZJIT diff --git a/zjit/bindgen/src/main.rs b/zjit/bindgen/src/main.rs index 77299c2657..59b7f9737e 100644 --- a/zjit/bindgen/src/main.rs +++ b/zjit/bindgen/src/main.rs @@ -259,6 +259,13 @@ fn main() { // From internal/re.h .allowlist_function("rb_reg_new_ary") + .allowlist_var("ARG_ENCODING_FIXED") + .allowlist_var("ARG_ENCODING_NONE") + + // From include/ruby/onigmo.h + .allowlist_var("ONIG_OPTION_IGNORECASE") + .allowlist_var("ONIG_OPTION_EXTEND") + .allowlist_var("ONIG_OPTION_MULTILINE") // `ruby_value_type` is a C enum and this stops it from // prefixing all the members with the name of the type diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index c261ffbcec..67acf734b4 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -338,6 +338,7 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio Insn::StringCopy { val, chilled, state } => gen_string_copy(asm, opnd!(val), *chilled, &function.frame_state(*state)), Insn::StringConcat { strings, state } => gen_string_concat(jit, asm, opnds!(strings), &function.frame_state(*state))?, Insn::StringIntern { val, state } => gen_intern(asm, opnd!(val), &function.frame_state(*state))?, + Insn::ToRegexp { opt, values, state } => gen_toregexp(jit, asm, *opt, opnds!(values), &function.frame_state(*state))?, Insn::Param { idx } => unreachable!("block.insns should not have Insn::Param({idx})"), Insn::Snapshot { .. } => return Some(()), // we don't need to do anything for this instruction at the moment Insn::Jump(branch) => return gen_jump(jit, asm, branch), @@ -1508,6 +1509,52 @@ pub fn gen_stub_exit(cb: &mut CodeBlock) -> Option { }) } +fn gen_push_opnds(jit: &mut JITState, asm: &mut Assembler, opnds: &[Opnd]) -> lir::Opnd { + let n = opnds.len(); + + // Calculate the compile-time NATIVE_STACK_PTR offset from NATIVE_BASE_PTR + // At this point, frame_setup(&[], jit.c_stack_slots) has been called, + // which allocated aligned_stack_bytes(jit.c_stack_slots) on the stack + let frame_size = aligned_stack_bytes(jit.c_stack_slots); + let allocation_size = aligned_stack_bytes(n); + + asm_comment!(asm, "allocate {} bytes on C stack for {} values", allocation_size, n); + asm.sub_into(NATIVE_STACK_PTR, allocation_size.into()); + + // Calculate the total offset from NATIVE_BASE_PTR to our buffer + let total_offset_from_base = (frame_size + allocation_size) as i32; + + for (idx, &opnd) in opnds.iter().enumerate() { + let slot_offset = -total_offset_from_base + (idx as i32 * SIZEOF_VALUE_I32); + asm.mov( + Opnd::mem(VALUE_BITS, NATIVE_BASE_PTR, slot_offset), + opnd + ); + } + + asm.lea(Opnd::mem(64, NATIVE_BASE_PTR, -total_offset_from_base)) +} + +fn gen_pop_opnds(asm: &mut Assembler, opnds: &[Opnd]) { + asm_comment!(asm, "restore C stack pointer"); + let allocation_size = aligned_stack_bytes(opnds.len()); + asm.add_into(NATIVE_STACK_PTR, allocation_size.into()); +} + +fn gen_toregexp(jit: &mut JITState, asm: &mut Assembler, opt: usize, values: Vec, state: &FrameState) -> Option { + gen_prepare_non_leaf_call(jit, asm, state)?; + + let first_opnd_ptr = gen_push_opnds(jit, asm, &values); + + let ary = asm_ccall!(asm, rb_ary_tmp_new_from_values, Opnd::Imm(0), values.len().into(), first_opnd_ptr); + let val = asm_ccall!(asm, rb_reg_new_ary, ary, opt.into()); + asm_ccall!(asm, rb_ary_clear, ary); + + gen_pop_opnds(asm, &values); + + Some(val) +} + fn gen_string_concat(jit: &mut JITState, asm: &mut Assembler, strings: Vec, state: &FrameState) -> Option { let n = strings.len(); @@ -1519,32 +1566,9 @@ fn gen_string_concat(jit: &mut JITState, asm: &mut Assembler, strings: Vec gen_prepare_non_leaf_call(jit, asm, state)?; - // Calculate the compile-time NATIVE_STACK_PTR offset from NATIVE_BASE_PTR - // At this point, frame_setup(&[], jit.c_stack_slots) has been called, - // which allocated aligned_stack_bytes(jit.c_stack_slots) on the stack - let frame_size = aligned_stack_bytes(jit.c_stack_slots); - let allocation_size = aligned_stack_bytes(n); - - asm_comment!(asm, "allocate {} bytes on C stack for {} strings", allocation_size, n); - asm.sub_into(NATIVE_STACK_PTR, allocation_size.into()); - - // Calculate the total offset from NATIVE_BASE_PTR to our buffer - let total_offset_from_base = (frame_size + allocation_size) as i32; - - for (idx, &string_opnd) in strings.iter().enumerate() { - let slot_offset = -total_offset_from_base + (idx as i32 * SIZEOF_VALUE_I32); - asm.mov( - Opnd::mem(VALUE_BITS, NATIVE_BASE_PTR, slot_offset), - string_opnd - ); - } - - let first_string_ptr = asm.lea(Opnd::mem(64, NATIVE_BASE_PTR, -total_offset_from_base)); - + let first_string_ptr = gen_push_opnds(jit, asm, &strings); let result = asm_ccall!(asm, rb_str_concat_literals, n.into(), first_string_ptr); - - asm_comment!(asm, "restore C stack pointer"); - asm.add_into(NATIVE_STACK_PTR, allocation_size.into()); + gen_pop_opnds(asm, &strings); Some(result) } diff --git a/zjit/src/cruby_bindings.inc.rs b/zjit/src/cruby_bindings.inc.rs index 5c939fabe7..524b06b580 100644 --- a/zjit/src/cruby_bindings.inc.rs +++ b/zjit/src/cruby_bindings.inc.rs @@ -30,6 +30,11 @@ impl ::std::fmt::Debug for __IncompleteArrayField { fmt.write_str("__IncompleteArrayField") } } +pub const ONIG_OPTION_IGNORECASE: u32 = 1; +pub const ONIG_OPTION_EXTEND: u32 = 2; +pub const ONIG_OPTION_MULTILINE: u32 = 4; +pub const ARG_ENCODING_FIXED: u32 = 16; +pub const ARG_ENCODING_NONE: u32 = 32; pub const INTEGER_REDEFINED_OP_FLAG: u32 = 1; pub const FLOAT_REDEFINED_OP_FLAG: u32 = 2; pub const STRING_REDEFINED_OP_FLAG: u32 = 4; diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index c93a6858f1..19b232b735 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -448,6 +448,9 @@ pub enum Insn { StringIntern { val: InsnId, state: InsnId }, StringConcat { strings: Vec, state: InsnId }, + /// Combine count stack values into a regexp + ToRegexp { opt: usize, values: Vec, state: InsnId }, + /// Put special object (VMCORE, CBASE, etc.) based on value_type PutSpecialObject { value_type: SpecialObjectType }, @@ -641,6 +644,14 @@ pub struct InsnPrinter<'a> { ptr_map: &'a PtrPrintMap, } +static REGEXP_FLAGS: &[(u32, &str)] = &[ + (ONIG_OPTION_MULTILINE, "MULTILINE"), + (ONIG_OPTION_IGNORECASE, "IGNORECASE"), + (ONIG_OPTION_EXTEND, "EXTENDED"), + (ARG_ENCODING_FIXED, "FIXEDENCODING"), + (ARG_ENCODING_NONE, "NOENCODING"), +]; + impl<'a> std::fmt::Display for InsnPrinter<'a> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match &self.inner { @@ -689,6 +700,28 @@ impl<'a> std::fmt::Display for InsnPrinter<'a> { Ok(()) } + Insn::ToRegexp { values, opt, .. } => { + write!(f, "ToRegexp")?; + let mut prefix = " "; + for value in values { + write!(f, "{prefix}{value}")?; + prefix = ", "; + } + + let opt = *opt as u32; + if opt != 0 { + write!(f, ", ")?; + let mut sep = ""; + for (flag, name) in REGEXP_FLAGS { + if opt & flag != 0 { + write!(f, "{sep}{name}")?; + sep = "|"; + } + } + } + + Ok(()) + } Insn::Test { val } => { write!(f, "Test {val}") } Insn::IsNil { val } => { write!(f, "IsNil {val}") } Insn::Jump(target) => { write!(f, "Jump {target}") } @@ -1150,6 +1183,7 @@ impl Function { &StringCopy { val, chilled, state } => StringCopy { val: find!(val), chilled, state }, &StringIntern { val, state } => StringIntern { val: find!(val), state: find!(state) }, &StringConcat { ref strings, state } => StringConcat { strings: find_vec!(strings), state: find!(state) }, + &ToRegexp { opt, ref values, state } => ToRegexp { opt, values: find_vec!(values), state }, &Test { val } => Test { val: find!(val) }, &IsNil { val } => IsNil { val: find!(val) }, &Jump(ref target) => Jump(find_branch_edge!(target)), @@ -1274,6 +1308,7 @@ impl Function { Insn::StringCopy { .. } => types::StringExact, Insn::StringIntern { .. } => types::Symbol, Insn::StringConcat { .. } => types::StringExact, + Insn::ToRegexp { .. } => types::RegexpExact, Insn::NewArray { .. } => types::ArrayExact, Insn::ArrayDup { .. } => types::ArrayExact, Insn::NewHash { .. } => types::HashExact, @@ -1906,6 +1941,10 @@ impl Function { worklist.extend(strings); worklist.push_back(state); } + &Insn::ToRegexp { ref values, state, .. } => { + worklist.extend(values); + worklist.push_back(state); + } | &Insn::Return { val } | &Insn::Throw { val, .. } | &Insn::Defined { v: val, .. } @@ -2826,6 +2865,15 @@ pub fn iseq_to_hir(iseq: *const rb_iseq_t) -> Result { let insn_id = fun.push_insn(block, Insn::StringConcat { strings, state: exit_id }); state.stack_push(insn_id); } + YARVINSN_toregexp => { + // First arg contains the options (multiline, extended, ignorecase) used to create the regexp + let opt = get_arg(pc, 0).as_usize(); + let count = get_arg(pc, 1).as_usize(); + let exit_id = fun.push_insn(block, Insn::Snapshot { state: exit_state }); + let values = state.stack_pop_n(count)?; + let insn_id = fun.push_insn(block, Insn::ToRegexp { opt, values, state: exit_id }); + state.stack_push(insn_id); + } YARVINSN_newarray => { let count = get_arg(pc, 0).as_usize(); let exit_id = fun.push_insn(block, Insn::Snapshot { state: exit_state }); @@ -5299,6 +5347,47 @@ mod tests { "#]]); } + #[test] + fn test_toregexp() { + eval(r##" + def test = /#{1}#{2}#{3}/ + "##); + assert_method_hir_with_opcode("test", YARVINSN_toregexp, expect![[r#" + fn test@:2: + bb0(v0:BasicObject): + v2:Fixnum[1] = Const Value(1) + v4:BasicObject = ObjToString v2 + v6:String = AnyToString v2, str: v4 + v7:Fixnum[2] = Const Value(2) + v9:BasicObject = ObjToString v7 + v11:String = AnyToString v7, str: v9 + v12:Fixnum[3] = Const Value(3) + v14:BasicObject = ObjToString v12 + v16:String = AnyToString v12, str: v14 + v18:RegexpExact = ToRegexp v6, v11, v16 + Return v18 + "#]]); + } + + #[test] + fn test_toregexp_with_options() { + eval(r##" + def test = /#{1}#{2}/mixn + "##); + assert_method_hir_with_opcode("test", YARVINSN_toregexp, expect![[r#" + fn test@:2: + bb0(v0:BasicObject): + v2:Fixnum[1] = Const Value(1) + v4:BasicObject = ObjToString v2 + v6:String = AnyToString v2, str: v4 + v7:Fixnum[2] = Const Value(2) + v9:BasicObject = ObjToString v7 + v11:String = AnyToString v7, str: v9 + v13:RegexpExact = ToRegexp v6, v11, MULTILINE|IGNORECASE|EXTENDED|NOENCODING + Return v13 + "#]]); + } + #[test] fn throw() { eval("