ZJIT: Compile toregexp

`toregexp` is fairly similar to `concatstrings`, so this commit extracts
a helper for pushing and popping operands on the native stack.

There's probably an opportunity to move some of this into LIR (e.g. Alan
suggested a `push_many` that could use STP on ARM to push two values at a
time), but I might save that for another day.
This commit is contained in:
Daniel Colson 2025-07-04 12:10:31 -04:00
parent c9346a166c
commit a58a4a6ca7
No known key found for this signature in database
GPG key ID: 88A364BBE77B1353
7 changed files with 163 additions and 30 deletions

View file

@ -25,4 +25,9 @@ int rb_match_count(VALUE match);
VALUE rb_reg_new_ary(VALUE ary, int options);
VALUE rb_reg_last_defined(VALUE match);
#define ARG_REG_OPTION_MASK \
(ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
#define ARG_ENCODING_FIXED 16
#define ARG_ENCODING_NONE 32
#endif /* INTERNAL_RE_H */

5
re.c
View file

@ -290,11 +290,6 @@ rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
#define KCODE_FIXED FL_USER4
#define ARG_REG_OPTION_MASK \
(ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
#define ARG_ENCODING_FIXED 16
#define ARG_ENCODING_NONE 32
static int
char_to_option(int c)
{

View file

@ -1621,6 +1621,14 @@ class TestZJIT < Test::Unit::TestCase
}, insns: [:concatstrings]
end
# Regexp literal built from three interpolated integers; the script's result is
# expected to be /123/. `insns: [:toregexp]` names the instruction under test
# (presumably verified against the compiled bytecode -- confirm in assert_compiles).
def test_regexp_interpolation
assert_compiles '/123/', %q{
def test = /#{1}#{2}#{3}/
test
}, insns: [:toregexp]
end
private
# Assert that every method call in `test_script` can be compiled by ZJIT

View file

@ -259,6 +259,13 @@ fn main() {
// From internal/re.h
.allowlist_function("rb_reg_new_ary")
.allowlist_var("ARG_ENCODING_FIXED")
.allowlist_var("ARG_ENCODING_NONE")
// From include/ruby/onigmo.h
.allowlist_var("ONIG_OPTION_IGNORECASE")
.allowlist_var("ONIG_OPTION_EXTEND")
.allowlist_var("ONIG_OPTION_MULTILINE")
// `ruby_value_type` is a C enum and this stops it from
// prefixing all the members with the name of the type

View file

@ -338,6 +338,7 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio
Insn::StringCopy { val, chilled, state } => gen_string_copy(asm, opnd!(val), *chilled, &function.frame_state(*state)),
Insn::StringConcat { strings, state } => gen_string_concat(jit, asm, opnds!(strings), &function.frame_state(*state))?,
Insn::StringIntern { val, state } => gen_intern(asm, opnd!(val), &function.frame_state(*state))?,
Insn::ToRegexp { opt, values, state } => gen_toregexp(jit, asm, *opt, opnds!(values), &function.frame_state(*state))?,
Insn::Param { idx } => unreachable!("block.insns should not have Insn::Param({idx})"),
Insn::Snapshot { .. } => return Some(()), // we don't need to do anything for this instruction at the moment
Insn::Jump(branch) => return gen_jump(jit, asm, branch),
@ -1508,6 +1509,52 @@ pub fn gen_stub_exit(cb: &mut CodeBlock) -> Option<CodePtr> {
})
}
/// Reserve room on the native (C) stack for `opnds`, store each operand into that
/// room in order, and return an operand holding the address of the first slot.
///
/// The reservation is `aligned_stack_bytes(opnds.len())` bytes and is made by
/// lowering `NATIVE_STACK_PTR`. It is released by `gen_pop_opnds`, which must be
/// given a slice of the same length.
fn gen_push_opnds(jit: &mut JITState, asm: &mut Assembler, opnds: &[Opnd]) -> lir::Opnd {
    let count = opnds.len();

    // frame_setup(&[], jit.c_stack_slots) has already run at this point and reserved
    // aligned_stack_bytes(jit.c_stack_slots) bytes below NATIVE_BASE_PTR, so the new
    // buffer sits directly underneath that region.
    let bytes_reserved_by_frame = aligned_stack_bytes(jit.c_stack_slots);
    let buffer_bytes = aligned_stack_bytes(count);

    asm_comment!(asm, "allocate {} bytes on C stack for {} values", buffer_bytes, count);
    asm.sub_into(NATIVE_STACK_PTR, buffer_bytes.into());

    // Displacement (known at compile time) from NATIVE_BASE_PTR down to slot 0.
    let buffer_disp = -((bytes_reserved_by_frame + buffer_bytes) as i32);

    // Slots are laid out at increasing addresses, one VALUE apart.
    for (slot, &value) in opnds.iter().enumerate() {
        let disp = buffer_disp + (slot as i32 * SIZEOF_VALUE_I32);
        let dest = Opnd::mem(VALUE_BITS, NATIVE_BASE_PTR, disp);
        asm.mov(dest, value);
    }

    // Hand back the address of slot 0.
    let first_slot = Opnd::mem(64, NATIVE_BASE_PTR, buffer_disp);
    asm.lea(first_slot)
}
/// Release the native-stack space that `gen_push_opnds` reserved for `opnds`.
///
/// Only the length of `opnds` is used: the stack pointer is raised by
/// `aligned_stack_bytes(opnds.len())`, mirroring the amount subtracted on push.
fn gen_pop_opnds(asm: &mut Assembler, opnds: &[Opnd]) {
    let bytes_to_release = aligned_stack_bytes(opnds.len());

    asm_comment!(asm, "restore C stack pointer");
    asm.add_into(NATIVE_STACK_PTR, bytes_to_release.into());
}
/// Emit code for `ToRegexp`: build a regexp from `values` using the option bits in `opt`.
///
/// The operands are spilled to the native stack, wrapped in a temporary array via
/// `rb_ary_tmp_new_from_values`, and passed to `rb_reg_new_ary`. The temporary array is
/// cleared afterwards and the stack space is released. Returns `None` if preparing the
/// non-leaf call fails; otherwise returns the operand holding the result of
/// `rb_reg_new_ary`.
fn gen_toregexp(jit: &mut JITState, asm: &mut Assembler, opt: usize, values: Vec<Opnd>, state: &FrameState) -> Option<lir::Opnd> {
    gen_prepare_non_leaf_call(jit, asm, state)?;

    let count = values.len();
    let values_ptr = gen_push_opnds(jit, asm, &values);

    // NOTE(review): the leading 0 is presumably the array's klass argument -- confirm
    // against the declaration of rb_ary_tmp_new_from_values.
    let tmp_ary = asm_ccall!(asm, rb_ary_tmp_new_from_values, Opnd::Imm(0), count.into(), values_ptr);
    let regexp = asm_ccall!(asm, rb_reg_new_ary, tmp_ary, opt.into());

    // The temporary array is no longer needed once the regexp exists.
    asm_ccall!(asm, rb_ary_clear, tmp_ary);

    gen_pop_opnds(asm, &values);

    Some(regexp)
}
fn gen_string_concat(jit: &mut JITState, asm: &mut Assembler, strings: Vec<Opnd>, state: &FrameState) -> Option<Opnd> {
let n = strings.len();
@ -1519,32 +1566,9 @@ fn gen_string_concat(jit: &mut JITState, asm: &mut Assembler, strings: Vec<Opnd>
gen_prepare_non_leaf_call(jit, asm, state)?;
// Calculate the compile-time NATIVE_STACK_PTR offset from NATIVE_BASE_PTR
// At this point, frame_setup(&[], jit.c_stack_slots) has been called,
// which allocated aligned_stack_bytes(jit.c_stack_slots) on the stack
let frame_size = aligned_stack_bytes(jit.c_stack_slots);
let allocation_size = aligned_stack_bytes(n);
asm_comment!(asm, "allocate {} bytes on C stack for {} strings", allocation_size, n);
asm.sub_into(NATIVE_STACK_PTR, allocation_size.into());
// Calculate the total offset from NATIVE_BASE_PTR to our buffer
let total_offset_from_base = (frame_size + allocation_size) as i32;
for (idx, &string_opnd) in strings.iter().enumerate() {
let slot_offset = -total_offset_from_base + (idx as i32 * SIZEOF_VALUE_I32);
asm.mov(
Opnd::mem(VALUE_BITS, NATIVE_BASE_PTR, slot_offset),
string_opnd
);
}
let first_string_ptr = asm.lea(Opnd::mem(64, NATIVE_BASE_PTR, -total_offset_from_base));
let first_string_ptr = gen_push_opnds(jit, asm, &strings);
let result = asm_ccall!(asm, rb_str_concat_literals, n.into(), first_string_ptr);
asm_comment!(asm, "restore C stack pointer");
asm.add_into(NATIVE_STACK_PTR, allocation_size.into());
gen_pop_opnds(asm, &strings);
Some(result)
}

View file

@ -30,6 +30,11 @@ impl<T> ::std::fmt::Debug for __IncompleteArrayField<T> {
fmt.write_str("__IncompleteArrayField")
}
}
pub const ONIG_OPTION_IGNORECASE: u32 = 1;
pub const ONIG_OPTION_EXTEND: u32 = 2;
pub const ONIG_OPTION_MULTILINE: u32 = 4;
pub const ARG_ENCODING_FIXED: u32 = 16;
pub const ARG_ENCODING_NONE: u32 = 32;
pub const INTEGER_REDEFINED_OP_FLAG: u32 = 1;
pub const FLOAT_REDEFINED_OP_FLAG: u32 = 2;
pub const STRING_REDEFINED_OP_FLAG: u32 = 4;

View file

@ -448,6 +448,9 @@ pub enum Insn {
StringIntern { val: InsnId, state: InsnId },
StringConcat { strings: Vec<InsnId>, state: InsnId },
/// Combine count stack values into a regexp
ToRegexp { opt: usize, values: Vec<InsnId>, state: InsnId },
/// Put special object (VMCORE, CBASE, etc.) based on value_type
PutSpecialObject { value_type: SpecialObjectType },
@ -641,6 +644,14 @@ pub struct InsnPrinter<'a> {
ptr_map: &'a PtrPrintMap,
}
/// Labels used when printing the option bits of a `ToRegexp` instruction.
///
/// Each entry pairs one flag bit with the name shown for it. The printer walks this
/// table front to back and emits the name of every bit that is set, so the order of
/// entries here is the order in which flag names appear in the printed HIR.
static REGEXP_FLAGS: &[(u32, &str)] = &[
(ONIG_OPTION_MULTILINE, "MULTILINE"),
(ONIG_OPTION_IGNORECASE, "IGNORECASE"),
(ONIG_OPTION_EXTEND, "EXTENDED"),
(ARG_ENCODING_FIXED, "FIXEDENCODING"),
(ARG_ENCODING_NONE, "NOENCODING"),
];
impl<'a> std::fmt::Display for InsnPrinter<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match &self.inner {
@ -689,6 +700,28 @@ impl<'a> std::fmt::Display for InsnPrinter<'a> {
Ok(())
}
Insn::ToRegexp { values, opt, .. } => {
write!(f, "ToRegexp")?;
let mut prefix = " ";
for value in values {
write!(f, "{prefix}{value}")?;
prefix = ", ";
}
let opt = *opt as u32;
if opt != 0 {
write!(f, ", ")?;
let mut sep = "";
for (flag, name) in REGEXP_FLAGS {
if opt & flag != 0 {
write!(f, "{sep}{name}")?;
sep = "|";
}
}
}
Ok(())
}
Insn::Test { val } => { write!(f, "Test {val}") }
Insn::IsNil { val } => { write!(f, "IsNil {val}") }
Insn::Jump(target) => { write!(f, "Jump {target}") }
@ -1150,6 +1183,7 @@ impl Function {
&StringCopy { val, chilled, state } => StringCopy { val: find!(val), chilled, state },
&StringIntern { val, state } => StringIntern { val: find!(val), state: find!(state) },
&StringConcat { ref strings, state } => StringConcat { strings: find_vec!(strings), state: find!(state) },
&ToRegexp { opt, ref values, state } => ToRegexp { opt, values: find_vec!(values), state },
&Test { val } => Test { val: find!(val) },
&IsNil { val } => IsNil { val: find!(val) },
&Jump(ref target) => Jump(find_branch_edge!(target)),
@ -1274,6 +1308,7 @@ impl Function {
Insn::StringCopy { .. } => types::StringExact,
Insn::StringIntern { .. } => types::Symbol,
Insn::StringConcat { .. } => types::StringExact,
Insn::ToRegexp { .. } => types::RegexpExact,
Insn::NewArray { .. } => types::ArrayExact,
Insn::ArrayDup { .. } => types::ArrayExact,
Insn::NewHash { .. } => types::HashExact,
@ -1906,6 +1941,10 @@ impl Function {
worklist.extend(strings);
worklist.push_back(state);
}
&Insn::ToRegexp { ref values, state, .. } => {
worklist.extend(values);
worklist.push_back(state);
}
| &Insn::Return { val }
| &Insn::Throw { val, .. }
| &Insn::Defined { v: val, .. }
@ -2826,6 +2865,15 @@ pub fn iseq_to_hir(iseq: *const rb_iseq_t) -> Result<Function, ParseError> {
let insn_id = fun.push_insn(block, Insn::StringConcat { strings, state: exit_id });
state.stack_push(insn_id);
}
YARVINSN_toregexp => {
// First arg contains the options (multiline, extended, ignorecase) used to create the regexp
let opt = get_arg(pc, 0).as_usize();
let count = get_arg(pc, 1).as_usize();
let exit_id = fun.push_insn(block, Insn::Snapshot { state: exit_state });
let values = state.stack_pop_n(count)?;
let insn_id = fun.push_insn(block, Insn::ToRegexp { opt, values, state: exit_id });
state.stack_push(insn_id);
}
YARVINSN_newarray => {
let count = get_arg(pc, 0).as_usize();
let exit_id = fun.push_insn(block, Insn::Snapshot { state: exit_state });
@ -5299,6 +5347,47 @@ mod tests {
"#]]);
}
// HIR snapshot for a regexp literal with three interpolated integers and no option
// suffix. The expected output has one `ToRegexp` that takes the three `AnyToString`
// results as its operands and prints no flag list.
#[test]
fn test_toregexp() {
eval(r##"
def test = /#{1}#{2}#{3}/
"##);
assert_method_hir_with_opcode("test", YARVINSN_toregexp, expect![[r#"
fn test@<compiled>:2:
bb0(v0:BasicObject):
v2:Fixnum[1] = Const Value(1)
v4:BasicObject = ObjToString v2
v6:String = AnyToString v2, str: v4
v7:Fixnum[2] = Const Value(2)
v9:BasicObject = ObjToString v7
v11:String = AnyToString v7, str: v9
v12:Fixnum[3] = Const Value(3)
v14:BasicObject = ObjToString v12
v16:String = AnyToString v12, str: v14
v18:RegexpExact = ToRegexp v6, v11, v16
Return v18
"#]]);
}
// HIR snapshot for an interpolated regexp literal carrying the `mixn` option suffix.
// The expected `ToRegexp` line lists its two operands followed by the flag names
// joined with `|`.
#[test]
fn test_toregexp_with_options() {
eval(r##"
def test = /#{1}#{2}/mixn
"##);
assert_method_hir_with_opcode("test", YARVINSN_toregexp, expect![[r#"
fn test@<compiled>:2:
bb0(v0:BasicObject):
v2:Fixnum[1] = Const Value(1)
v4:BasicObject = ObjToString v2
v6:String = AnyToString v2, str: v4
v7:Fixnum[2] = Const Value(2)
v9:BasicObject = ObjToString v7
v11:String = AnyToString v7, str: v9
v13:RegexpExact = ToRegexp v6, v11, MULTILINE|IGNORECASE|EXTENDED|NOENCODING
Return v13
"#]]);
}
#[test]
fn throw() {
eval("