Allow encodings to be autoloaded through transcoding functions

Make sure VM lock is not held when calling `load_transcoder_entry`, as
that causes deadlock inside ractors. `String#encode` now works inside
ractors, among others.

Atomic load the rb_encoding_list

Without this, wbcheck would sometimes hit a missing write barrier.

Co-authored-by: John Hawthorn <john.hawthorn@shopify.com>

Hold VM lock when iterating over global_enc_table.names

This st_table can be inserted into at runtime when autoloading
encodings.

minor optimization when calling Encoding.list
This commit is contained in:
Luke Gruber 2025-08-06 14:30:03 -04:00 committed by John Hawthorn
parent 31e8a9fced
commit 1afc07e815
4 changed files with 157 additions and 94 deletions

View file

@ -29,6 +29,7 @@
#include "ruby/util.h"
#include "ruby_assert.h"
#include "vm_sync.h"
#include "ruby_atomic.h"
#ifndef ENC_DEBUG
#define ENC_DEBUG 0
@ -144,10 +145,14 @@ enc_list_update(int index, rb_raw_encoding *encoding)
{
RUBY_ASSERT(index < ENCODING_LIST_CAPA);
VALUE list = rb_encoding_list;
VALUE list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);
if (list && NIL_P(rb_ary_entry(list, index))) {
VALUE new_list = rb_ary_dup(list);
RBASIC_CLEAR_CLASS(new_list);
/* initialize encoding data */
rb_ary_store(list, index, enc_new(encoding));
rb_ary_store(new_list, index, enc_new(encoding));
RUBY_ATOMIC_VALUE_SET(rb_encoding_list, new_list);
}
}
@ -157,7 +162,7 @@ enc_list_lookup(int idx)
VALUE list, enc = Qnil;
if (idx < ENCODING_LIST_CAPA) {
list = rb_encoding_list;
list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);
RUBY_ASSERT(list);
enc = rb_ary_entry(list, idx);
}
@ -258,6 +263,7 @@ must_encindex(int index)
int
rb_to_encoding_index(VALUE enc)
{
ASSERT_vm_unlocking(); // can load encoding, so must not hold VM lock
int idx;
const char *name;
@ -667,15 +673,15 @@ int
rb_enc_alias(const char *alias, const char *orig)
{
int idx, r;
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
enc_check_addable(enc_table, alias); // can raise
}
idx = rb_enc_find_index(orig);
if (idx < 0) return -1;
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
enc_check_addable(enc_table, alias);
if ((idx = rb_enc_find_index(orig)) < 0) {
r = -1;
}
else {
r = enc_alias(enc_table, alias, idx);
}
r = enc_alias(enc_table, alias, idx);
}
return r;
@ -742,6 +748,7 @@ int rb_require_internal_silent(VALUE fname);
static int
load_encoding(const char *name)
{
ASSERT_vm_unlocking();
VALUE enclib = rb_sprintf("enc/%s.so", name);
VALUE debug = ruby_debug;
VALUE errinfo;
@ -757,7 +764,7 @@ load_encoding(const char *name)
enclib = rb_fstring(enclib);
ruby_debug = Qfalse;
errinfo = rb_errinfo();
loaded = rb_require_internal_silent(enclib);
loaded = rb_require_internal_silent(enclib); // must run without VM_LOCK
ruby_debug = debug;
rb_set_errinfo(errinfo);
@ -781,6 +788,7 @@ enc_autoload_body(rb_encoding *enc)
{
rb_encoding *base;
int i = 0;
ASSERT_vm_unlocking();
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
base = enc_table->list[ENC_TO_ENCINDEX(enc)].base;
@ -792,30 +800,32 @@ enc_autoload_body(rb_encoding *enc)
}
} while (enc_table->list[i].enc != base && (++i, 1));
}
}
if (i != -1) {
if (base) {
bool do_register = true;
if (rb_enc_autoload_p(base)) {
if (rb_enc_autoload(base) < 0) {
do_register = false;
i = -1;
}
if (i != -1) {
if (base) {
bool do_register = true;
if (rb_enc_autoload_p(base)) {
if (rb_enc_autoload(base) < 0) {
do_register = false;
i = -1;
}
}
i = enc->ruby_encoding_index;
if (do_register) {
if (do_register) {
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
i = enc->ruby_encoding_index;
enc_register_at(enc_table, i & ENC_INDEX_MASK, rb_enc_name(enc), base);
((rb_raw_encoding *)enc)->ruby_encoding_index = i;
}
}
i &= ENC_INDEX_MASK;
}
else {
i = -2;
}
i &= ENC_INDEX_MASK;
}
else {
i = -2;
}
}
return i;
@ -824,6 +834,7 @@ enc_autoload_body(rb_encoding *enc)
int
rb_enc_autoload(rb_encoding *enc)
{
ASSERT_vm_unlocking();
int i = enc_autoload_body(enc);
if (i == -2) {
i = load_encoding(rb_enc_name(enc));
@ -844,6 +855,7 @@ int
rb_enc_find_index(const char *name)
{
int i;
ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
i = enc_registered(enc_table, name);
}
@ -1019,7 +1031,6 @@ rb_enc_associate_index(VALUE obj, int idx)
rb_encoding *enc;
int oldidx, oldtermlen, termlen;
/* enc_check_capable(obj);*/
rb_check_frozen(obj);
oldidx = rb_enc_get_index(obj);
if (oldidx == idx)
@ -1355,7 +1366,10 @@ enc_names(VALUE self)
args[0] = (VALUE)rb_to_encoding_index(self);
args[1] = rb_ary_new2(0);
st_foreach(global_enc_table.names, enc_names_i, (st_data_t)args);
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
st_foreach(enc_table->names, enc_names_i, (st_data_t)args);
}
return args[1];
}
@ -1380,8 +1394,9 @@ enc_names(VALUE self)
static VALUE
enc_list(VALUE klass)
{
VALUE ary = rb_ary_new2(0);
rb_ary_replace(ary, rb_encoding_list);
VALUE ary = rb_ary_new2(ENCODING_LIST_CAPA);
VALUE list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);
rb_ary_replace(ary, list);
return ary;
}
@ -1526,6 +1541,9 @@ int rb_locale_charmap_index(void);
int
rb_locale_encindex(void)
{
// `rb_locale_charmap_index` can call `enc_find_index`, which can
// load an encoding. This needs to be done without VM lock held.
ASSERT_vm_unlocking();
int idx = rb_locale_charmap_index();
if (idx < 0) idx = ENCINDEX_UTF_8;
@ -1584,6 +1602,10 @@ enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const cha
/* Already set */
overridden = TRUE;
if (!NIL_P(encoding)) {
enc_check_encoding(encoding); // loads it if necessary. Needs to be done outside of VM lock.
}
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
if (NIL_P(encoding)) {
def->index = -1;
@ -1854,8 +1876,11 @@ rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
static VALUE
rb_enc_name_list(VALUE klass)
{
VALUE ary = rb_ary_new2(global_enc_table.names->num_entries);
st_foreach(global_enc_table.names, rb_enc_name_list_i, (st_data_t)ary);
VALUE ary;
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
ary = rb_ary_new2(enc_table->names->num_entries);
st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary);
}
return ary;
}
@ -1901,7 +1926,9 @@ rb_enc_aliases(VALUE klass)
aliases[0] = rb_hash_new();
aliases[1] = rb_ary_new();
st_foreach(global_enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases);
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases);
}
return aliases[0];
}
@ -1969,9 +1996,9 @@ Init_Encoding(void)
struct enc_table *enc_table = &global_enc_table;
rb_gc_register_address(&rb_encoding_list);
list = rb_encoding_list = rb_ary_new2(ENCODING_LIST_CAPA);
RBASIC_CLEAR_CLASS(list);
rb_vm_register_global_object(list);
for (i = 0; i < enc_table->count; ++i) {
rb_ary_push(list, enc_new(enc_table->list[i].enc));
@ -2003,5 +2030,7 @@ Init_encodings(void)
void
rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg)
{
st_foreach(global_enc_table.names, func, arg);
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
st_foreach(enc_table->names, func, arg);
}
}

37
hash.c
View file

@ -5192,25 +5192,26 @@ env_enc_str_new(const char *ptr, long len, rb_encoding *enc)
}
static VALUE
env_str_new(const char *ptr, long len)
env_str_new(const char *ptr, long len, rb_encoding *enc)
{
return env_enc_str_new(ptr, len, env_encoding());
return env_enc_str_new(ptr, len, enc);
}
static VALUE
env_str_new2(const char *ptr)
env_str_new2(const char *ptr, rb_encoding *enc)
{
if (!ptr) return Qnil;
return env_str_new(ptr, strlen(ptr));
return env_str_new(ptr, strlen(ptr), enc);
}
static VALUE
getenv_with_lock(const char *name)
{
VALUE ret;
rb_encoding *enc = env_encoding();
ENV_LOCKING() {
const char *val = getenv(name);
ret = env_str_new2(val);
ret = env_str_new2(val, enc);
}
return ret;
}
@ -5773,13 +5774,14 @@ env_values(void)
{
VALUE ary = rb_ary_new();
rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
while (*env) {
char *s = strchr(*env, '=');
if (s) {
rb_ary_push(ary, env_str_new2(s+1));
rb_ary_push(ary, env_str_new2(s+1, enc));
}
env++;
}
@ -5865,14 +5867,15 @@ env_each_pair(VALUE ehash)
VALUE ary = rb_ary_new();
rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
while (*env) {
char *s = strchr(*env, '=');
if (s) {
rb_ary_push(ary, env_str_new(*env, s-*env));
rb_ary_push(ary, env_str_new2(s+1));
rb_ary_push(ary, env_str_new(*env, s-*env, enc));
rb_ary_push(ary, env_str_new2(s+1, enc));
}
env++;
}
@ -6255,13 +6258,14 @@ env_to_a(VALUE _)
{
VALUE ary = rb_ary_new();
rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
while (*env) {
char *s = strchr(*env, '=');
if (s) {
rb_ary_push(ary, rb_assoc_new(env_str_new(*env, s-*env),
env_str_new2(s+1)));
rb_ary_push(ary, rb_assoc_new(env_str_new(*env, s-*env, enc),
env_str_new2(s+1, enc)));
}
env++;
}
@ -6509,6 +6513,7 @@ env_key(VALUE dmy, VALUE value)
StringValue(value);
VALUE str = Qnil;
rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
while (*env) {
@ -6516,7 +6521,7 @@ env_key(VALUE dmy, VALUE value)
if (s++) {
long len = strlen(s);
if (RSTRING_LEN(value) == len && strncmp(s, RSTRING_PTR(value), len) == 0) {
str = env_str_new(*env, s-*env-1);
str = env_str_new(*env, s-*env-1, enc);
break;
}
}
@ -6533,13 +6538,14 @@ env_to_hash(void)
{
VALUE hash = rb_hash_new();
rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
while (*env) {
char *s = strchr(*env, '=');
if (s) {
rb_hash_aset(hash, env_str_new(*env, s-*env),
env_str_new2(s+1));
rb_hash_aset(hash, env_str_new(*env, s-*env, enc),
env_str_new2(s+1, enc));
}
env++;
}
@ -6684,14 +6690,15 @@ env_shift(VALUE _)
VALUE result = Qnil;
VALUE key = Qnil;
rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
if (*env) {
const char *p = *env;
char *s = strchr(p, '=');
if (s) {
key = env_str_new(p, s-p);
VALUE val = env_str_new2(getenv(RSTRING_PTR(key)));
key = env_str_new(p, s-p, enc);
VALUE val = env_str_new2(getenv(RSTRING_PTR(key)), enc);
result = rb_assoc_new(key, val);
}
}

View file

@ -2320,6 +2320,46 @@ class TestTranscode < Test::Unit::TestCase
assert_equal("A\nB\nC", s.encode(usascii, newline: :lf))
end
def test_ractor_lazy_load_encoding
assert_ractor("#{<<~"begin;"}\n#{<<~'end;'}")
begin;
rs = []
autoload_encodings = Encoding.list.select { |e| e.inspect.include?("(autoload)") }.freeze
7.times do
rs << Ractor.new(autoload_encodings) do |encodings|
str = "\u0300"
encodings.each do |enc|
str.encode(enc) rescue Encoding::UndefinedConversionError
end
end
end
while rs.any?
r, _obj = Ractor.select(*rs)
rs.delete(r)
end
assert rs.empty?
end;
end
def test_ractor_lazy_load_encoding_random
assert_ractor("#{<<~"begin;"}\n#{<<~'end;'}")
begin;
rs = []
100.times do
rs << Ractor.new do
"\u0300".encode(Encoding.list.sample) rescue Encoding::UndefinedConversionError
end
end
while rs.any?
r, _obj = Ractor.select(*rs)
rs.delete(r)
end
assert rs.empty?
end;
end
private
def assert_conversion_both_ways_utf8(utf8, raw, encoding)

View file

@ -340,7 +340,7 @@ transcode_search_path(const char *sname, const char *dname,
bfs.queue_last_ptr = &q->next;
bfs.queue = q;
bfs.visited = st_init_strcasetable();
bfs.visited = st_init_strcasetable(); // due to base encodings, we need to do search in a loop
st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
RB_VM_LOCKING() {
@ -351,14 +351,14 @@ transcode_search_path(const char *sname, const char *dname,
bfs.queue_last_ptr = &bfs.queue;
}
lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val);
lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); // src => table2
if (!lookup_res) {
xfree(q);
continue;
}
table2 = (st_table *)val;
if (st_lookup(table2, (st_data_t)dname, &val)) {
if (st_lookup(table2, (st_data_t)dname, &val)) { // dest => econv
st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
xfree(q);
found = true;
@ -411,8 +411,7 @@ int rb_require_internal_silent(VALUE fname);
static const rb_transcoder *
load_transcoder_entry(transcoder_entry_t *entry)
{
// changes result of entry->transcoder depending on if it's required or not, so needs lock
ASSERT_vm_locking();
ASSERT_vm_unlocking();
if (entry->transcoder)
return entry->transcoder;
@ -427,7 +426,7 @@ load_transcoder_entry(transcoder_entry_t *entry)
memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
rb_str_set_len(fn, total_len);
OBJ_FREEZE(fn);
rb_require_internal_silent(fn);
rb_require_internal_silent(fn); // Sets entry->transcoder
}
if (entry->transcoder)
@ -981,7 +980,6 @@ rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
{
rb_econv_t *ec;
int i, ret;
ASSERT_vm_locking();
for (i = 0; i < n; i++) {
const rb_transcoder *tr;
@ -1026,10 +1024,8 @@ rb_econv_open0(const char *sname, const char *dname, int ecflags)
transcoder_entry_t **entries = NULL;
int num_trans;
rb_econv_t *ec;
ASSERT_vm_locking();
/* Just check if sname and dname are defined */
/* (This check is needed?) */
// loads encodings if not loaded already
if (*sname) rb_enc_find_index(sname);
if (*dname) rb_enc_find_index(dname);
@ -1117,15 +1113,13 @@ rb_econv_open(const char *sname, const char *dname, int ecflags)
if (num_decorators == -1)
return NULL;
RB_VM_LOCKING() {
ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
if (ec) {
for (i = 0; i < num_decorators; i++) {
if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
rb_econv_close(ec);
ec = NULL;
break;
}
ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
if (ec) {
for (i = 0; i < num_decorators; i++) {
if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
rb_econv_close(ec);
ec = NULL;
break;
}
}
}
@ -1960,12 +1954,9 @@ rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int
if (ec->started != 0)
return -1;
RB_VM_LOCKING() {
entry = get_transcoder_entry(sname, dname);
if (entry) {
tr = load_transcoder_entry(entry);
}
entry = get_transcoder_entry(sname, dname);
if (entry) {
tr = load_transcoder_entry(entry);
}
return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1;
@ -2681,21 +2672,19 @@ rb_econv_open_opts(const char *source_encoding, const char *destination_encoding
replacement = rb_hash_aref(opthash, sym_replace);
}
RB_VM_LOCKING() {
ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
if (ec) {
if (!NIL_P(replacement)) {
int ret;
rb_encoding *enc = rb_enc_get(replacement);
ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
if (ec) {
if (!NIL_P(replacement)) {
int ret;
rb_encoding *enc = rb_enc_get(replacement);
ret = rb_econv_set_replacement(ec,
(const unsigned char *)RSTRING_PTR(replacement),
RSTRING_LEN(replacement),
rb_enc_name(enc));
if (ret == -1) {
rb_econv_close(ec);
ec = NULL;
}
ret = rb_econv_set_replacement(ec,
(const unsigned char *)RSTRING_PTR(replacement),
RSTRING_LEN(replacement),
rb_enc_name(enc));
if (ret == -1) {
rb_econv_close(ec);
ec = NULL;
}
}
}
@ -3132,10 +3121,8 @@ decorate_convpath(VALUE convpath, int ecflags)
const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
transcoder_entry_t *entry;
const rb_transcoder *tr;
RB_VM_LOCKING() {
entry = get_transcoder_entry(sname, dname);
tr = load_transcoder_entry(entry);
}
entry = get_transcoder_entry(sname, dname);
tr = load_transcoder_entry(entry);
if (!tr)
return -1;
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&