linux/arch/riscv/kernel/module-sections.c
Samuel Holland be17c0df67
riscv: module: Optimize PLT/GOT entry counting
perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
inside module_frob_arch_sections(). This is because amdgpu.ko contains
about 300000 relocations in its .rela.text section, and the algorithm in
count_max_entries() takes quadratic time.

Apply two optimizations from the arm64 code, which together reduce the
total execution time by 99.58%. First, sort the relocations so duplicate
entries are adjacent. Second, reduce the number of relocations that must
be sorted by filtering to only relocations that need PLT/GOT entries, as
done in commit d4e0340919 ("arm64/module: Optimize module load time by
optimizing PLT counting").

Unlike the arm64 code, here the filtering and sorting is done in a
scratch buffer, because the HI20 relocation search optimization in
apply_relocate_add() depends on the original order of the relocations.
This allows accumulating PLT/GOT relocations across sections so sorting
and counting is only done once per module.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20250409171526.862481-3-samuel.holland@sifive.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
2025-06-05 11:09:43 -07:00

209 lines
5.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0
*
* Copyright (C) 2014-2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* Copyright (C) 2018 Andes Technology Corporation <zong@andestech.com>
*/
#include <linux/elf.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/sort.h>
unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
{
struct mod_section *got_sec = &mod->arch.got;
int i = got_sec->num_entries;
struct got_entry *got = get_got_entry(val, got_sec);
if (got)
return (unsigned long)got;
/* There is no duplicate entry, create a new one */
got = (struct got_entry *)got_sec->shdr->sh_addr;
got[i] = emit_got_entry(val);
got_sec->num_entries++;
BUG_ON(got_sec->num_entries > got_sec->max_entries);
return (unsigned long)&got[i];
}
unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
{
struct mod_section *got_plt_sec = &mod->arch.got_plt;
struct got_entry *got_plt;
struct mod_section *plt_sec = &mod->arch.plt;
struct plt_entry *plt = get_plt_entry(val, plt_sec, got_plt_sec);
int i = plt_sec->num_entries;
if (plt)
return (unsigned long)plt;
/* There is no duplicate entry, create a new one */
got_plt = (struct got_entry *)got_plt_sec->shdr->sh_addr;
got_plt[i] = emit_got_entry(val);
plt = (struct plt_entry *)plt_sec->shdr->sh_addr;
plt[i] = emit_plt_entry(val,
(unsigned long)&plt[i],
(unsigned long)&got_plt[i]);
plt_sec->num_entries++;
got_plt_sec->num_entries++;
BUG_ON(plt_sec->num_entries > plt_sec->max_entries);
return (unsigned long)&plt[i];
}
#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))
static int cmp_rela(const void *a, const void *b)
{
const Elf_Rela *x = a, *y = b;
int i;
/* sort by type, symbol index and addend */
i = cmp_3way(x->r_info, y->r_info);
if (i == 0)
i = cmp_3way(x->r_addend, y->r_addend);
return i;
}
static bool duplicate_rela(const Elf_Rela *rela, int idx)
{
/*
* Entries are sorted by type, symbol index and addend. That means
* that, if a duplicate entry exists, it must be in the preceding slot.
*/
return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
}
static void count_max_entries(const Elf_Rela *relas, size_t num,
unsigned int *plts, unsigned int *gots)
{
for (size_t i = 0; i < num; i++) {
if (duplicate_rela(relas, i))
continue;
switch (ELF_R_TYPE(relas[i].r_info)) {
case R_RISCV_CALL_PLT:
case R_RISCV_PLT32:
(*plts)++;
break;
case R_RISCV_GOT_HI20:
(*gots)++;
break;
default:
unreachable();
}
}
}
static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
{
switch (ELF_R_TYPE(rela->r_info)) {
case R_RISCV_CALL_PLT:
case R_RISCV_GOT_HI20:
case R_RISCV_PLT32:
return true;
default:
return false;
}
}
int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
char *secstrings, struct module *mod)
{
size_t num_scratch_relas = 0;
unsigned int num_plts = 0;
unsigned int num_gots = 0;
Elf_Rela *scratch = NULL;
size_t scratch_size = 0;
int i;
/*
* Find the empty .got and .plt sections.
*/
for (i = 0; i < ehdr->e_shnum; i++) {
if (!strcmp(secstrings + sechdrs[i].sh_name, ".plt"))
mod->arch.plt.shdr = sechdrs + i;
else if (!strcmp(secstrings + sechdrs[i].sh_name, ".got"))
mod->arch.got.shdr = sechdrs + i;
else if (!strcmp(secstrings + sechdrs[i].sh_name, ".got.plt"))
mod->arch.got_plt.shdr = sechdrs + i;
}
if (!mod->arch.plt.shdr) {
pr_err("%s: module PLT section(s) missing\n", mod->name);
return -ENOEXEC;
}
if (!mod->arch.got.shdr) {
pr_err("%s: module GOT section(s) missing\n", mod->name);
return -ENOEXEC;
}
if (!mod->arch.got_plt.shdr) {
pr_err("%s: module GOT.PLT section(s) missing\n", mod->name);
return -ENOEXEC;
}
/* Calculate the maxinum number of entries */
for (i = 0; i < ehdr->e_shnum; i++) {
size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
size_t scratch_size_needed;
if (sechdrs[i].sh_type != SHT_RELA)
continue;
/* ignore relocations that operate on non-exec sections */
if (!(dst_sec->sh_flags & SHF_EXECINSTR))
continue;
/*
* apply_relocate_add() relies on HI20 and LO12 relocation pairs being
* close together, so sort a copy of the section to avoid interfering.
*/
scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
if (scratch_size_needed > scratch_size) {
scratch_size = scratch_size_needed;
scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
if (!scratch)
return -ENOMEM;
}
for (size_t j = 0; j < num_relas; j++)
if (rela_needs_plt_got_entry(&relas[j]))
scratch[num_scratch_relas++] = relas[j];
}
if (scratch) {
/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
kvfree(scratch);
}
mod->arch.plt.shdr->sh_type = SHT_NOBITS;
mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES;
mod->arch.plt.shdr->sh_size = (num_plts + 1) * sizeof(struct plt_entry);
mod->arch.plt.num_entries = 0;
mod->arch.plt.max_entries = num_plts;
mod->arch.got.shdr->sh_type = SHT_NOBITS;
mod->arch.got.shdr->sh_flags = SHF_ALLOC;
mod->arch.got.shdr->sh_addralign = L1_CACHE_BYTES;
mod->arch.got.shdr->sh_size = (num_gots + 1) * sizeof(struct got_entry);
mod->arch.got.num_entries = 0;
mod->arch.got.max_entries = num_gots;
mod->arch.got_plt.shdr->sh_type = SHT_NOBITS;
mod->arch.got_plt.shdr->sh_flags = SHF_ALLOC;
mod->arch.got_plt.shdr->sh_addralign = L1_CACHE_BYTES;
mod->arch.got_plt.shdr->sh_size = (num_plts + 1) * sizeof(struct got_entry);
mod->arch.got_plt.num_entries = 0;
mod->arch.got_plt.max_entries = num_plts;
return 0;
}