mirror of
https://github.com/torvalds/linux.git
synced 2025-08-15 06:01:56 +02:00

When booting a new kernel with kexec_file, the kernel picks a target location that the kernel should live at, then allocates random pages, checks whether any of those pages magically happens to coincide with a target address range and if so, uses them for that range. For every page allocated this way, it then creates a page list that the relocation code - code that executes while all CPUs are off and we are just about to jump into the new kernel - copies to their final memory location. We can not put them there before, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. Copying is happening from a single CPU at RAM rate, which takes around 4-50 ms per 100 MiB. All of this is inefficient and error prone. To successfully kexec, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*) and hence the new kernel was corrupted. This started a month long journey to root cause failing kexecs to eventually see memory corruption, because the new kernel was corrupted severely enough that it could not emit output to tell us about the fact that it was corrupted. By allocating memory for the next kernel from a memory range that is guaranteed scribbling free, we can boot the next kernel up to a point where it is at least able to detect corruption and maybe even stop it before it becomes severe. This increases the chance for successful kexecs. Since kexec got introduced, Linux has gained the CMA framework which can perform physically contiguous memory mappings, while keeping that memory available for movable memory when it is not needed for contiguous allocations. The default CMA allocator is for DMA allocations. This patch adds logic to the kexec file loader to attempt to place the target payload at a location allocated from CMA. 
If successful, it uses that memory range directly instead of creating copy instructions during the hot phase. To ensure that there is a safety net in case anything goes wrong with the CMA allocation, it also adds a flag for user space to force disable CMA allocations. Using CMA allocations has two advantages: 1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the hot phase. 2) More robust. Even if by accident some page is still in use for DMA, the new kernel image will be safe from that access because it resides in a memory region that is considered allocated in the old kernel and has a chance to reinitialize that component. Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com Signed-off-by: Alexander Graf <graf@amazon.com> Acked-by: Baoquan He <bhe@redhat.com> Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Zhongkun He <hezhongkun.hzk@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
145 lines
3.5 KiB
C
145 lines
3.5 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Load ELF vmlinux file for the kexec_file_load syscall.
|
|
*
|
|
* Copyright (C) 2021 Huawei Technologies Co, Ltd.
|
|
*
|
|
* Author: Liao Chang (liaochang1@huawei.com)
|
|
*
|
|
* Based on kexec-tools' kexec-elf-riscv.c, heavily modified
|
|
* for kernel.
|
|
*/
|
|
|
|
#define pr_fmt(fmt) "kexec_image: " fmt
|
|
|
|
#include <linux/elf.h>
|
|
#include <linux/kexec.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/of.h>
|
|
#include <linux/libfdt.h>
|
|
#include <linux/types.h>
|
|
#include <linux/memblock.h>
|
|
#include <asm/setup.h>
|
|
|
|
static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
|
|
struct kexec_elf_info *elf_info, unsigned long old_pbase,
|
|
unsigned long new_pbase)
|
|
{
|
|
int i;
|
|
int ret = 0;
|
|
size_t size;
|
|
struct kexec_buf kbuf;
|
|
const struct elf_phdr *phdr;
|
|
|
|
kbuf.image = image;
|
|
|
|
for (i = 0; i < ehdr->e_phnum; i++) {
|
|
phdr = &elf_info->proghdrs[i];
|
|
if (phdr->p_type != PT_LOAD)
|
|
continue;
|
|
|
|
size = phdr->p_filesz;
|
|
if (size > phdr->p_memsz)
|
|
size = phdr->p_memsz;
|
|
|
|
kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset;
|
|
kbuf.bufsz = size;
|
|
kbuf.buf_align = phdr->p_align;
|
|
kbuf.mem = phdr->p_paddr - old_pbase + new_pbase;
|
|
kbuf.memsz = phdr->p_memsz;
|
|
kbuf.top_down = false;
|
|
ret = kexec_add_buffer(&kbuf);
|
|
if (ret)
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Go through the available physical memory regions and find one that can
 * hold an image of the specified size.
 */
|
|
static int elf_find_pbase(struct kimage *image, unsigned long kernel_len,
|
|
struct elfhdr *ehdr, struct kexec_elf_info *elf_info,
|
|
unsigned long *old_pbase, unsigned long *new_pbase)
|
|
{
|
|
int i;
|
|
int ret;
|
|
struct kexec_buf kbuf;
|
|
const struct elf_phdr *phdr;
|
|
unsigned long lowest_paddr = ULONG_MAX;
|
|
unsigned long lowest_vaddr = ULONG_MAX;
|
|
|
|
for (i = 0; i < ehdr->e_phnum; i++) {
|
|
phdr = &elf_info->proghdrs[i];
|
|
if (phdr->p_type != PT_LOAD)
|
|
continue;
|
|
|
|
if (lowest_paddr > phdr->p_paddr)
|
|
lowest_paddr = phdr->p_paddr;
|
|
|
|
if (lowest_vaddr > phdr->p_vaddr)
|
|
lowest_vaddr = phdr->p_vaddr;
|
|
}
|
|
|
|
kbuf.image = image;
|
|
kbuf.buf_min = lowest_paddr;
|
|
kbuf.buf_max = ULONG_MAX;
|
|
|
|
/*
|
|
* Current riscv boot protocol requires 2MB alignment for
|
|
* RV64 and 4MB alignment for RV32
|
|
*
|
|
*/
|
|
kbuf.buf_align = PMD_SIZE;
|
|
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
|
|
kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE);
|
|
kbuf.cma = NULL;
|
|
kbuf.top_down = false;
|
|
ret = arch_kexec_locate_mem_hole(&kbuf);
|
|
if (!ret) {
|
|
*old_pbase = lowest_paddr;
|
|
*new_pbase = kbuf.mem;
|
|
image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
|
|
unsigned long kernel_len, char *initrd,
|
|
unsigned long initrd_len, char *cmdline,
|
|
unsigned long cmdline_len)
|
|
{
|
|
int ret;
|
|
unsigned long old_kernel_pbase = ULONG_MAX;
|
|
unsigned long new_kernel_pbase = 0UL;
|
|
struct elfhdr ehdr;
|
|
struct kexec_elf_info elf_info;
|
|
|
|
ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
|
|
ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info,
|
|
&old_kernel_pbase, &new_kernel_pbase);
|
|
if (ret)
|
|
goto out;
|
|
|
|
/* Add the kernel binary to the image */
|
|
ret = riscv_kexec_elf_load(image, &ehdr, &elf_info,
|
|
old_kernel_pbase, new_kernel_pbase);
|
|
if (ret)
|
|
goto out;
|
|
|
|
ret = load_extra_segments(image, image->start, kernel_len,
|
|
initrd, initrd_len, cmdline, cmdline_len);
|
|
out:
|
|
kexec_free_elf_info(&elf_info);
|
|
return ret ? ERR_PTR(ret) : NULL;
|
|
}
|
|
|
|
/* Handlers this file registers for kexec_file_load() of ELF images. */
const struct kexec_file_ops elf_kexec_ops = {
	.probe = kexec_elf_probe,
	.load = elf_kexec_load,
};
|