
When booting a new kernel with kexec_file, the kernel picks a target location that the new kernel should live at, then allocates random pages and checks whether any of those pages magically happens to coincide with a target address range; if so, it uses them for that range.

For every page allocated this way, it then creates an entry in a page list that the relocation code (code that executes while all CPUs are off and we are just about to jump into the new kernel) walks in order to copy each page to its final location. We cannot put the pages there earlier, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. The copying happens from a single CPU at RAM rate, which takes around 4-50 ms per 100 MiB.

All of this is inefficient and error-prone.

To successfully kexec, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*), and hence the new kernel was corrupted. This started a month-long journey to root-cause failing kexecs and eventually find the memory corruption, because the new kernel was corrupted severely enough that it could not emit output to tell us about the fact that it was corrupted.

By allocating memory for the next kernel from a memory range that is guaranteed to be free of such scribbling, we can boot the next kernel up to a point where it is at least able to detect corruption, and maybe even stop it before it becomes severe. This increases the chance of a successful kexec.

Since kexec was introduced, Linux has gained the CMA framework, which can hand out physically contiguous memory while keeping that memory available for movable allocations whenever it is not needed for contiguous ones. The default CMA area is used for DMA allocations.

This patch adds logic to the kexec file loader to attempt to place the target payload at a location allocated from CMA. If that succeeds, it uses the memory range directly instead of creating copy instructions during the hot phase. To provide a safety net in case anything goes wrong with the CMA allocation, it also adds a flag that lets user space force-disable CMA allocations.

Using CMA allocations has two advantages:

  1) Speed: we save the 4-50 ms per 100 MiB, because there is no more
     need to copy in the hot phase.

  2) Robustness: even if some page is accidentally still in use for DMA,
     the new kernel image is safe from that access, because it resides in
     a memory region that the old kernel considers allocated, and the new
     kernel gets a chance to reinitialize the offending component.

Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com
Signed-off-by: Alexander Graf <graf@amazon.com>
Acked-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Zhongkun He <hezhongkun.hzk@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
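To make the intended flow concrete, here is a minimal sketch of the CMA-first placement described above. It is illustrative only: kexec_cma_alloc() is a hypothetical helper name, while cma_alloc() and dma_contiguous_default_area are the real CMA entry points. A NULL return is the signal to fall back to the existing page-list copy path.

    #include <linux/cma.h>
    #include <linux/dma-map-ops.h>	/* dma_contiguous_default_area */
    #include <linux/mm.h>

    /* Hypothetical helper, not the actual patch: try to carve the whole
     * segment out of the default CMA area so that no hot-phase copy is
     * needed. NULL means "use the old page-list relocation path". */
    static struct page *kexec_cma_alloc(unsigned long segment_bytes)
    {
            unsigned long nr_pages = PAGE_ALIGN(segment_bytes) >> PAGE_SHIFT;

            if (!dma_contiguous_default_area)
                    return NULL;	/* no CMA area configured */

            /* may fail under memory pressure; that is fine, we fall back */
            return cma_alloc(dma_contiguous_default_area, nr_pages, 0, true);
    }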
58 lines · 1.7 KiB · C
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_KEXEC_INTERNAL_H
#define LINUX_KEXEC_INTERNAL_H

#include <linux/kexec.h>

struct kexec_segment;

struct kimage *do_kimage_alloc_init(void);
int sanity_check_segment_list(struct kimage *image);
void kimage_free_page_list(struct list_head *list);
void kimage_free(struct kimage *image);
int kimage_load_segment(struct kimage *image, int idx);
void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
				unsigned long start, unsigned long end);

/*
 * Whatever is used to serialize accesses to the kexec_crash_image needs to be
 * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
 * "simple" atomic variable that is acquired with a cmpxchg().
 */
extern atomic_t __kexec_lock;
static inline bool kexec_trylock(void)
{
	int old = 0;

	return atomic_try_cmpxchg_acquire(&__kexec_lock, &old, 1);
}
static inline void kexec_unlock(void)
{
	atomic_set_release(&__kexec_lock, 0);
}

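/*
 * Typical usage (e.g. in kernel_kexec()): take the lock for the whole
 * operation and bail out if another kexec operation is in flight:
 *
 *	if (!kexec_trylock())
 *		return -EBUSY;
 *	...load or execute the image...
 *	kexec_unlock();
 */
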
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>
void kimage_file_post_load_cleanup(struct kimage *image);
extern char kexec_purgatory[];
extern size_t kexec_purgatory_size;
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
#endif /* CONFIG_KEXEC_FILE */

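/* struct kexec_buf is defined in <linux/kexec.h>; it describes a buffer
 * that a file loader wants placed in the next kernel's memory and is the
 * placement request that the KHO hooks below operate on. */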
struct kexec_buf;

#ifdef CONFIG_KEXEC_HANDOVER
int kho_locate_mem_hole(struct kexec_buf *kbuf,
			int (*func)(struct resource *, void *));
int kho_fill_kimage(struct kimage *image);
#else
static inline int kho_locate_mem_hole(struct kexec_buf *kbuf,
				      int (*func)(struct resource *, void *))
{
	return 1;
}

static inline int kho_fill_kimage(struct kimage *image) { return 0; }
#endif /* CONFIG_KEXEC_HANDOVER */
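
/*
 * Return convention for kho_locate_mem_hole(): 0 means KHO satisfied the
 * placement, a negative value is an error, and 1 (what the !KHO stub
 * above returns) signals the caller to fall back to the regular
 * memory-hole walk.
 */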

#endif /* LINUX_KEXEC_INTERNAL_H */