
Patch series "kdump: crashkernel reservation from CMA", v5. This series implements a way to reserve additional crash kernel memory using CMA. Currently, all the memory for the crash kernel is not usable by the 1st (production) kernel. It is also unmapped so that it can't be corrupted by the fault that will eventually trigger the crash. This makes sense for the memory actually used by the kexec-loaded crash kernel image and initrd and the data prepared during the load (vmcoreinfo, ...). However, the reserved space needs to be much larger than that to provide enough run-time memory for the crash kernel and the kdump userspace. Estimating the amount of memory to reserve is difficult. Being too careful makes kdump likely to end in OOM, being too generous takes even more memory from the production system. Also, the reservation only allows reserving a single contiguous block (or two with the "low" suffix). I've seen systems where this fails because the physical memory is fragmented. By reserving additional crashkernel memory from CMA, the main crashkernel reservation can be just large enough to fit the kernel and initrd image, minimizing the memory taken away from the production system. Most of the run-time memory for the crash kernel will be memory previously available to userspace in the production system. As this memory is no longer wasted, the reservation can be done with a generous margin, making kdump more reliable. Kernel memory that we need to preserve for dumping is normally not allocated from CMA, unless it is explicitly allocated as movable. Currently this is only the case for memory ballooning and zswap. Such movable memory will be missing from the vmcore. User data is typically not dumped by makedumpfile. When dumping of user data is intended this new CMA reservation cannot be used. There are five patches in this series: The first adds a new ",cma" suffix to the recenly introduced generic crashkernel parsing code. parse_crashkernel() takes one more argument to store the cma reservation size. The second patch implements reserve_crashkernel_cma() which performs the reservation. If the requested size is not available in a single range, multiple smaller ranges will be reserved. The third patch updates Documentation/, explicitly mentioning the potential DMA corruption of the CMA-reserved memory. The fourth patch adds a short delay before booting the kdump kernel, allowing pending DMA transfers to finish. The fifth patch enables the functionality for x86 as a proof of concept. There are just three things every arch needs to do: - call reserve_crashkernel_cma() - include the CMA-reserved ranges in the physical memory map - exclude the CMA-reserved ranges from the memory available through /proc/vmcore by excluding them from the vmcoreinfo PT_LOAD ranges. Adding other architectures is easy and I can do that as soon as this series is merged. With this series applied, specifying crashkernel=100M craskhernel=1G,cma on the command line will make a standard crashkernel reservation of 100M, where kexec will load the kernel and initrd. An additional 1G will be reserved from CMA, still usable by the production system. The crash kernel will have 1.1G memory available. The 100M can be reliably predicted based on the size of the kernel and initrd. The new cma suffix is completely optional. When no crashkernel=size,cma is specified, everything works as before. This patch (of 5): Add a new cma_size parameter to parse_crashkernel(). 
When not NULL, call __parse_crashkernel to parse the CMA reservation size from "crashkernel=size,cma" and store it in cma_size. Set cma_size to NULL in all calls to parse_crashkernel(). Link: https://lkml.kernel.org/r/aEqnxxfLZMllMC8I@dwarf.suse.cz Link: https://lkml.kernel.org/r/aEqoQckgoTQNULnh@dwarf.suse.cz Signed-off-by: Jiri Bohac <jbohac@suse.cz> Cc: Baoquan He <bhe@redhat.com> Cc: Dave Young <dyoung@redhat.com> Cc: Donald Dutile <ddutile@redhat.com> Cc: Michal Hocko <mhocko@suse.cz> Cc: Philipp Rudo <prudo@redhat.com> Cc: Pingfan Liu <piliu@redhat.com> Cc: Tao Liu <ltao@redhat.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
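
To illustrate the interface this series arrives at, below is a minimal
sketch of the arch-side wiring described above.  It is reconstructed
from the series description, not taken from the patches: the function
name example_arch_reserve_crashkernel() is hypothetical, and the exact
parameter order of parse_crashkernel() and the signature of
reserve_crashkernel_cma() are assumptions.

/*
 * Sketch only: an arch that opts in parses the ",cma" size and hands
 * it to reserve_crashkernel_cma() (added in patch 2 of this series).
 * The parameter order and reserve_crashkernel_cma() signature are
 * assumed from the cover letter, not copied from the patches.
 */
void __init example_arch_reserve_crashkernel(void)
{
	unsigned long long crash_size, crash_base, low_size, cma_size;
	bool high;

	if (parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
			      &crash_size, &crash_base, &low_size, &high,
			      &cma_size))	/* cma_size: new in patch 1 */
		return;

	/* ... make the usual crashk_res reservation from crash_size ... */

	if (cma_size)
		reserve_crashkernel_cma(cma_size);
}

Per the list above, an arch must additionally include the CMA-reserved
ranges in the crash kernel's physical memory map and exclude them from
the vmcoreinfo PT_LOAD ranges.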
// SPDX-License-Identifier: GPL-2.0
/*
 * machine_kexec.c - handle transition of Linux booting another kernel
 * Copyright (C) 2002-2003 Eric Biederman  <ebiederm@xmission.com>
 *
 * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
 * LANDISK/sh4 supported by kogiidena
 */
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/suspend.h>
#include <linux/memblock.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
#include <asm/cacheflush.h>
#include <asm/sh_bios.h>
#include <asm/reboot.h>

typedef void (*relocate_new_kernel_t)(unsigned long indirection_page,
				      unsigned long reboot_code_buffer,
				      unsigned long start_address);

extern const unsigned char relocate_new_kernel[];
extern const unsigned int relocate_new_kernel_size;
extern void *vbr_base;

void native_machine_crash_shutdown(struct pt_regs *regs)
{
	/* Nothing to do for UP, but definitely broken for SMP.. */
}

/*
 * Do whatever setup is needed on the image and the reboot code
 * buffer to allow us to avoid allocations later.
 */
int machine_kexec_prepare(struct kimage *image)
{
	return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
}

static void kexec_info(struct kimage *image)
{
	int i;

	printk("kexec information\n");
	for (i = 0; i < image->nr_segments; i++) {
		printk("  segment[%d]: 0x%08x - 0x%08x (0x%08x)\n",
		       i,
		       (unsigned int)image->segment[i].mem,
		       (unsigned int)image->segment[i].mem +
				image->segment[i].memsz,
		       (unsigned int)image->segment[i].memsz);
	}
	printk("  start     : 0x%08x\n\n", (unsigned int)image->start);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
	unsigned long page_list;
	unsigned long reboot_code_buffer;
	relocate_new_kernel_t rnk;
	unsigned long entry;
	unsigned long *ptr;
	int save_ftrace_enabled;

	/*
	 * Nicked from the mips version of machine_kexec():
	 * The generic kexec code builds a page list with physical
	 * addresses.  Use phys_to_virt() to convert them to virtual.
	 */
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE);
	     ptr = (entry & IND_INDIRECTION) ?
	       phys_to_virt(entry & PAGE_MASK) : ptr + 1) {
		if (*ptr & IND_SOURCE || *ptr & IND_INDIRECTION ||
		    *ptr & IND_DESTINATION)
			*ptr = (unsigned long) phys_to_virt(*ptr);
	}

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
		save_processor_state();
#endif

	save_ftrace_enabled = __ftrace_enabled_save();

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();

	page_list = image->head;

	/* we need both effective and real address here */
	reboot_code_buffer =
			(unsigned long)page_address(image->control_code_page);

	/* copy our kernel relocation code to the control code page */
	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
						relocate_new_kernel_size);

	kexec_info(image);
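	/* make the freshly copied relocation code visible to instruction
	 * fetch before jumping to it (comment added editorially) */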
	flush_cache_all();

	sh_bios_vbr_reload();

	/* now call it */
	rnk = (relocate_new_kernel_t) reboot_code_buffer;
	(*rnk)(page_list, reboot_code_buffer,
	       (unsigned long)phys_to_virt(image->start));

#ifdef CONFIG_KEXEC_JUMP
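	/* point VBR back at the kernel's exception vector table after
	 * returning from the jump (comment added editorially) */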
	asm volatile("ldc %0, vbr" : : "r" (&vbr_base) : "memory");

	if (image->preserve_context)
		restore_processor_state();

	/* Convert page list back to physical addresses, what a mess. */
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE);
	     ptr = (*ptr & IND_INDIRECTION) ?
	       phys_to_virt(*ptr & PAGE_MASK) : ptr + 1) {
		if (*ptr & IND_SOURCE || *ptr & IND_INDIRECTION ||
		    *ptr & IND_DESTINATION)
			*ptr = virt_to_phys(*ptr);
	}
#endif

	__ftrace_enabled_restore(save_ftrace_enabled);
}

void __init reserve_crashkernel(void)
{
	unsigned long long crash_size, crash_base;
	int ret;

	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
		return;

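	/* The trailing NULLs skip the optional "low"/"high" sizes and
	 * the new ",cma" size from this series: sh passes cma_size as
	 * NULL and so does not opt in to the CMA reservation (comment
	 * added editorially, per the cover letter above). */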
	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
				&crash_size, &crash_base, NULL, NULL, NULL);
	if (ret == 0 && crash_size > 0) {
		crashk_res.start = crash_base;
		crashk_res.end = crash_base + crash_size - 1;
	}

	if (crashk_res.end == crashk_res.start)
		goto disable;

	crash_size = PAGE_ALIGN(resource_size(&crashk_res));
	if (!crashk_res.start) {
		unsigned long max = memblock_end_of_DRAM() - memory_limit;
		crashk_res.start = memblock_phys_alloc_range(crash_size,
							     PAGE_SIZE, 0, max);
		if (!crashk_res.start) {
			pr_err("crashkernel allocation failed\n");
			goto disable;
		}
	} else {
		ret = memblock_reserve(crashk_res.start, crash_size);
		if (unlikely(ret < 0)) {
			pr_err("crashkernel reservation failed - memory is in use\n");
			goto disable;
		}
	}

	crashk_res.end = crashk_res.start + crash_size - 1;

	/*
	 * Crash kernel trumps memory limit
	 */
	if ((memblock_end_of_DRAM() - memory_limit) <= crashk_res.end) {
		memory_limit = 0;
		pr_info("Disabled memory limit for crashkernel\n");
	}

	pr_info("Reserving %ldMB of memory at 0x%08lx for crashkernel (System RAM: %ldMB)\n",
		(unsigned long)(crash_size >> 20),
		(unsigned long)(crashk_res.start),
		(unsigned long)(memblock_phys_mem_size() >> 20));

	return;

disable:
	crashk_res.start = crashk_res.end = 0;
}