From 50b4233a22b1ee9ccd0e847597de66ce21329ddb Mon Sep 17 00:00:00 2001
From: Julian Vetter <julian@outer-limits.org>
Date: Tue, 3 Jun 2025 15:21:21 +0200
Subject: [PATCH 01/80] include/linux/jhash.h: replace __get_unaligned_cpu32 in
 jhash function

__get_unaligned_cpu32() is deprecated.  So, replace it with the more
generic get_unaligned() and just cast the input parameter.

Link: https://lkml.kernel.org/r/20250603132121.3674066-1-julian@outer-limits.org
Signed-off-by: Julian Vetter <julian@outer-limits.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Wei-Hsin Yeh <weihsinyeh168@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/jhash.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/jhash.h b/include/linux/jhash.h
index fa26a2dd3b52..7c1c1821c694 100644
--- a/include/linux/jhash.h
+++ b/include/linux/jhash.h
@@ -24,7 +24,7 @@
  * Jozsef
  */
 #include <linux/bitops.h>
-#include <linux/unaligned/packed_struct.h>
+#include <linux/unaligned.h>
 
 /* Best hash sizes are of power of two */
 #define jhash_size(n)   ((u32)1<<(n))
@@ -77,9 +77,9 @@ static inline u32 jhash(const void *key, u32 length, u32 initval)
 
 	/* All but the last block: affect some 32 bits of (a,b,c) */
 	while (length > 12) {
-		a += __get_unaligned_cpu32(k);
-		b += __get_unaligned_cpu32(k + 4);
-		c += __get_unaligned_cpu32(k + 8);
+		a += get_unaligned((u32 *)k);
+		b += get_unaligned((u32 *)(k + 4));
+		c += get_unaligned((u32 *)(k + 8));
 		__jhash_mix(a, b, c);
 		length -= 12;
 		k += 12;

From 85df0d505ed64d72c86822733f648074d1ae2bca Mon Sep 17 00:00:00 2001
From: Su Hui <suhui@nfschina.com>
Date: Tue, 27 May 2025 17:23:34 +0800
Subject: [PATCH 02/80] ocfs2: replace simple_strtol with kstrtol

kstrtol() is better because simple_strtol() ignores overflow.  And using
kstrtol() is more concise.

Link: https://lkml.kernel.org/r/20250527092333.1917391-1-suhui@nfschina.com
Signed-off-by: Su Hui <suhui@nfschina.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/stack_user.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 77edcd70f72c..0f045e45fa0c 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -360,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
 					struct ocfs2_control_message_setn *msg)
 {
 	long nodenum;
-	char *ptr = NULL;
 	struct ocfs2_control_private *p = file->private_data;
 
 	if (ocfs2_control_get_handshake_state(file) !=
@@ -375,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
 		return -EINVAL;
 	msg->space = msg->newline = '\0';
 
-	nodenum = simple_strtol(msg->nodestr, &ptr, 16);
-	if (!ptr || *ptr)
+	if (kstrtol(msg->nodestr, 16, &nodenum))
 		return -EINVAL;
 
 	if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
@@ -391,7 +389,6 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
 					   struct ocfs2_control_message_setv *msg)
 {
 	long major, minor;
-	char *ptr = NULL;
 	struct ocfs2_control_private *p = file->private_data;
 	struct ocfs2_protocol_version *max =
 		&ocfs2_user_plugin.sp_max_proto;
@@ -409,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
 		return -EINVAL;
 	msg->space1 = msg->space2 = msg->newline = '\0';
 
-	major = simple_strtol(msg->major, &ptr, 16);
-	if (!ptr || *ptr)
+	if (kstrtol(msg->major, 16, &major))
 		return -EINVAL;
-	minor = simple_strtol(msg->minor, &ptr, 16);
-	if (!ptr || *ptr)
+	if (kstrtol(msg->minor, 16, &minor))
 		return -EINVAL;
 
 	/*
@@ -441,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file,
 				     struct ocfs2_control_message_down *msg)
 {
 	long nodenum;
-	char *p = NULL;
 
 	if (ocfs2_control_get_handshake_state(file) !=
 	    OCFS2_CONTROL_HANDSHAKE_VALID)
@@ -456,8 +450,7 @@ static int ocfs2_control_do_down_msg(struct file *file,
 		return -EINVAL;
 	msg->space1 = msg->space2 = msg->newline = '\0';
 
-	nodenum = simple_strtol(msg->nodestr, &p, 16);
-	if (!p || *p)
+	if (kstrtol(msg->nodestr, 16, &nodenum))
 		return -EINVAL;
 
 	if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||

From 08e2153dd9440ebe6bb82d4dfd7b10bdf14c660c Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Wed, 21 May 2025 14:18:38 +0200
Subject: [PATCH 03/80] alpha: replace sprintf()/strcpy() with
 scnprintf()/strscpy()

Replace sprintf() with the safer variant scnprintf() and use its return
value instead of calculating the string length again using strlen().

Use strscpy() instead of the deprecated strcpy().

No functional changes intended.

Link: https://github.com/KSPP/linux/issues/88
Link: https://lkml.kernel.org/r/20250521121840.5653-1-thorsten.blum@linux.dev
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: guoweikang <guoweikang.kernel@gmail.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/kernel/core_marvel.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c
index b1bfbd11980d..d38f4d6759e4 100644
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -17,6 +17,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mc146818rtc.h>
 #include <linux/rtc.h>
+#include <linux/string.h>
 #include <linux/module.h>
 #include <linux/memblock.h>
 
@@ -79,10 +80,12 @@ mk_resource_name(int pe, int port, char *str)
 {
 	char tmp[80];
 	char *name;
-	
-	sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port);
-	name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES);
-	strcpy(name, tmp);
+	size_t sz;
+
+	sz = scnprintf(tmp, sizeof(tmp), "PCI %s PE %d PORT %d", str, pe, port);
+	sz += 1; /* NUL terminator */
+	name = memblock_alloc_or_panic(sz, SMP_CACHE_BYTES);
+	strscpy(name, tmp, sz);
 
 	return name;
 }

From 449e0b4ed5a16c72289a786c5333fc97520402bf Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 9 May 2025 08:29:27 +0200
Subject: [PATCH 04/80] fork: clean-up naming of vm_stack/vm_struct variables
 in vmap stacks code

There are two data types: "struct vm_struct" and "struct vm_stack" that
have the same local variable names: vm_stack, or vm, or s, which makes the
code confusing to read.

Change the code so the naming is consistent:

struct vm_struct is always called vm_area
struct vm_stack is always called vm_stack

One change altering vfree(vm_stack) to vfree(vm_area->addr) may look like
a semantic change but it is not: vm_area->addr points to the vm_stack.
This was done to improve readability.

[linus.walleij@linaro.org: rebased and added new users of the variable names, address review comments]
Link: https://lore.kernel.org/20240311164638.2015063-4-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20250509-fork-fixes-v3-2-e6c69dd356f2@linaro.org
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 60 +++++++++++++++++++++++++--------------------------
 1 file changed, 29 insertions(+), 31 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..5fd893c907a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -207,14 +207,14 @@ struct vm_stack {
 	struct vm_struct *stack_vm_area;
 };
 
-static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
 {
 	unsigned int i;
 
 	for (i = 0; i < NR_CACHED_STACKS; i++) {
 		struct vm_struct *tmp = NULL;
 
-		if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm))
+		if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
 			return true;
 	}
 	return false;
@@ -223,11 +223,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
 static void thread_stack_free_rcu(struct rcu_head *rh)
 {
 	struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+	struct vm_struct *vm_area = vm_stack->stack_vm_area;
 
 	if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
 		return;
 
-	vfree(vm_stack);
+	vfree(vm_area->addr);
 }
 
 static void thread_stack_delayed_free(struct task_struct *tsk)
@@ -240,32 +241,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk)
 
 static int free_vm_stack_cache(unsigned int cpu)
 {
-	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
+	struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
 	int i;
 
 	for (i = 0; i < NR_CACHED_STACKS; i++) {
-		struct vm_struct *vm_stack = cached_vm_stacks[i];
+		struct vm_struct *vm_area = cached_vm_stack_areas[i];
 
-		if (!vm_stack)
+		if (!vm_area)
 			continue;
 
-		vfree(vm_stack->addr);
-		cached_vm_stacks[i] = NULL;
+		vfree(vm_area->addr);
+		cached_vm_stack_areas[i] = NULL;
 	}
 
 	return 0;
 }
 
-static int memcg_charge_kernel_stack(struct vm_struct *vm)
+static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
 {
 	int i;
 	int ret;
 	int nr_charged = 0;
 
-	BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+	BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
 
 	for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-		ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+		ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
 		if (ret)
 			goto err;
 		nr_charged++;
@@ -273,38 +274,35 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm)
 	return 0;
 err:
 	for (i = 0; i < nr_charged; i++)
-		memcg_kmem_uncharge_page(vm->pages[i], 0);
+		memcg_kmem_uncharge_page(vm_area->pages[i], 0);
 	return ret;
 }
 
 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
-	struct vm_struct *vm;
+	struct vm_struct *vm_area;
 	void *stack;
 	int i;
 
 	for (i = 0; i < NR_CACHED_STACKS; i++) {
-		struct vm_struct *s;
-
-		s = this_cpu_xchg(cached_stacks[i], NULL);
-
-		if (!s)
+		vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+		if (!vm_area)
 			continue;
 
 		/* Reset stack metadata. */
-		kasan_unpoison_range(s->addr, THREAD_SIZE);
+		kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
 
-		stack = kasan_reset_tag(s->addr);
+		stack = kasan_reset_tag(vm_area->addr);
 
 		/* Clear stale pointers from reused stack. */
 		memset(stack, 0, THREAD_SIZE);
 
-		if (memcg_charge_kernel_stack(s)) {
-			vfree(s->addr);
+		if (memcg_charge_kernel_stack(vm_area)) {
+			vfree(vm_area->addr);
 			return -ENOMEM;
 		}
 
-		tsk->stack_vm_area = s;
+		tsk->stack_vm_area = vm_area;
 		tsk->stack = stack;
 		return 0;
 	}
@@ -320,8 +318,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 	if (!stack)
 		return -ENOMEM;
 
-	vm = find_vm_area(stack);
-	if (memcg_charge_kernel_stack(vm)) {
+	vm_area = find_vm_area(stack);
+	if (memcg_charge_kernel_stack(vm_area)) {
 		vfree(stack);
 		return -ENOMEM;
 	}
@@ -330,7 +328,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 	 * free_thread_stack() can be called in interrupt context,
 	 * so cache the vm_struct.
 	 */
-	tsk->stack_vm_area = vm;
+	tsk->stack_vm_area = vm_area;
 	stack = kasan_reset_tag(stack);
 	tsk->stack = stack;
 	return 0;
@@ -437,11 +435,11 @@ static struct kmem_cache *mm_cachep;
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-		struct vm_struct *vm = task_stack_vm_area(tsk);
+		struct vm_struct *vm_area = task_stack_vm_area(tsk);
 		int i;
 
 		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
-			mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
+			mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
 					      account * (PAGE_SIZE / 1024));
 	} else {
 		void *stack = task_stack_page(tsk);
@@ -457,12 +455,12 @@ void exit_task_stack_account(struct task_struct *tsk)
 	account_kernel_stack(tsk, -1);
 
 	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-		struct vm_struct *vm;
+		struct vm_struct *vm_area;
 		int i;
 
-		vm = task_stack_vm_area(tsk);
+		vm_area = task_stack_vm_area(tsk);
 		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
-			memcg_kmem_uncharge_page(vm->pages[i], 0);
+			memcg_kmem_uncharge_page(vm_area->pages[i], 0);
 	}
 }
 

From f7b0ff2bc91d8bb2ba9fdb182da39dd9733b1c50 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 9 May 2025 09:25:09 +0200
Subject: [PATCH 05/80] fork: define a local GFP_VMAP_STACK

The current allocation of VMAP stack memory is using (THREADINFO_GFP &
~__GFP_ACCOUNT) which is a complicated way of saying (GFP_KERNEL |
__GFP_ZERO):

<linux/thread_info.h>:
define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
<linux/gfp_types.h>:
define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)

This is an unfortunate side-effect of independent changes blurring the
picture:

commit 19809c2da28aee5860ad9a2eff760730a0710df0 changed (THREADINFO_GFP |
__GFP_HIGHMEM) to just THREADINFO_GFP since highmem became implicit.

commit 9b6f7e163cd0f468d1b9696b785659d3c27c8667 then added stack caching
and rewrote the allocation to (THREADINFO_GFP & ~__GFP_ACCOUNT) as cached
stacks need to be accounted separately.  However that code, when it
eventually accounts the memory does this:

  ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0)

so the memory is charged as a GFP_KERNEL allocation.

Define a unique GFP_VMAP_STACK to use
GFP_KERNEL | __GFP_ZERO and move the comment there.

Link: https://lkml.kernel.org/r/20250509-gfp-stack-v1-1-82f6f7efc210@linaro.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reported-by: Mateusz Guzik <mjguzik@gmail.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 5fd893c907a5..6616d173307a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -201,6 +201,12 @@ static inline void free_task_struct(struct task_struct *tsk)
  */
 #define NR_CACHED_STACKS 2
 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+/*
+ * Allocated stacks are cached and later reused by new threads, so memcg
+ * accounting is performed by the code assigning/releasing stacks to tasks.
+ * We need a zeroed memory without __GFP_ACCOUNT.
+ */
+#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
 
 struct vm_stack {
 	struct rcu_head rcu;
@@ -307,13 +313,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 		return 0;
 	}
 
-	/*
-	 * Allocated stacks are cached and later reused by new threads,
-	 * so memcg accounting is performed manually on assigning/releasing
-	 * stacks to tasks. Drop __GFP_ACCOUNT.
-	 */
 	stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
-				     THREADINFO_GFP & ~__GFP_ACCOUNT,
+				     GFP_VMAP_STACK,
 				     node, __builtin_return_address(0));
 	if (!stack)
 		return -ENOMEM;

From 0ba5a25ad1c951fa25baa8c30a526b647ab50d47 Mon Sep 17 00:00:00 2001
From: Elijah Wright <git@elijahs.space>
Date: Tue, 10 Jun 2025 15:56:28 -0700
Subject: [PATCH 06/80] kernel: relay: use __GFP_ZERO in relay_alloc_buf

Passing the __GFP_ZERO flag to alloc_page should result in less overhead
th= an using memset()

Link: https://lkml.kernel.org/r/20250610225639.314970-3-git@elijahs.space
Signed-off-by: Elijah Wright <git@elijahs.space>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/relay.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/relay.c b/kernel/relay.c
index c0c93a04d4ce..3ee5b038d0d9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -118,7 +118,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 		return NULL;
 
 	for (i = 0; i < n_pages; i++) {
-		buf->page_array[i] = alloc_page(GFP_KERNEL);
+		buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
 		if (unlikely(!buf->page_array[i]))
 			goto depopulate;
 		set_page_private(buf->page_array[i], (unsigned long)buf);
@@ -127,7 +127,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 	if (!mem)
 		goto depopulate;
 
-	memset(mem, 0, *size);
 	buf->page_count = n_pages;
 	return mem;
 

From ca742a822a32aca68adb8ffa75a7d9c8887c41d1 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 12 Jun 2025 15:39:00 +0100
Subject: [PATCH 07/80] squashfs: pass the inode to
 squashfs_readahead_fragment()

Patch series "squashfs: Remove page->mapping references".

We're close to being able to kill page->mapping.  These two patches get us
a little bit closer.


This patch (of 2):

Eliminate a reference to page->mapping by passing the inode from the
caller.

Link: https://lkml.kernel.org/r/20250612143903.2849289-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20250612143903.2849289-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Phillip Lougher <phillip@squashfs.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/file.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 5ca2baa16dc2..ce7d661d5ad8 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -493,10 +493,9 @@ out:
 	return res;
 }
 
-static int squashfs_readahead_fragment(struct page **page,
+static int squashfs_readahead_fragment(struct inode *inode, struct page **page,
 	unsigned int pages, unsigned int expected, loff_t start)
 {
-	struct inode *inode = page[0]->mapping->host;
 	struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
 		squashfs_i(inode)->fragment_block,
 		squashfs_i(inode)->fragment_size);
@@ -605,8 +604,8 @@ static void squashfs_readahead(struct readahead_control *ractl)
 
 		if (start >> msblk->block_log == file_end &&
 				squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) {
-			res = squashfs_readahead_fragment(pages, nr_pages,
-							  expected, start);
+			res = squashfs_readahead_fragment(inode, pages,
+					nr_pages, expected, start);
 			if (res)
 				goto skip_pages;
 			continue;

From c9e3fb050e9cb0d3a833b2c62b35ea42cdd81e89 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 12 Jun 2025 15:39:01 +0100
Subject: [PATCH 08/80] squashfs: use folios in squashfs_bio_read_cached()

Remove an access to page->mapping and a few calls to the old page-based
APIs.  This doesn't support large folios, but it's still a nice
improvement.

Link: https://lkml.kernel.org/r/20250612143903.2849289-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Phillip Lougher <phillip@squashfs.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/block.c | 45 ++++++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 3061043e915c..296c5a0fcc40 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,23 +80,22 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
 		struct address_space *cache_mapping, u64 index, int length,
 		u64 read_start, u64 read_end, int page_count)
 {
-	struct page *head_to_cache = NULL, *tail_to_cache = NULL;
+	struct folio *head_to_cache = NULL, *tail_to_cache = NULL;
 	struct block_device *bdev = fullbio->bi_bdev;
 	int start_idx = 0, end_idx = 0;
-	struct bvec_iter_all iter_all;
+	struct folio_iter fi;;
 	struct bio *bio = NULL;
-	struct bio_vec *bv;
 	int idx = 0;
 	int err = 0;
 #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
-	struct page **cache_pages = kmalloc_array(page_count,
+	struct folio **cache_folios = kmalloc_array(page_count,
 			sizeof(void *), GFP_KERNEL | __GFP_ZERO);
 #endif
 
-	bio_for_each_segment_all(bv, fullbio, iter_all) {
-		struct page *page = bv->bv_page;
+	bio_for_each_folio_all(fi, fullbio) {
+		struct folio *folio = fi.folio;
 
-		if (page->mapping == cache_mapping) {
+		if (folio->mapping == cache_mapping) {
 			idx++;
 			continue;
 		}
@@ -111,13 +110,13 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
 		 * adjacent blocks.
 		 */
 		if (idx == 0 && index != read_start)
-			head_to_cache = page;
+			head_to_cache = folio;
 		else if (idx == page_count - 1 && index + length != read_end)
-			tail_to_cache = page;
+			tail_to_cache = folio;
 #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
 		/* Cache all pages in the BIO for repeated reads */
-		else if (cache_pages)
-			cache_pages[idx] = page;
+		else if (cache_folios)
+			cache_folios[idx] = folio;
 #endif
 
 		if (!bio || idx != end_idx) {
@@ -150,45 +149,45 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
 		return err;
 
 	if (head_to_cache) {
-		int ret = add_to_page_cache_lru(head_to_cache, cache_mapping,
+		int ret = filemap_add_folio(cache_mapping, head_to_cache,
 						read_start >> PAGE_SHIFT,
 						GFP_NOIO);
 
 		if (!ret) {
-			SetPageUptodate(head_to_cache);
-			unlock_page(head_to_cache);
+			folio_mark_uptodate(head_to_cache);
+			folio_unlock(head_to_cache);
 		}
 
 	}
 
 	if (tail_to_cache) {
-		int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping,
+		int ret = filemap_add_folio(cache_mapping, tail_to_cache,
 						(read_end >> PAGE_SHIFT) - 1,
 						GFP_NOIO);
 
 		if (!ret) {
-			SetPageUptodate(tail_to_cache);
-			unlock_page(tail_to_cache);
+			folio_mark_uptodate(tail_to_cache);
+			folio_unlock(tail_to_cache);
 		}
 	}
 
 #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
-	if (!cache_pages)
+	if (!cache_folios)
 		goto out;
 
 	for (idx = 0; idx < page_count; idx++) {
-		if (!cache_pages[idx])
+		if (!cache_folios[idx])
 			continue;
-		int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping,
+		int ret = filemap_add_folio(cache_mapping, cache_folios[idx],
 						(read_start >> PAGE_SHIFT) + idx,
 						GFP_NOIO);
 
 		if (!ret) {
-			SetPageUptodate(cache_pages[idx]);
-			unlock_page(cache_pages[idx]);
+			folio_mark_uptodate(cache_folios[idx]);
+			folio_unlock(cache_folios[idx]);
 		}
 	}
-	kfree(cache_pages);
+	kfree(cache_folios);
 out:
 #endif
 	return 0;

From 2489e958129ff7cbf26a34ee33cdc9ccbd68fe3c Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:11:57 +0800
Subject: [PATCH 09/80] relayfs: abolish prev_padding

Patch series "relayfs: misc changes", v5.

The series mostly focuses on the error counters which helps every user
debug their own kernel module.


This patch (of 5):

prev_padding represents the unused space of certain subbuffer.  If the
content of a call of relay_write() exceeds the limit of the remainder of
this subbuffer, it will skip storing in the rest space and record the
start point as buf->prev_padding in relay_switch_subbuf().  Since the buf
is a per-cpu big buffer, the point of prev_padding as a global value for
the whole buffer instead of a single subbuffer (whose padding info is
stored in buf->padding[]) seems meaningless from the real use cases, so we
don't bother to record it any more.

Link: https://lkml.kernel.org/r/20250612061201.34272-1-kerneljasonxing@gmail.com
Link: https://lkml.kernel.org/r/20250612061201.34272-2-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c |  3 +--
 drivers/net/wwan/iosm/iosm_ipc_trace.c     |  3 +--
 drivers/net/wwan/t7xx/t7xx_port_trace.c    |  2 +-
 include/linux/relay.h                      |  5 +----
 kernel/relay.c                             | 14 ++++++++------
 kernel/trace/blktrace.c                    |  2 +-
 6 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index e8a04e476c57..09a64f224c49 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -220,8 +220,7 @@ static int guc_action_control_log(struct intel_guc *guc, bool enable,
  */
 static int subbuf_start_callback(struct rchan_buf *buf,
 				 void *subbuf,
-				 void *prev_subbuf,
-				 size_t prev_padding)
+				 void *prev_subbuf)
 {
 	/*
 	 * Use no-overwrite mode by default, where relay will stop accepting
diff --git a/drivers/net/wwan/iosm/iosm_ipc_trace.c b/drivers/net/wwan/iosm/iosm_ipc_trace.c
index eeecfa3d10c5..9656254c1c6c 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_trace.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_trace.c
@@ -51,8 +51,7 @@ static int ipc_trace_remove_buf_file_handler(struct dentry *dentry)
 }
 
 static int ipc_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf,
-					  void *prev_subbuf,
-					  size_t prev_padding)
+					  void *prev_subbuf)
 {
 	if (relay_buf_full(buf)) {
 		pr_err_ratelimited("Relay_buf full dropping traces");
diff --git a/drivers/net/wwan/t7xx/t7xx_port_trace.c b/drivers/net/wwan/t7xx/t7xx_port_trace.c
index 4ed8b4e29bf1..f16d3b01302c 100644
--- a/drivers/net/wwan/t7xx/t7xx_port_trace.c
+++ b/drivers/net/wwan/t7xx/t7xx_port_trace.c
@@ -33,7 +33,7 @@ static int t7xx_trace_remove_buf_file_handler(struct dentry *dentry)
 }
 
 static int t7xx_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf,
-					   void *prev_subbuf, size_t prev_padding)
+					   void *prev_subbuf)
 {
 	if (relay_buf_full(buf)) {
 		pr_err_ratelimited("Relay_buf full dropping traces");
diff --git a/include/linux/relay.h b/include/linux/relay.h
index b3224111d074..e10a0fdf4325 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -47,7 +47,6 @@ struct rchan_buf
 	unsigned int page_count;	/* number of current buffer pages */
 	unsigned int finalized;		/* buffer has been finalized */
 	size_t *padding;		/* padding counts per sub-buffer */
-	size_t prev_padding;		/* temporary variable */
 	size_t bytes_consumed;		/* bytes consumed in cur read subbuf */
 	size_t early_bytes;		/* bytes consumed before VFS inited */
 	unsigned int cpu;		/* this buf's cpu */
@@ -84,7 +83,6 @@ struct rchan_callbacks
 	 * @buf: the channel buffer containing the new sub-buffer
 	 * @subbuf: the start of the new sub-buffer
 	 * @prev_subbuf: the start of the previous sub-buffer
-	 * @prev_padding: unused space at the end of previous sub-buffer
 	 *
 	 * The client should return 1 to continue logging, 0 to stop
 	 * logging.
@@ -100,8 +98,7 @@ struct rchan_callbacks
 	 */
 	int (*subbuf_start) (struct rchan_buf *buf,
 			     void *subbuf,
-			     void *prev_subbuf,
-			     size_t prev_padding);
+			     void *prev_subbuf);
 
 	/*
 	 * create_buf_file - create file to represent a relay channel buffer
diff --git a/kernel/relay.c b/kernel/relay.c
index 3ee5b038d0d9..fc6ad76b789d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -249,13 +249,13 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
  */
 
 static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
-			      void *prev_subbuf, size_t prev_padding)
+			      void *prev_subbuf)
 {
 	if (!buf->chan->cb->subbuf_start)
 		return !relay_buf_full(buf);
 
 	return buf->chan->cb->subbuf_start(buf, subbuf,
-					   prev_subbuf, prev_padding);
+					   prev_subbuf);
 }
 
 /**
@@ -301,7 +301,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
 	for (i = 0; i < buf->chan->n_subbufs; i++)
 		buf->padding[i] = 0;
 
-	relay_subbuf_start(buf, buf->data, NULL, 0);
+	relay_subbuf_start(buf, buf->data, NULL);
 }
 
 /**
@@ -554,9 +554,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 		goto toobig;
 
 	if (buf->offset != buf->chan->subbuf_size + 1) {
-		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+		size_t prev_padding;
+
+		prev_padding = buf->chan->subbuf_size - buf->offset;
 		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
-		buf->padding[old_subbuf] = buf->prev_padding;
+		buf->padding[old_subbuf] = prev_padding;
 		buf->subbufs_produced++;
 		if (buf->dentry)
 			d_inode(buf->dentry)->i_size +=
@@ -581,7 +583,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
 	new = buf->start + new_subbuf * buf->chan->subbuf_size;
 	buf->offset = 0;
-	if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) {
+	if (!relay_subbuf_start(buf, new, old)) {
 		buf->offset = buf->chan->subbuf_size + 1;
 		return 0;
 	}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3f6a7bdc6edf..d3083c88474e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -461,7 +461,7 @@ static const struct file_operations blk_msg_fops = {
  * the user space app in telling how many lost events there were.
  */
 static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
-				     void *prev_subbuf, size_t prev_padding)
+				     void *prev_subbuf)
 {
 	struct blk_trace *bt;
 

From ca01a90ae7bf9bb22137e719366bdc0f387675c2 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:11:58 +0800
Subject: [PATCH 10/80] relayfs: support a counter tracking if per-cpu buffers
 is full

When using relay mechanism, we often encounter the case where new data are
lost or old unconsumed data are overwritten because of slow reader.

Add 'full' field in per-cpu buffer structure to detect if the above case
is happening.  Relay has two modes: 1) non-overwrite mode, 2) overwrite
mode.  So buffer being full here respectively means: 1) relayfs doesn't
intend to accept new data and then simply drop them, or 2) relayfs is
going to start over again and overwrite old unread data with new data.

Note: this counter doesn't need any explicit lock to protect from being
modified by different threads for the better performance consideration.
Writers calling __relay_write/relay_write should consider how to use the
lock and ensure it performs under the lock protection, thus it's not
necessary to add a new small lock here.

Link: https://lkml.kernel.org/r/20250612061201.34272-3-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/relay.h | 9 +++++++++
 kernel/relay.c        | 8 +++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/linux/relay.h b/include/linux/relay.h
index e10a0fdf4325..cd77eb285a48 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -28,6 +28,14 @@
  */
 #define RELAYFS_CHANNEL_VERSION		7
 
+/*
+ * Relay buffer statistics
+ */
+struct rchan_buf_stats
+{
+	unsigned int full_count;	/* counter for buffer full */
+};
+
 /*
  * Per-cpu relay channel buffer
  */
@@ -43,6 +51,7 @@ struct rchan_buf
 	struct irq_work wakeup_work;	/* reader wakeup */
 	struct dentry *dentry;		/* channel file dentry */
 	struct kref kref;		/* channel buffer refcount */
+	struct rchan_buf_stats stats;	/* buffer stats */
 	struct page **page_array;	/* array of current buffer pages */
 	unsigned int page_count;	/* number of current buffer pages */
 	unsigned int finalized;		/* buffer has been finalized */
diff --git a/kernel/relay.c b/kernel/relay.c
index fc6ad76b789d..4b07efddc2cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -251,8 +251,13 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
 static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
 			      void *prev_subbuf)
 {
+	int full = relay_buf_full(buf);
+
+	if (full)
+		buf->stats.full_count++;
+
 	if (!buf->chan->cb->subbuf_start)
-		return !relay_buf_full(buf);
+		return !full;
 
 	return buf->chan->cb->subbuf_start(buf, subbuf,
 					   prev_subbuf);
@@ -297,6 +302,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
 	buf->finalized = 0;
 	buf->data = buf->start;
 	buf->offset = 0;
+	buf->stats.full_count = 0;
 
 	for (i = 0; i < buf->chan->n_subbufs; i++)
 		buf->padding[i] = 0;

From a53202ce7fbafd24f854865b02eff891e246c550 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:11:59 +0800
Subject: [PATCH 11/80] relayfs: introduce getting relayfs statistics function

In this version, only support getting the counter for buffer full and
implement the framework of how it works.

Users can pass certain flag to fetch what field/statistics they expect to
know.  Each time it only returns one result.  So do not pass multiple
flags.

Link: https://lkml.kernel.org/r/20250612061201.34272-4-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/relay.h |  7 +++++++
 kernel/relay.c        | 30 ++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/include/linux/relay.h b/include/linux/relay.h
index cd77eb285a48..5310967f9d74 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -31,6 +31,12 @@
 /*
  * Relay buffer statistics
  */
+enum {
+	RELAY_STATS_BUF_FULL = (1 << 0),
+
+	RELAY_STATS_LAST = RELAY_STATS_BUF_FULL,
+};
+
 struct rchan_buf_stats
 {
 	unsigned int full_count;	/* counter for buffer full */
@@ -167,6 +173,7 @@ struct rchan *relay_open(const char *base_filename,
 			 void *private_data);
 extern void relay_close(struct rchan *chan);
 extern void relay_flush(struct rchan *chan);
+size_t relay_stats(struct rchan *chan, int flags);
 extern void relay_subbufs_consumed(struct rchan *chan,
 				   unsigned int cpu,
 				   size_t consumed);
diff --git a/kernel/relay.c b/kernel/relay.c
index 4b07efddc2cf..2fc27c0e771e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -700,6 +700,36 @@ void relay_flush(struct rchan *chan)
 }
 EXPORT_SYMBOL_GPL(relay_flush);
 
+/**
+ *	relay_stats - get channel buffer statistics
+ *	@chan: the channel
+ *	@flags: select particular information to get
+ *
+ *	Returns the count of certain field that caller specifies.
+ */
+size_t relay_stats(struct rchan *chan, int flags)
+{
+	unsigned int i, count = 0;
+	struct rchan_buf *rbuf;
+
+	if (!chan || flags > RELAY_STATS_LAST)
+		return 0;
+
+	if (chan->is_global) {
+		rbuf = *per_cpu_ptr(chan->buf, 0);
+		if (flags & RELAY_STATS_BUF_FULL)
+			count = rbuf->stats.full_count;
+	} else {
+		for_each_online_cpu(i) {
+			rbuf = *per_cpu_ptr(chan->buf, i);
+			if (rbuf && flags & RELAY_STATS_BUF_FULL)
+				count += rbuf->stats.full_count;
+		}
+	}
+
+	return count;
+}
+
 /**
  *	relay_file_open - open file op for relay files
  *	@inode: the inode

From 7f2173894f7bfe63bcb241f419b15ed5ce79f0d1 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:12:00 +0800
Subject: [PATCH 12/80] blktrace: use rbuf->stats.full as a drop indicator in
 relayfs

Replace internal subbuf_start in blktrace with the default policy in
relayfs.

Remove dropped field from struct blktrace.  Correspondingly, call the
common helper in relay.  By incrementing full_count to keep track of how
many times we encountered a full buffer issue, user space will know how
many events were lost.

Link: https://lkml.kernel.org/r/20250612061201.34272-5-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/trace/blktrace.c | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d3083c88474e..5401b9006135 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -415,9 +415,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
 				size_t count, loff_t *ppos)
 {
 	struct blk_trace *bt = filp->private_data;
+	size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL);
 	char buf[16];
 
-	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+	snprintf(buf, sizeof(buf), "%zu\n", dropped);
 
 	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
 }
@@ -456,23 +457,6 @@ static const struct file_operations blk_msg_fops = {
 	.llseek =	noop_llseek,
 };
 
-/*
- * Keep track of how many times we encountered a full subbuffer, to aid
- * the user space app in telling how many lost events there were.
- */
-static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
-				     void *prev_subbuf)
-{
-	struct blk_trace *bt;
-
-	if (!relay_buf_full(buf))
-		return 1;
-
-	bt = buf->chan->private_data;
-	atomic_inc(&bt->dropped);
-	return 0;
-}
-
 static int blk_remove_buf_file_callback(struct dentry *dentry)
 {
 	debugfs_remove(dentry);
@@ -491,7 +475,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename,
 }
 
 static const struct rchan_callbacks blk_relay_callbacks = {
-	.subbuf_start		= blk_subbuf_start_callback,
 	.create_buf_file	= blk_create_buf_file_callback,
 	.remove_buf_file	= blk_remove_buf_file_callback,
 };
@@ -580,7 +563,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 
 	bt->dev = dev;
-	atomic_set(&bt->dropped, 0);
 	INIT_LIST_HEAD(&bt->running_list);
 
 	ret = -EIO;

From 19f3cb64a25b80db667a00182785577fae465b3e Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:12:01 +0800
Subject: [PATCH 13/80] relayfs: support a counter tracking if data is too big
 to write

It really doesn't matter if the user/admin knows what the last too big
value is.  Record how many times this case is triggered would be helpful.

Solve the existing issue where relay_reset() doesn't restore the value.

Store the counter in the per-cpu buffer structure instead of the global
buffer structure.  It also solves the racy condition which is likely to
happen when a few of per-cpu buffers encounter the too big data case and
then access the global field last_toobig without lock protection.

Remove the printk in relay_close() since kernel module can directly call
relay_stats() as they want.

Link: https://lkml.kernel.org/r/20250612061201.34272-6-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/relay.h |  5 +++--
 kernel/relay.c        | 18 ++++++++++--------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/include/linux/relay.h b/include/linux/relay.h
index 5310967f9d74..6772a7075840 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -33,13 +33,15 @@
  */
 enum {
 	RELAY_STATS_BUF_FULL = (1 << 0),
+	RELAY_STATS_WRT_BIG = (1 << 1),
 
-	RELAY_STATS_LAST = RELAY_STATS_BUF_FULL,
+	RELAY_STATS_LAST = RELAY_STATS_WRT_BIG,
 };
 
 struct rchan_buf_stats
 {
 	unsigned int full_count;	/* counter for buffer full */
+	unsigned int big_count;		/* counter for too big to write */
 };
 
 /*
@@ -79,7 +81,6 @@ struct rchan
 	const struct rchan_callbacks *cb; /* client callbacks */
 	struct kref kref;		/* channel refcount */
 	void *private_data;		/* for user-defined data */
-	size_t last_toobig;		/* tried to log event > subbuf size */
 	struct rchan_buf * __percpu *buf; /* per-cpu channel buffers */
 	int is_global;			/* One global buffer ? */
 	struct list_head list;		/* for channel list */
diff --git a/kernel/relay.c b/kernel/relay.c
index 2fc27c0e771e..8d915fe98198 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -303,6 +303,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
 	buf->data = buf->start;
 	buf->offset = 0;
 	buf->stats.full_count = 0;
+	buf->stats.big_count = 0;
 
 	for (i = 0; i < buf->chan->n_subbufs; i++)
 		buf->padding[i] = 0;
@@ -602,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 	return length;
 
 toobig:
-	buf->chan->last_toobig = length;
+	buf->stats.big_count++;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(relay_switch_subbuf);
@@ -662,11 +663,6 @@ void relay_close(struct rchan *chan)
 			if ((buf = *per_cpu_ptr(chan->buf, i)))
 				relay_close_buf(buf);
 
-	if (chan->last_toobig)
-		printk(KERN_WARNING "relay: one or more items not logged "
-		       "[item size (%zd) > sub-buffer size (%zd)]\n",
-		       chan->last_toobig, chan->subbuf_size);
-
 	list_del(&chan->list);
 	kref_put(&chan->kref, relay_destroy_channel);
 	mutex_unlock(&relay_channels_mutex);
@@ -719,11 +715,17 @@ size_t relay_stats(struct rchan *chan, int flags)
 		rbuf = *per_cpu_ptr(chan->buf, 0);
 		if (flags & RELAY_STATS_BUF_FULL)
 			count = rbuf->stats.full_count;
+		else if (flags & RELAY_STATS_WRT_BIG)
+			count = rbuf->stats.big_count;
 	} else {
 		for_each_online_cpu(i) {
 			rbuf = *per_cpu_ptr(chan->buf, i);
-			if (rbuf && flags & RELAY_STATS_BUF_FULL)
-				count += rbuf->stats.full_count;
+			if (rbuf) {
+				if (flags & RELAY_STATS_BUF_FULL)
+					count += rbuf->stats.full_count;
+				else if (flags & RELAY_STATS_WRT_BIG)
+					count += rbuf->stats.big_count;
+			}
 		}
 	}
 

From ad2c8079e9d5637f6d66cb5ce5cf49768ae87658 Mon Sep 17 00:00:00 2001
From: Wei Nanxin <n9winx@163.com>
Date: Sun, 15 Jun 2025 20:32:37 +0800
Subject: [PATCH 14/80] kcov: fix typo in comment of kcov_fault_in_area

change '__santizer_cov_trace_pc()' to '__sanitizer_cov_trace_pc()'

Link: https://lkml.kernel.org/r/20250615123237.110144-1-n9winx@163.com
Signed-off-by: Wei Nanxin <n9winx@163.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Macro Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kcov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 187ba1b80bda..1d85597057e1 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg)
 
 /*
  * Fault in a lazily-faulted vmalloc area before it can be used by
- * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the
+ * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the
  * vmalloc fault handling path is instrumented.
  */
 static void kcov_fault_in_area(struct kcov *kcov)

From d71b90e5ba83b32b4e3980f8c07ba2012ad9378a Mon Sep 17 00:00:00 2001
From: Fushuai Wang <wangfushuai@baidu.com>
Date: Sun, 15 Jun 2025 11:09:30 +0800
Subject: [PATCH 15/80] exit: fix misleading comment in
 forget_original_parent()

The commit 482a3767e508 ("exit: reparent: call forget_original_parent()
under tasklist_lock") moved the comment from exit_notify() to
forget_original_parent().  However, the forget_original_parent() only
handles (A), while (B) is handled in kill_orphaned_pgrp().  So remove the
unrelated part.

Link: https://lkml.kernel.org/r/20250615030930.58051-1-wangfushuai@baidu.com
Signed-off-by: Fushuai Wang <wangfushuai@baidu.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: wangfushuai <wangfushuai@baidu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/exit.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index bb184a67ac73..f03caf17b214 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -692,12 +692,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 }
 
 /*
- * This does two things:
- *
- * A.  Make init inherit all the child processes
- * B.  Check to see if any process groups have become orphaned
- *	as a result of our exiting, and if they have any stopped
- *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
+ * Make init inherit all the child processes
  */
 static void forget_original_parent(struct task_struct *father,
 					struct list_head *dead)

From e795000e755c309d1f9bd2a0590eca38b4625f3a Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <npitre@baylibre.com>
Date: Mon, 16 Jun 2025 15:22:44 -0400
Subject: [PATCH 16/80] mul_u64_u64_div_u64: fix the division-by-zero behavior
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current implementation forces a compile-time 1/0 division, which
generates an undefined instruction (ud2 on x86) rather than a proper
runtime division-by-zero exception.

Change to trigger an actual div-by-0 exception at runtime, consistent with
other division operations.  Use a non-1 dividend to prevent the compiler
from optimizing the division into a comparison.

Link: https://lkml.kernel.org/r/q246p466-1453-qon9-29so-37105116009q@onlyvoer.pbz
Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Cc: David Laight <david.laight.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/math/div64.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/math/div64.c b/lib/math/div64.c
index 5faa29208bdb..bf77b9843175 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -212,12 +212,13 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
 
 #endif
 
-	/* make sure c is not zero, trigger exception otherwise */
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdiv-by-zero"
-	if (unlikely(c == 0))
-		return 1/0;
-#pragma GCC diagnostic pop
+	/* make sure c is not zero, trigger runtime exception otherwise */
+	if (unlikely(c == 0)) {
+		unsigned long zero = 0;
+
+		OPTIMIZER_HIDE_VAR(zero);
+		return ~0UL/zero;
+	}
 
 	int shift = __builtin_ctzll(c);
 

From 5eee4c2b2aebfd3c8f11d9722e49d838da4e4150 Mon Sep 17 00:00:00 2001
From: Antonio Borneo <antonio.borneo@foss.st.com>
Date: Mon, 16 Jun 2025 09:59:13 +0200
Subject: [PATCH 17/80] checkpatch: use utf-8 match for spell checking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current code that checks for misspelling verifies, in a more
complex regex, if $rawline matches [^\w]($misspellings)[^\w]

Being $rawline a byte-string, a utf-8 character in $rawline can
match the non-word-char [^\w].
E.g.:
	./scripts/checkpatch.pl --git 81c2f059ab9
	WARNING: 'ment' may be misspelled - perhaps 'meant'?
	#36: FILE: MAINTAINERS:14360:
	+M:     Clément Léger <clement.leger@bootlin.com>
	            ^^^^

Use a utf-8 version of $rawline for spell checking.

Link: https://lkml.kernel.org/r/20250616-b4-checkpatch-upstream-v2-1-5600ce4a3b43@foss.st.com
Signed-off-by: Antonio Borneo <antonio.borneo@foss.st.com>
Signed-off-by: Clément Le Goffic <clement.legoffic@foss.st.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/checkpatch.pl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 664f7b7a622c..489b74d52abe 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3502,9 +3502,10 @@ sub process {
 # Check for various typo / spelling mistakes
 		if (defined($misspellings) &&
 		    ($in_commit_log || $line =~ /^(?:\+|Subject:)/i)) {
-			while ($rawline =~ /(?:^|[^\w\-'`])($misspellings)(?:[^\w\-'`]|$)/gi) {
+			my $rawline_utf8 = decode("utf8", $rawline);
+			while ($rawline_utf8 =~ /(?:^|[^\w\-'`])($misspellings)(?:[^\w\-'`]|$)/gi) {
 				my $typo = $1;
-				my $blank = copy_spacing($rawline);
+				my $blank = copy_spacing($rawline_utf8);
 				my $ptr = substr($blank, 0, $-[1]) . "^" x length($typo);
 				my $hereptr = "$hereline$ptr\n";
 				my $typo_fix = $spelling_fix{lc($typo)};

From aa644c405291a419e92b112e2279c01c410e9a26 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <olsajiri@gmail.com>
Date: Wed, 14 May 2025 12:18:09 +0200
Subject: [PATCH 18/80] uprobes: revert ref_ctr_offset in uprobe_unregister
 error path

There's error path that could lead to inactive uprobe:

  1) uprobe_register succeeds - updates instruction to int3 and
     changes ref_ctr from 0 to 1
  2) uprobe_unregister fails  - int3 stays in place, but ref_ctr
     is changed to 0 (it's not restored to 1 in the fail path)
     uprobe is leaked
  3) another uprobe_register comes and re-uses the leaked uprobe
     and succeds - but int3 is already in place, so ref_ctr update
     is skipped and it stays 0 - uprobe CAN NOT be triggered now
  4) uprobe_unregister fails because ref_ctr value is unexpected

Fix this by reverting the updated ref_ctr value back to 1 in step 2),
which is the case when uprobe_unregister fails (int3 stays in place), but
we have already updated refctr.

The new scenario will go as follows:

  1) uprobe_register succeeds - updates instruction to int3 and
     changes ref_ctr from 0 to 1
  2) uprobe_unregister fails  - int3 stays in place and ref_ctr
     is reverted to 1..  uprobe is leaked
  3) another uprobe_register comes and re-uses the leaked uprobe
     and succeds - but int3 is already in place, so ref_ctr update
     is skipped and it stays 1 - uprobe CAN be triggered now
  4) uprobe_unregister succeeds

Link: https://lkml.kernel.org/r/20250514101809.2010193-1-jolsa@kernel.org
Fixes: 1cc33161a83d ("uprobes: Support SDT markers having reference count (semaphore)")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/events/uprobes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4c965ba77f9f..84ee7b590861 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -581,8 +581,8 @@ retry:
 
 out:
 	/* Revert back reference counter if instruction update failed. */
-	if (ret < 0 && is_register && ref_ctr_updated)
-		update_ref_ctr(uprobe, mm, -1);
+	if (ret < 0 && ref_ctr_updated)
+		update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
 
 	/* try collapse pmd for compound page */
 	if (ret > 0)

From 2ae826799932ff89409f56636ad3c25578fe7cf5 Mon Sep 17 00:00:00 2001
From: Lizhi Xu <lizhi.xu@windriver.com>
Date: Mon, 16 Jun 2025 09:31:40 +0800
Subject: [PATCH 19/80] ocfs2: reset folio to NULL when get folio fails

The reproducer uses FAULT_INJECTION to make memory allocation fail, which
causes __filemap_get_folio() to fail, when initializing w_folios[i] in
ocfs2_grab_folios_for_write(), it only returns an error code and the value
of w_folios[i] is the error code, which causes
ocfs2_unlock_and_free_folios() to recycle the invalid w_folios[i] when
releasing folios.

Link: https://lkml.kernel.org/r/20250616013140.3602219-1-lizhi.xu@windriver.com
Reported-by: syzbot+c2ea94ae47cd7e3881ec@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c2ea94ae47cd7e3881ec
Signed-off-by: Lizhi Xu <lizhi.xu@windriver.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/aops.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 40b6bce12951..89aadc6cdd87 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1071,6 +1071,7 @@ static int ocfs2_grab_folios_for_write(struct address_space *mapping,
 			if (IS_ERR(wc->w_folios[i])) {
 				ret = PTR_ERR(wc->w_folios[i]);
 				mlog_errno(ret);
+				wc->w_folios[i] = NULL;
 				goto out;
 			}
 		}

From 816a8800326833becc93f607eb706fc542c28d75 Mon Sep 17 00:00:00 2001
From: Long Li <leo.lilong@huawei.com>
Date: Tue, 17 Jun 2025 09:25:34 +0800
Subject: [PATCH 20/80] ocfs2: remove redundant NULL check in rename path

The code checks newfe_bh for NULL after it has already been dereferenced
to access b_data.  This NULL check is unnecessary for two reasons:

1. If ocfs2_inode_lock() succeeds (returns >= 0), newfe_bh is guaranteed
   to be valid.
2. We've already dereferenced newfe_bh to access b_data, so it must be
   non-NULL at this point.

Remove the redundant NULL check in the trace_ocfs2_rename_over_existing()
call to improve code clarity.

Link: https://lkml.kernel.org/r/20250617012534.3458669-1-leo.lilong@huawei.com
Signed-off-by: Long Li <leo.lilong@huawei.com>
Reviewed-by: Su Yue <glass.su@suse.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 99278c8f0e24..26bc59c3a813 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1452,8 +1452,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
 		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 
 		trace_ocfs2_rename_over_existing(
-		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
-		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+		     (unsigned long long)newfe_blkno, newfe_bh,
+		     (unsigned long long)newfe_bh->b_blocknr);
 
 		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
 			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,

From 64960497ea86a5d09176c296c3616aa7c8668624 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Wed, 18 Jun 2025 15:34:33 +0200
Subject: [PATCH 21/80] fork: clean up ifdef logic around stack allocation

There is an unneeded OR in the ifdef functions that are used to allocate
and free kernel stacks based on direct map or vmap.  Adding dynamic stack
support would complicate this logic even further.

Therefore, clean up by changing the order so OR is no longer needed.

Link: https://lkml.kernel.org/r/20250618-fork-fixes-v4-1-2e05a2e1f5fc@linaro.org
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Link: https://lore.kernel.org/20240311164638.2015063-3-pasha.tatashin@soleen.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 6616d173307a..bd8c21d64746 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -188,13 +188,7 @@ static inline void free_task_struct(struct task_struct *tsk)
 	kmem_cache_free(task_struct_cachep, tsk);
 }
 
-/*
- * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
- * kmemcache based allocator.
- */
-# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-
-#  ifdef CONFIG_VMAP_STACK
+#ifdef CONFIG_VMAP_STACK
 /*
  * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
  * flush.  Try to minimize the number of calls by caching stacks.
@@ -344,7 +338,13 @@ static void free_thread_stack(struct task_struct *tsk)
 	tsk->stack_vm_area = NULL;
 }
 
-#  else /* !CONFIG_VMAP_STACK */
+#else /* !CONFIG_VMAP_STACK */
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+#if THREAD_SIZE >= PAGE_SIZE
 
 static void thread_stack_free_rcu(struct rcu_head *rh)
 {
@@ -376,8 +376,7 @@ static void free_thread_stack(struct task_struct *tsk)
 	tsk->stack = NULL;
 }
 
-#  endif /* CONFIG_VMAP_STACK */
-# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
+#else /* !(THREAD_SIZE >= PAGE_SIZE) */
 
 static struct kmem_cache *thread_stack_cache;
 
@@ -416,7 +415,8 @@ void thread_stack_cache_init(void)
 	BUG_ON(thread_stack_cache == NULL);
 }
 
-# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#endif /* THREAD_SIZE >= PAGE_SIZE */
+#endif /* CONFIG_VMAP_STACK */
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;

From 41a7f737685eed2700654720d3faaffdf0132135 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 18 Jun 2025 15:46:02 +0200
Subject: [PATCH 22/80] scripts: gdb: move MNT_* constants to gdb-parsed

Since these are now no longer defines, but in an enum.

Link: https://lkml.kernel.org/r/20250618134629.25700-2-johannes@sipsolutions.net
Fixes: 101f2bbab541 ("fs: convert mount flags to enum")
Reviewed-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/gdb/linux/constants.py.in | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
index fd6bd69c5096..d5e3069f42a7 100644
--- a/scripts/gdb/linux/constants.py.in
+++ b/scripts/gdb/linux/constants.py.in
@@ -73,12 +73,12 @@ if IS_BUILTIN(CONFIG_MODULES):
     LX_GDBPARSED(MOD_RO_AFTER_INIT)
 
 /* linux/mount.h */
-LX_VALUE(MNT_NOSUID)
-LX_VALUE(MNT_NODEV)
-LX_VALUE(MNT_NOEXEC)
-LX_VALUE(MNT_NOATIME)
-LX_VALUE(MNT_NODIRATIME)
-LX_VALUE(MNT_RELATIME)
+LX_GDBPARSED(MNT_NOSUID)
+LX_GDBPARSED(MNT_NODEV)
+LX_GDBPARSED(MNT_NOEXEC)
+LX_GDBPARSED(MNT_NOATIME)
+LX_GDBPARSED(MNT_NODIRATIME)
+LX_GDBPARSED(MNT_RELATIME)
 
 /* linux/threads.h */
 LX_VALUE(NR_CPUS)

From 1857fcc847443b0238cb64584b43d8c3a9049a0a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 17 Mar 2025 17:02:28 +0800
Subject: [PATCH 23/80] lib/raid6: replace custom zero page with ZERO_PAGE

Use the system-wide zero page instead of a custom zero page.

[herbert@gondor.apana.org.au: update lib/raid6/recov_rvv.c, per Klara]
  Link: https://lkml.kernel.org/r/aFkUnXWtxcgOTVkw@gondor.apana.org.au
Link: https://lkml.kernel.org/r/Z9flJNkWQICx0PXk@gondor.apana.org.au
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Song Liu <song@kernel.org>
Cc: Yu Kuai <yukuai3@huawei.com>
Cc: Klara Modin <klarasmodin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 crypto/async_tx/async_pq.c          |  2 +-
 crypto/async_tx/async_raid6_recov.c |  4 ++--
 include/linux/raid/pq.h             | 12 +++++++++++-
 lib/raid6/algos.c                   |  3 ---
 lib/raid6/recov.c                   |  6 +++---
 lib/raid6/recov_avx2.c              |  6 +++---
 lib/raid6/recov_avx512.c            |  6 +++---
 lib/raid6/recov_loongarch_simd.c    | 12 ++++++------
 lib/raid6/recov_neon.c              |  6 +++---
 lib/raid6/recov_s390xc.c            |  6 +++---
 lib/raid6/recov_ssse3.c             |  6 +++---
 11 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index 5e2b2680d7db..9e4bb7fbde25 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -119,7 +119,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int *offsets, int disks,
 	for (i = 0; i < disks; i++) {
 		if (blocks[i] == NULL) {
 			BUG_ON(i > disks - 3); /* P or Q can't be zero */
-			srcs[i] = (void*)raid6_empty_zero_page;
+			srcs[i] = raid6_get_zero_page();
 		} else {
 			srcs[i] = page_address(blocks[i]) + offsets[i];
 
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
index 354b8cd5537f..539ea5b378dc 100644
--- a/crypto/async_tx/async_raid6_recov.c
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -414,7 +414,7 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 		async_tx_quiesce(&submit->depend_tx);
 		for (i = 0; i < disks; i++)
 			if (blocks[i] == NULL)
-				ptrs[i] = (void *) raid6_empty_zero_page;
+				ptrs[i] = raid6_get_zero_page();
 			else
 				ptrs[i] = page_address(blocks[i]) + offs[i];
 
@@ -497,7 +497,7 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila,
 		async_tx_quiesce(&submit->depend_tx);
 		for (i = 0; i < disks; i++)
 			if (blocks[i] == NULL)
-				ptrs[i] = (void*)raid6_empty_zero_page;
+				ptrs[i] = raid6_get_zero_page();
 			else
 				ptrs[i] = page_address(blocks[i]) + offs[i];
 
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 72ff44cca864..2467b3be15c9 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -11,8 +11,13 @@
 #ifdef __KERNEL__
 
 #include <linux/blkdev.h>
+#include <linux/mm.h>
 
-extern const char raid6_empty_zero_page[PAGE_SIZE];
+/* This should be const but the raid6 code is too convoluted for that. */
+static inline void *raid6_get_zero_page(void)
+{
+	return page_address(ZERO_PAGE(0));
+}
 
 #else /* ! __KERNEL__ */
 /* Used for testing in user space */
@@ -191,6 +196,11 @@ static inline uint32_t raid6_jiffies(void)
 	return tv.tv_sec*1000 + tv.tv_usec/1000;
 }
 
+static inline void *raid6_get_zero_page(void)
+{
+	return raid6_empty_zero_page;
+}
+
 #endif /* ! __KERNEL__ */
 
 #endif /* LINUX_RAID_RAID6_H */
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 75ce3e134b7c..799e0e5eac26 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -18,9 +18,6 @@
 #else
 #include <linux/module.h>
 #include <linux/gfp.h>
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-EXPORT_SYMBOL(raid6_empty_zero_page);
 #endif
 
 struct raid6_calls raid6_call;
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index a7c1b2bbe40d..b5e47c008b41 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -31,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila,
 	   Use the dead data pages as temporary storage for
 	   delta p and delta q */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -72,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila,
 	/* Compute syndrome with zero for the missing data page
 	   Use the dead data page as temporary storage for delta q */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
index 4e8095403ee2..97d598d2535c 100644
--- a/lib/raid6/recov_avx2.c
+++ b/lib/raid6/recov_avx2.c
@@ -28,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
 	   Use the dead data pages as temporary storage for
 	   delta p and delta q */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -196,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
 	/* Compute syndrome with zero for the missing data page
 	   Use the dead data page as temporary storage for delta q */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c
index 310c715db313..7986120ca444 100644
--- a/lib/raid6/recov_avx512.c
+++ b/lib/raid6/recov_avx512.c
@@ -37,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
 	 */
 
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -238,7 +238,7 @@ static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
 	 */
 
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c
index 94aeac85e6f7..93dc515997a1 100644
--- a/lib/raid6/recov_loongarch_simd.c
+++ b/lib/raid6/recov_loongarch_simd.c
@@ -42,10 +42,10 @@ static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
 	 * delta p and delta q
 	 */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -197,7 +197,7 @@ static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -316,10 +316,10 @@ static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
 	 * delta p and delta q
 	 */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -436,7 +436,7 @@ static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c
index 1bfc14174d4d..70e1404c1512 100644
--- a/lib/raid6/recov_neon.c
+++ b/lib/raid6/recov_neon.c
@@ -36,10 +36,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
 	 * delta p and delta q
 	 */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -74,7 +74,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c
index 179eec900cea..1d32c01261be 100644
--- a/lib/raid6/recov_s390xc.c
+++ b/lib/raid6/recov_s390xc.c
@@ -35,10 +35,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila,
 	   Use the dead data pages as temporary storage for
 	   delta p and delta q */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -82,7 +82,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila,
 	/* Compute syndrome with zero for the missing data page
 	   Use the dead data page as temporary storage for delta q */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
index 4bfa3c6b60de..2e849185c32b 100644
--- a/lib/raid6/recov_ssse3.c
+++ b/lib/raid6/recov_ssse3.c
@@ -30,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila,
 	   Use the dead data pages as temporary storage for
 	   delta p and delta q */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -203,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila,
 	/* Compute syndrome with zero for the missing data page
 	   Use the dead data page as temporary storage for delta q */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);

From 4d71d99f361f54bd18a94370ea08e562e511c4e9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 Jun 2025 16:13:17 -0700
Subject: [PATCH 24/80] MAINTAINERS: add lib/raid6/ to "SOFTWARE RAID"

Cc: Song Liu <song@kernel.org>
Cc: Yu Kuai <yukuai3@huawei.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index fad6cb025a19..947ec6bf5b95 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23053,6 +23053,7 @@ F:	drivers/md/md*
 F:	drivers/md/raid*
 F:	include/linux/raid/
 F:	include/uapi/linux/raid/
+F:	lib/raid6/
 
 SOLIDRUN CLEARFOG SUPPORT
 M:	Russell King <linux@armlinux.org.uk>

From caf728dfa7789f7096e8cd4341b4bf8f671f5c1d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jun 2025 13:19:04 +0200
Subject: [PATCH 25/80] lib: test_objagg: split test_hints_case() into two
 functions

With sanitizers enabled, this function uses a lot of stack, causing a
harmless warning:

lib/test_objagg.c: In function 'test_hints_case.constprop':
lib/test_objagg.c:994:1: error: the frame size of 1440 bytes is larger than 1408 bytes [-Werror=frame-larger-than=]

Most of this is from the two 'struct world' structures.  Since most of the
work in this function is duplicated for the two, split it up into separate
functions that each use one of them.

The combined stack usage is still the same here, but there is no warning
any more, and the code is still safe because of the known call chain.

Link: https://lkml.kernel.org/r/20250620111907.3395296-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/test_objagg.c | 77 +++++++++++++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/lib/test_objagg.c b/lib/test_objagg.c
index 222b39fc2629..ce5c4c36a084 100644
--- a/lib/test_objagg.c
+++ b/lib/test_objagg.c
@@ -908,50 +908,22 @@ static int check_expect_hints_stats(struct objagg_hints *objagg_hints,
 	return err;
 }
 
-static int test_hints_case(const struct hints_case *hints_case)
+static int test_hints_case2(const struct hints_case *hints_case,
+			    struct objagg_hints *hints, struct objagg *objagg)
 {
 	struct objagg_obj *objagg_obj;
-	struct objagg_hints *hints;
 	struct world world2 = {};
-	struct world world = {};
 	struct objagg *objagg2;
-	struct objagg *objagg;
 	const char *errmsg;
 	int i;
 	int err;
 
-	objagg = objagg_create(&delta_ops, NULL, &world);
-	if (IS_ERR(objagg))
-		return PTR_ERR(objagg);
-
-	for (i = 0; i < hints_case->key_ids_count; i++) {
-		objagg_obj = world_obj_get(&world, objagg,
-					   hints_case->key_ids[i]);
-		if (IS_ERR(objagg_obj)) {
-			err = PTR_ERR(objagg_obj);
-			goto err_world_obj_get;
-		}
-	}
-
-	pr_debug_stats(objagg);
-	err = check_expect_stats(objagg, &hints_case->expect_stats, &errmsg);
-	if (err) {
-		pr_err("Stats: %s\n", errmsg);
-		goto err_check_expect_stats;
-	}
-
-	hints = objagg_hints_get(objagg, OBJAGG_OPT_ALGO_SIMPLE_GREEDY);
-	if (IS_ERR(hints)) {
-		err = PTR_ERR(hints);
-		goto err_hints_get;
-	}
-
 	pr_debug_hints_stats(hints);
 	err = check_expect_hints_stats(hints, &hints_case->expect_stats_hints,
 				       &errmsg);
 	if (err) {
 		pr_err("Hints stats: %s\n", errmsg);
-		goto err_check_expect_hints_stats;
+		return err;
 	}
 
 	objagg2 = objagg_create(&delta_ops, hints, &world2);
@@ -983,7 +955,48 @@ err_world2_obj_get:
 		world_obj_put(&world2, objagg, hints_case->key_ids[i]);
 	i = hints_case->key_ids_count;
 	objagg_destroy(objagg2);
-err_check_expect_hints_stats:
+
+	return err;
+}
+
+static int test_hints_case(const struct hints_case *hints_case)
+{
+	struct objagg_obj *objagg_obj;
+	struct objagg_hints *hints;
+	struct world world = {};
+	struct objagg *objagg;
+	const char *errmsg;
+	int i;
+	int err;
+
+	objagg = objagg_create(&delta_ops, NULL, &world);
+	if (IS_ERR(objagg))
+		return PTR_ERR(objagg);
+
+	for (i = 0; i < hints_case->key_ids_count; i++) {
+		objagg_obj = world_obj_get(&world, objagg,
+					   hints_case->key_ids[i]);
+		if (IS_ERR(objagg_obj)) {
+			err = PTR_ERR(objagg_obj);
+			goto err_world_obj_get;
+		}
+	}
+
+	pr_debug_stats(objagg);
+	err = check_expect_stats(objagg, &hints_case->expect_stats, &errmsg);
+	if (err) {
+		pr_err("Stats: %s\n", errmsg);
+		goto err_check_expect_stats;
+	}
+
+	hints = objagg_hints_get(objagg, OBJAGG_OPT_ALGO_SIMPLE_GREEDY);
+	if (IS_ERR(hints)) {
+		err = PTR_ERR(hints);
+		goto err_hints_get;
+	}
+
+	err = test_hints_case2(hints_case, hints, objagg);
+
 	objagg_hints_put(hints);
 err_hints_get:
 err_check_expect_stats:

From fed307b67c5bbb17b72c54816cd1bce61c23b4d7 Mon Sep 17 00:00:00 2001
From: Jiazi Li <jqqlijiazi@gmail.com>
Date: Fri, 20 Jun 2025 18:07:56 +0800
Subject: [PATCH 26/80] kthread: update comment for __to_kthread

With commit 343f4c49f243 ("kthread: Don't allocate kthread_struct for init
and umh") and commit 753550eb0ce1 ("fork: Explicitly set PF_KTHREAD"), umh
task no longer have struct kthread and PF_KTHREAD flag.

Update the comment to describe what the current rules are to detect
is something is a kthread.

Link: https://lkml.kernel.org/r/20250620100801.23185-1-jqqlijiazi@gmail.com
Signed-off-by: Jiazi Li <jqqlijiazi@gmail.com>
Signed-off-by: mingzhu.wang <mingzhu.wang@transsion.com>
Suggested-by Eric W . Biederman <ebiederm@xmission.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kthread.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 85fc068f0083..0e98b228a8ef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 /*
  * Variant of to_kthread() that doesn't assume @p is a kthread.
  *
- * Per construction; when:
+ * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
+ * always remain a kthread.  For kthreads p->worker_private always
+ * points to a struct kthread.  For tasks that are not kthreads
+ * p->worker_private is used to point to other things.
  *
- *   (p->flags & PF_KTHREAD) && p->worker_private
- *
- * the task is both a kthread and struct kthread is persistent. However
- * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
- * begin_new_exec()).
+ * Return NULL for any task that is not a kthread.
  */
 static inline struct kthread *__to_kthread(struct task_struct *p)
 {

From 896f612273dacfdc7a635315394ccf285c257208 Mon Sep 17 00:00:00 2001
From: Li Chen <chenl311@chinatelecom.cn>
Date: Fri, 20 Jun 2025 10:02:31 +0800
Subject: [PATCH 27/80] fs: fat: Prevent fsfuzzer from dominating the console

fsfuzzer may make many invalid access for FAT-fs and generate many kmsg
like "FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)".

For platforms & os that enables hardware serial device whose speed are
slow, this may cause softlockup easily.

So let's ratelimit the error log.

The log as below:
[11916.242560] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.254485] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.266388] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.278287] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.290180] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.302068] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.313962] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.325848] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.337732] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.349619] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.361505] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.373391] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.385272] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.397144] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.409025] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.420909] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.432791] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.444674] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.456558] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.468446] FAT-fs (loop2): error, invalid access to FAT (entry 0x00000ccb)
[11916.480352] watchdog: BUG: soft lockup - CPU#58 stuck for 26s! [cat:2446035]
[11916.480357] Modules linked in: ...
[11916.480503] CPU: 58 PID: 2446035 Comm: cat Kdump: loaded Tainted: ...
[11916.480508] Hardware name: vclusters VSFT5000 B/VSFT5000 B, BIOS ...
[11916.480510] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[11916.480513] pc : console_emit_next_record+0x1b4/0x288
[11916.480524] lr : console_emit_next_record+0x1ac/0x288
[11916.480525] sp : ffff80009bcdae90
[11916.480527] x29: ffff80009bcdaec0 x28: ffff800082513810 x27: 0000000000000001
[11916.480530] x26: 0000000000000001 x25: ffff800081f66000 x24: 0000000000000000
[11916.480533] x23: 0000000000000000 x22: ffff80009bcdaf8f x21: 0000000000000001
[11916.480535] x20: 0000000000000000 x19: ffff800082513810 x18: ffffffffffffffff
[11916.480538] x17: 0000000000000002 x16: 0000000000000001 x15: ffff80009bcdab30
[11916.480541] x14: 0000000000000000 x13: 205d353330363434 x12: 32545b5d36343438
[11916.480543] x11: 652820544146206f x10: 7420737365636361 x9 : ffff800080159a6c
[11916.480546] x8 : 69202c726f727265 x7 : 545b5d3634343836 x6 : 342e36313931315b
[11916.480549] x5 : ffff800082513a01 x4 : ffff80009bcdad31 x3 : 0000000000000000
[11916.480551] x2 : 00000000ffffffff x1 : 0000000001b9b000 x0 : ffff8000836cef00
[11916.480554] Call trace:
[11916.480557]  console_emit_next_record+0x1b4/0x288
[11916.480560]  console_flush_all+0xcc/0x190
[11916.480563]  console_unlock+0x78/0x138
[11916.480565]  vprintk_emit+0x1c4/0x210
[11916.480568]  vprintk_default+0x40/0x58
[11916.480570]  vprintk+0x84/0xc8
[11916.480572]  _printk+0x68/0xa0
[11916.480578]  _fat_msg+0x6c/0xa0 [fat]
[11916.480593]  __fat_fs_error+0xf8/0x118 [fat]
[11916.480601]  fat_ent_read+0x164/0x238 [fat]
[11916.480609]  fat_get_cluster+0x180/0x2c8 [fat]
[11916.480617]  fat_get_mapped_cluster+0xb8/0x170 [fat]

Link: https://lkml.kernel.org/r/20250620020231.9292-1-me@linux.beauty
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fat/fatent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 1db348f8f887..a7061c2ad8e4 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -356,7 +356,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
 
 	if (!fat_valid_entry(sbi, entry)) {
 		fatent_brelse(fatent);
-		fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
+		fat_fs_error_ratelimit(sb, "invalid access to FAT (entry 0x%08x)", entry);
 		return -EIO;
 	}
 

From 01bda05819b89b38eebad7e2034b8ab14eee5207 Mon Sep 17 00:00:00 2001
From: Yaxin Wang <wang.yaxin@zte.com.cn>
Date: Thu, 19 Jun 2025 21:18:43 +0800
Subject: [PATCH 28/80] tools/accounting/delaytop: add delaytop to record top-n
 task delay

Problem
=======

The "getdelays" can only display the latency of a single task
by specifying a PID, but it has the following limitations:

1. single-task perspective: only supports querying the latency (CPU, I/O,
memory, etc.) of an individual task via PID and cannot provide a global
analysis of high-latency processes across the system.

2. lack of High-Latency process awareness: when the overall system
   latency is high (e.g., a spike in CPU latency), there is no way to
   quickly identify the top N processes contributing to the highest
   latency.

3. poor interactivity: It lacks dynamic sorting and refresh
   capabilities (similar to top), making it difficult to monitor latency
   changes in real time.

Solution
========

To address these limitations, we introduce the "delaytop" with the
following capabilities:

1. system view: monitors latency metrics (CPU, I/O, memory, IRQ, etc.)
   for all system processes

2. supports field-based sorting (e.g., default sort by CPU latency in
   descending order)

3. dynamic interactive interface: focus on specific processes with
   --pid; limit displayed entries with --processes 20; control monitoring
   duration with --iterations;

Use case
========
bash# ./delaytop
Top 20 processes (sorted by CPU delay):

  PID   TGID  COMMAND            CPU(ms)  IO(ms)   SWAP(ms) RCL(ms) THR(ms)  CMP(ms)  WP(ms)  IRQ(ms)
---------------------------------------------------------------------------------------------
   26     26  kworker/1:0H       5.55     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   32     32  kworker/2:0H-kb    2.93     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   38     38  kworker/3:0H-ev    2.88     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   84     84  kworker/R-vfio-    1.62     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   24     24  ksoftirqd/1        1.43     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   19     19  idle_inject/0      0.99     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   16     16  rcu_exp_par_gp_    0.87     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   11     11  kworker/0:1        0.87     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   22     22  idle_inject/1      0.80     0.00     0.00     0.00    0.00     0.00     0.00     0.00
    3      3  pool_workqueue_    0.74     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   81     81  scsi_eh_1          0.59     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   30     30  ksoftirqd/2        0.42     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   36     36  ksoftirqd/3        0.37     0.00     0.00     0.00    0.00     0.00     0.00     0.00
    9      9  kworker/0:0-eve    0.36     0.00     0.00     0.00    0.00     0.00     0.00     0.00
    8      8  kworker/R-netns    0.34     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   76     76  kworker/1:1-pm     0.32     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   21     21  cpuhp/1            0.30     0.00     0.00     0.00    0.00     0.00     0.00     0.00
    4      4  kworker/R-rcu_g    0.21     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   12     12  kworker/u16:0-i    0.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
    1      1  init               0.18     0.00     0.00     0.00    0.00     0.00     0.08     0.00

Link: https://lkml.kernel.org/r/20250619211843633h05gWrBDMFkEH6xAVm_5y@zte.com.cn
Co-developed-by: Fan Yu <fan.yu9@zte.com.cn>
Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Signed-off-by: Yaxin Wang <wang.yaxin@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Peilin He <he.peilin@zte.com.cn>
Cc: Qiang Tu <tu.qiang35@zte.com.cn>
Cc: wangyong <wang.yong12@zte.com.cn>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Cc: ye xingchen <ye.xingchen@zte.com.cn>
Cc: Yunkai Zhang <zhang.yunkai@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/accounting/Makefile   |   2 +-
 tools/accounting/delaytop.c | 673 ++++++++++++++++++++++++++++++++++++
 2 files changed, 674 insertions(+), 1 deletion(-)
 create mode 100644 tools/accounting/delaytop.c

diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile
index 11def1ad046c..20bbd461515e 100644
--- a/tools/accounting/Makefile
+++ b/tools/accounting/Makefile
@@ -2,7 +2,7 @@
 CC := $(CROSS_COMPILE)gcc
 CFLAGS := -I../../usr/include
 
-PROGS := getdelays procacct
+PROGS := getdelays procacct delaytop
 
 all: $(PROGS)
 
diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c
new file mode 100644
index 000000000000..23e38f39e97d
--- /dev/null
+++ b/tools/accounting/delaytop.c
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * delaytop.c - task delay monitoring tool.
+ *
+ * This tool provides real-time monitoring and statistics of
+ * system, container, and task-level delays, including CPU,
+ * memory, IO, and IRQ and delay accounting. It supports both
+ * interactive (top-like), and can output delay information
+ * for the whole system, specific containers (cgroups), or
+ * individual tasks (PIDs).
+ *
+ * Key features:
+ *   - Collects per-task delay accounting statistics via taskstats.
+ *   - Supports sorting, filtering.
+ *   - Supports both interactive (screen refresh).
+ *
+ * Copyright (C) Fan Yu, ZTE Corp. 2025
+ * Copyright (C) Wang Yaxin, ZTE Corp. 2025
+ *
+ * Compile with
+ *	gcc -I/usr/src/linux/include delaytop.c -o delaytop
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <signal.h>
+#include <time.h>
+#include <dirent.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <termios.h>
+#include <limits.h>
+#include <linux/genetlink.h>
+#include <linux/taskstats.h>
+#include <linux/cgroupstats.h>
+#include <ncurses.h>
+
+#define NLA_NEXT(na)			((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
+#define NLA_DATA(na)			((void *)((char *)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len)		(len - NLA_HDRLEN)
+
+#define GENLMSG_DATA(glh)		((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+
+#define TASK_COMM_LEN	16
+#define MAX_MSG_SIZE	1024
+#define MAX_TASKS		1000
+#define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field
+
+/* Program settings structure */
+struct config {
+	int delay;				/* Update interval in seconds */
+	int iterations;			/* Number of iterations, 0 == infinite */
+	int max_processes;		/* Maximum number of processes to show */
+	char sort_field;		/* Field to sort by */
+	int output_one_time;	/* Output once and exit */
+	int monitor_pid;		/* Monitor specific PID */
+	char *container_path;	/* Path to container cgroup */
+};
+
+/* Task delay information structure */
+struct task_info {
+	int pid;
+	int tgid;
+	char command[TASK_COMM_LEN];
+	unsigned long long cpu_count;
+	unsigned long long cpu_delay_total;
+	unsigned long long blkio_count;
+	unsigned long long blkio_delay_total;
+	unsigned long long swapin_count;
+	unsigned long long swapin_delay_total;
+	unsigned long long freepages_count;
+	unsigned long long freepages_delay_total;
+	unsigned long long thrashing_count;
+	unsigned long long thrashing_delay_total;
+	unsigned long long compact_count;
+	unsigned long long compact_delay_total;
+	unsigned long long wpcopy_count;
+	unsigned long long wpcopy_delay_total;
+	unsigned long long irq_count;
+	unsigned long long irq_delay_total;
+};
+
+/* Container statistics structure */
+struct container_stats {
+	int nr_sleeping;		/* Number of sleeping processes */
+	int nr_running;			/* Number of running processes */
+	int nr_stopped;			/* Number of stopped processes */
+	int nr_uninterruptible; /* Number of uninterruptible processes */
+	int nr_io_wait;			/* Number of processes in IO wait */
+};
+
+/* Global variables */
+static struct config cfg;
+static struct task_info tasks[MAX_TASKS];
+static int task_count;
+static int running = 1;
+static struct container_stats container_stats;
+
+/* Netlink socket variables */
+static int nl_sd = -1;
+static int family_id;
+
+/* Set terminal to non-canonical mode for q-to-quit */
+static struct termios orig_termios;
+static void enable_raw_mode(void)
+{
+	struct termios raw;
+
+	tcgetattr(STDIN_FILENO, &orig_termios);
+	raw = orig_termios;
+	raw.c_lflag &= ~(ICANON | ECHO);
+	tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw);
+}
+static void disable_raw_mode(void)
+{
+	tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
+}
+
+/* Display usage information and command line options */
+static void usage(void)
+{
+	printf("Usage: delaytop [Options]\n"
+	"Options:\n"
+	"  -h, --help               Show this help message and exit\n"
+	"  -d, --delay=SECONDS      Set refresh interval (default: 2 seconds, min: 1)\n"
+	"  -n, --iterations=COUNT   Set number of updates (default: 0 = infinite)\n"
+	"  -P, --processes=NUMBER   Set maximum number of processes to show (default: 20, max: 1000)\n"
+	"  -o, --once               Display once and exit\n"
+	"  -p, --pid=PID            Monitor only the specified PID\n"
+	"  -C, --container=PATH     Monitor the container at specified cgroup path\n");
+	exit(0);
+}
+
+/* Parse command line arguments and set configuration */
+static void parse_args(int argc, char **argv)
+{
+	int c;
+	struct option long_options[] = {
+		{"help", no_argument, 0, 'h'},
+		{"delay", required_argument, 0, 'd'},
+		{"iterations", required_argument, 0, 'n'},
+		{"pid", required_argument, 0, 'p'},
+		{"once", no_argument, 0, 'o'},
+		{"processes", required_argument, 0, 'P'},
+		{"container", required_argument, 0, 'C'},
+		{0, 0, 0, 0}
+	};
+
+	/* Set defaults */
+	cfg.delay = 2;
+	cfg.iterations = 0;
+	cfg.max_processes = 20;
+	cfg.sort_field = 'c';	/* Default sort by CPU delay */
+	cfg.output_one_time = 0;
+	cfg.monitor_pid = 0;	/* 0 means monitor all PIDs */
+	cfg.container_path = NULL;
+
+	while (1) {
+		int option_index = 0;
+
+		c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'h':
+			usage();
+			break;
+		case 'd':
+			cfg.delay = atoi(optarg);
+			if (cfg.delay < 1) {
+				fprintf(stderr, "Error: delay must be >= 1.\n");
+				exit(1);
+			}
+			break;
+		case 'n':
+			cfg.iterations = atoi(optarg);
+			if (cfg.iterations < 0) {
+				fprintf(stderr, "Error: iterations must be >= 0.\n");
+				exit(1);
+			}
+			break;
+		case 'p':
+			cfg.monitor_pid = atoi(optarg);
+			if (cfg.monitor_pid < 1) {
+				fprintf(stderr, "Error: pid must be >= 1.\n");
+				exit(1);
+			}
+			break;
+		case 'o':
+			cfg.output_one_time = 1;
+			break;
+		case 'P':
+			cfg.max_processes = atoi(optarg);
+			if (cfg.max_processes < 1) {
+				fprintf(stderr, "Error: processes must be >= 1.\n");
+				exit(1);
+			}
+			if (cfg.max_processes > MAX_TASKS) {
+				fprintf(stderr, "Warning: processes capped to %d.\n",
+					MAX_TASKS);
+				cfg.max_processes = MAX_TASKS;
+			}
+			break;
+		case 'C':
+			cfg.container_path = strdup(optarg);
+			break;
+		default:
+			fprintf(stderr, "Try 'delaytop --help' for more information.\n");
+			exit(1);
+		}
+	}
+}
+
+/* Create a raw netlink socket and bind */
+static int create_nl_socket(void)
+{
+	int fd;
+	struct sockaddr_nl local;
+
+	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	if (fd < 0)
+		return -1;
+
+	memset(&local, 0, sizeof(local));
+	local.nl_family = AF_NETLINK;
+
+	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) {
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/* Send a command via netlink */
+static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+			 __u8 genl_cmd, __u16 nla_type,
+			 void *nla_data, int nla_len)
+{
+	struct sockaddr_nl nladdr;
+	struct nlattr *na;
+	int r, buflen;
+	char *buf;
+
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[MAX_MSG_SIZE];
+	} msg;
+
+	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+	msg.n.nlmsg_type = nlmsg_type;
+	msg.n.nlmsg_flags = NLM_F_REQUEST;
+	msg.n.nlmsg_seq = 0;
+	msg.n.nlmsg_pid = nlmsg_pid;
+	msg.g.cmd = genl_cmd;
+	msg.g.version = 0x1;
+	na = (struct nlattr *) GENLMSG_DATA(&msg);
+	na->nla_type = nla_type;
+	na->nla_len = nla_len + NLA_HDRLEN;
+	memcpy(NLA_DATA(na), nla_data, nla_len);
+	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+	buf = (char *) &msg;
+	buflen = msg.n.nlmsg_len;
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+				   sizeof(nladdr))) < buflen) {
+		if (r > 0) {
+			buf += r;
+			buflen -= r;
+		} else if (errno != EAGAIN)
+			return -1;
+	}
+	return 0;
+}
+
+/* Get family ID for taskstats via netlink */
+static int get_family_id(int sd)
+{
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[256];
+	} ans;
+
+	int id = 0, rc;
+	struct nlattr *na;
+	int rep_len;
+	char name[100];
+
+	strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1);
+	name[sizeof(name) - 1] = '\0';
+	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+			CTRL_ATTR_FAMILY_NAME, (void *)name,
+			strlen(TASKSTATS_GENL_NAME)+1);
+	if (rc < 0)
+		return 0;
+
+	rep_len = recv(sd, &ans, sizeof(ans), 0);
+	if (ans.n.nlmsg_type == NLMSG_ERROR ||
+		(rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
+		return 0;
+
+	na = (struct nlattr *) GENLMSG_DATA(&ans);
+	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+	if (na->nla_type == CTRL_ATTR_FAMILY_ID)
+		id = *(__u16 *) NLA_DATA(na);
+	return id;
+}
+
+static int read_comm(int pid, char *comm_buf, size_t buf_size)
+{
+	char path[64];
+	size_t len;
+	FILE *fp;
+
+	snprintf(path, sizeof(path), "/proc/%d/comm", pid);
+	fp = fopen(path, "r");
+	if (!fp)
+		return -1;
+	if (fgets(comm_buf, buf_size, fp)) {
+		len = strlen(comm_buf);
+		if (len > 0 && comm_buf[len - 1] == '\n')
+			comm_buf[len - 1] = '\0';
+	} else {
+		fclose(fp);
+		return -1;
+	}
+	fclose(fp);
+	return 0;
+}
+
+static int fetch_and_fill_task_info(int pid, const char *comm)
+{
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[MAX_MSG_SIZE];
+	} resp;
+	struct taskstats stats;
+	struct nlattr *nested;
+	struct nlattr *na;
+	int nested_len;
+	int nl_len;
+	int rc;
+
+	if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
+				 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
+		return -1;
+	}
+	rc = recv(nl_sd, &resp, sizeof(resp), 0);
+	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR)
+		return -1;
+	nl_len = GENLMSG_PAYLOAD(&resp.n);
+	na = (struct nlattr *) GENLMSG_DATA(&resp);
+	while (nl_len > 0) {
+		if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
+			nested = (struct nlattr *) NLA_DATA(na);
+			nested_len = NLA_PAYLOAD(na->nla_len);
+			while (nested_len > 0) {
+				if (nested->nla_type == TASKSTATS_TYPE_STATS) {
+					memcpy(&stats, NLA_DATA(nested), sizeof(stats));
+					if (task_count < MAX_TASKS) {
+						tasks[task_count].pid = pid;
+						tasks[task_count].tgid = pid;
+						strncpy(tasks[task_count].command, comm,
+							TASK_COMM_LEN - 1);
+						tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
+						SET_TASK_STAT(task_count, cpu_count);
+						SET_TASK_STAT(task_count, cpu_delay_total);
+						SET_TASK_STAT(task_count, blkio_count);
+						SET_TASK_STAT(task_count, blkio_delay_total);
+						SET_TASK_STAT(task_count, swapin_count);
+						SET_TASK_STAT(task_count, swapin_delay_total);
+						SET_TASK_STAT(task_count, freepages_count);
+						SET_TASK_STAT(task_count, freepages_delay_total);
+						SET_TASK_STAT(task_count, thrashing_count);
+						SET_TASK_STAT(task_count, thrashing_delay_total);
+						SET_TASK_STAT(task_count, compact_count);
+						SET_TASK_STAT(task_count, compact_delay_total);
+						SET_TASK_STAT(task_count, wpcopy_count);
+						SET_TASK_STAT(task_count, wpcopy_delay_total);
+						SET_TASK_STAT(task_count, irq_count);
+						SET_TASK_STAT(task_count, irq_delay_total);
+						task_count++;
+					}
+					break;
+				}
+				nested_len -= NLA_ALIGN(nested->nla_len);
+				nested = NLA_NEXT(nested);
+			}
+		}
+		nl_len -= NLA_ALIGN(na->nla_len);
+		na = NLA_NEXT(na);
+	}
+	return 0;
+}
+
+static void get_task_delays(void)
+{
+	char comm[TASK_COMM_LEN];
+	struct dirent *entry;
+	DIR *dir;
+	int pid;
+
+	task_count = 0;
+	if (cfg.monitor_pid > 0) {
+		if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
+			fetch_and_fill_task_info(cfg.monitor_pid, comm);
+		return;
+	}
+
+	dir = opendir("/proc");
+	if (!dir) {
+		fprintf(stderr, "Error opening /proc directory\n");
+		return;
+	}
+
+	while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
+		if (!isdigit(entry->d_name[0]))
+			continue;
+		pid = atoi(entry->d_name);
+		if (pid == 0)
+			continue;
+		if (read_comm(pid, comm, sizeof(comm)) != 0)
+			continue;
+		fetch_and_fill_task_info(pid, comm);
+	}
+	closedir(dir);
+}
+
+/* Calculate average delay in milliseconds */
+static double average_ms(unsigned long long total, unsigned long long count)
+{
+	if (count == 0)
+		return 0;
+	return (double)total / 1000000.0 / count;
+}
+
+/* Comparison function for sorting tasks */
+static int compare_tasks(const void *a, const void *b)
+{
+	const struct task_info *t1 = (const struct task_info *)a;
+	const struct task_info *t2 = (const struct task_info *)b;
+	double avg1, avg2;
+
+	switch (cfg.sort_field) {
+	case 'c': /* CPU */
+		avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count);
+		avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count);
+		if (avg1 != avg2)
+			return avg2 > avg1 ? 1 : -1;
+		return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1;
+
+	default:
+		return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1;
+	}
+}
+
+/* Sort tasks by selected field */
+static void sort_tasks(void)
+{
+	if (task_count > 0)
+		qsort(tasks, task_count, sizeof(struct task_info), compare_tasks);
+}
+
+/* Get container statistics via cgroupstats */
+static void get_container_stats(void)
+{
+	int rc, cfd;
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[MAX_MSG_SIZE];
+	} req, resp;
+	struct nlattr *na;
+	int nl_len;
+	struct cgroupstats stats;
+
+	/* Check if container path is set */
+	if (!cfg.container_path)
+		return;
+
+	/* Open container cgroup */
+	cfd = open(cfg.container_path, O_RDONLY);
+	if (cfd < 0) {
+		fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
+		return;
+	}
+
+	/* Send request for container stats */
+	if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
+				CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
+		fprintf(stderr, "Failed to send request for container stats\n");
+		close(cfd);
+		return;
+	}
+
+	/* Receive response */
+	rc = recv(nl_sd, &resp, sizeof(resp), 0);
+	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
+		fprintf(stderr, "Failed to receive response for container stats\n");
+		close(cfd);
+		return;
+	}
+
+	/* Parse response */
+	nl_len = GENLMSG_PAYLOAD(&resp.n);
+	na = (struct nlattr *) GENLMSG_DATA(&resp);
+	while (nl_len > 0) {
+		if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
+			/* Get the cgroupstats structure */
+			memcpy(&stats, NLA_DATA(na), sizeof(stats));
+
+			/* Fill container stats */
+			container_stats.nr_sleeping = stats.nr_sleeping;
+			container_stats.nr_running = stats.nr_running;
+			container_stats.nr_stopped = stats.nr_stopped;
+			container_stats.nr_uninterruptible = stats.nr_uninterruptible;
+			container_stats.nr_io_wait = stats.nr_io_wait;
+			break;
+		}
+		nl_len -= NLA_ALIGN(na->nla_len);
+		na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+	}
+
+	close(cfd);
+}
+
+/* Display results to stdout or log file */
+static void display_results(void)
+{
+	time_t now = time(NULL);
+	struct tm *tm_now = localtime(&now);
+	char timestamp[32];
+	int i, count;
+	FILE *out = stdout;
+
+	fprintf(out, "\033[H\033[J");
+
+	if (cfg.container_path) {
+		fprintf(out, "Container Information (%s):\n", cfg.container_path);
+		fprintf(out, "Processes: running=%d, sleeping=%d, ",
+			container_stats.nr_running, container_stats.nr_sleeping);
+		fprintf(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
+			container_stats.nr_stopped, container_stats.nr_uninterruptible,
+			container_stats.nr_io_wait);
+	}
+	fprintf(out, "Top %d processes (sorted by CPU delay):\n\n",
+		   cfg.max_processes);
+	fprintf(out, "  PID	TGID  COMMAND		 CPU(ms)  IO(ms)   ");
+	fprintf(out, "SWAP(ms) RCL(ms) THR(ms)  CMP(ms)  WP(ms)  IRQ(ms)\n");
+	fprintf(out, "-----------------------------------------------");
+	fprintf(out, "----------------------------------------------\n");
+	count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
+
+	for (i = 0; i < count; i++) {
+		fprintf(out, "%5d  %5d  %-15s ",
+			tasks[i].pid, tasks[i].tgid, tasks[i].command);
+		fprintf(out, "%7.2f  %7.2f  %7.2f  %7.2f %7.2f  %7.2f  %7.2f  %7.2f\n",
+			average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count),
+			average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count),
+			average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count),
+			average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count),
+			average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count),
+			average_ms(tasks[i].compact_delay_total, tasks[i].compact_count),
+			average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count),
+			average_ms(tasks[i].irq_delay_total, tasks[i].irq_count));
+	}
+
+	fprintf(out, "\n");
+}
+
+/* Main function */
+int main(int argc, char **argv)
+{
+	int iterations = 0;
+	int use_q_quit = 0;
+
+	/* Parse command line arguments */
+	parse_args(argc, argv);
+
+	/* Setup netlink socket */
+	nl_sd = create_nl_socket();
+	if (nl_sd < 0) {
+		fprintf(stderr, "Error creating netlink socket\n");
+		exit(1);
+	}
+
+	/* Get family ID for taskstats via netlink */
+	family_id = get_family_id(nl_sd);
+	if (!family_id) {
+		fprintf(stderr, "Error getting taskstats family ID\n");
+		close(nl_sd);
+		exit(1);
+	}
+
+	if (!cfg.output_one_time) {
+		use_q_quit = 1;
+		enable_raw_mode();
+		printf("Press 'q' to quit.\n");
+		fflush(stdout);
+	}
+
+	/* Main loop */
+	while (running) {
+		/* Get container stats if container path provided */
+		if (cfg.container_path)
+			get_container_stats();
+
+		/* Get task delays */
+		get_task_delays();
+
+		/* Sort tasks */
+		sort_tasks();
+
+		/* Display results to stdout or log file */
+		display_results();
+
+		/* Check for iterations */
+		if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
+			break;
+
+		/* Exit if output_one_time is set */
+		if (cfg.output_one_time)
+			break;
+
+		/* Check for 'q' key to quit */
+		if (use_q_quit) {
+			struct timeval tv = {cfg.delay, 0};
+			fd_set readfds;
+
+			FD_ZERO(&readfds);
+			FD_SET(STDIN_FILENO, &readfds);
+			int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv);
+
+			if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
+				char ch = 0;
+
+				read(STDIN_FILENO, &ch, 1);
+				if (ch == 'q' || ch == 'Q') {
+					running = 0;
+					break;
+				}
+			}
+		} else {
+			sleep(cfg.delay);
+		}
+	}
+
+	/* Restore terminal mode */
+	if (use_q_quit)
+		disable_raw_mode();
+
+	/* Cleanup */
+	close(nl_sd);
+	if (cfg.container_path)
+		free(cfg.container_path);
+
+	return 0;
+}

From 0c954c57f9e1fcf5d1b3e1be5320978bfaf9cbed Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 23 Jun 2025 23:54:20 +0900
Subject: [PATCH 29/80] ocfs2: embed actual values into ocfs2_sysfile_lock_key
 names

Since lockdep_set_class() uses stringified key name via macro, calling
lockdep_set_class() with an array causes lockdep warning messages to
report variable name than actual index number.

Change ocfs2_init_locked_inode() to pass actual index number for better
readability of lockdep reports.  This patch does not change behavior.

Before:

  Chain exists of:
    &ocfs2_sysfile_lock_key[args->fi_sysfile_type] --> jbd2_handle --> &oi->ip_xattr_sem

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&oi->ip_xattr_sem);
                                 lock(jbd2_handle);
                                 lock(&oi->ip_xattr_sem);
    lock(&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);

   *** DEADLOCK ***

After:

  Chain exists of:
    &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE] --> jbd2_handle --> &oi->ip_xattr_sem

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&oi->ip_xattr_sem);
                                 lock(jbd2_handle);
                                 lock(&oi->ip_xattr_sem);
    lock(&ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]);

   *** DEADLOCK ***

Link: https://lkml.kernel.org/r/29348724-639c-443d-bbce-65c3a0a13a38@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/inode.c | 70 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 12e5d1f73325..14bf440ea4df 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -50,8 +50,6 @@ struct ocfs2_find_inode_args
 	unsigned int	fi_sysfile_type;
 };
 
-static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
-
 static int ocfs2_read_locked_inode(struct inode *inode,
 				   struct ocfs2_find_inode_args *args);
 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -250,14 +248,77 @@ bail:
 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 {
 	struct ocfs2_find_inode_args *args = opaque;
+#ifdef CONFIG_LOCKDEP
+	static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
 	static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
 				     ocfs2_file_ip_alloc_sem_key;
+#endif
 
 	inode->i_ino = args->fi_ino;
 	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
-	if (args->fi_sysfile_type != 0)
+#ifdef CONFIG_LOCKDEP
+	switch (args->fi_sysfile_type) {
+	case BAD_BLOCK_SYSTEM_INODE:
+		break;
+	case GLOBAL_INODE_ALLOC_SYSTEM_INODE:
 		lockdep_set_class(&inode->i_rwsem,
-			&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
+				  &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]);
+		break;
+	case SLOT_MAP_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]);
+		break;
+	case HEARTBEAT_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]);
+		break;
+	case GLOBAL_BITMAP_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]);
+		break;
+	case USER_QUOTA_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]);
+		break;
+	case GROUP_QUOTA_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]);
+		break;
+	case ORPHAN_DIR_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]);
+		break;
+	case EXTENT_ALLOC_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]);
+		break;
+	case INODE_ALLOC_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]);
+		break;
+	case JOURNAL_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]);
+		break;
+	case LOCAL_ALLOC_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]);
+		break;
+	case TRUNCATE_LOG_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]);
+		break;
+	case LOCAL_USER_QUOTA_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]);
+		break;
+	case LOCAL_GROUP_QUOTA_SYSTEM_INODE:
+		lockdep_set_class(&inode->i_rwsem,
+				  &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]);
+		break;
+	default:
+		WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type);
+	}
 	if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
 	    args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
 	    args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
@@ -267,6 +328,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 	else
 		lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
 				  &ocfs2_file_ip_alloc_sem_key);
+#endif
 
 	return 0;
 }

From 37d0f07bc5a2d2736021c9090ce796b8a66571ba Mon Sep 17 00:00:00 2001
From: Sachin Mokashi <sachin.mokashi@intel.com>
Date: Tue, 24 Jun 2025 10:12:20 -0400
Subject: [PATCH 30/80] mailmap: update Sachin Mokashi's email address

As previous contributions were made with the older email address, which is
no longer in use.  Update my new address to map the old one.

Link: https://lkml.kernel.org/r/20250624141220.1264691-1-sachin.mokashi@intel.com
Signed-off-by: Sachin Mokashi <sachin.mokashi@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .mailmap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index b0ace71968ab..c1f4381f9685 100644
--- a/.mailmap
+++ b/.mailmap
@@ -670,6 +670,7 @@ Muchun Song <muchun.song@linux.dev> <smuchun@gmail.com>
 Ross Zwisler <zwisler@kernel.org> <ross.zwisler@linux.intel.com>
 Rudolf Marek <R.Marek@sh.cvut.cz>
 Rui Saraiva <rmps@joel.ist.utl.pt>
+Sachin Mokashi <sachin.mokashi@intel.com> <sachinx.mokashi@intel.com>
 Sachin P Sant <ssant@in.ibm.com>
 Sai Prakash Ranjan <quic_saipraka@quicinc.com> <saiprakash.ranjan@codeaurora.org>
 Sakari Ailus <sakari.ailus@linux.intel.com> <sakari.ailus@iki.fi>

From ad0039db42179064f9b60eab67c797f7359fdefc Mon Sep 17 00:00:00 2001
From: Su Hui <suhui@nfschina.com>
Date: Thu, 26 Jun 2025 18:54:41 +0800
Subject: [PATCH 31/80] fs/proc/vmcore: a few cleanups for
 vmcore_add_device_dump()

There are two cleanups for vmcore_add_device_dump().  Return -ENOMEM
directly rather than goto the label to simplify the code and use
scoped_guard() to simplify the lock/unlock code.

Link: https://lkml.kernel.org/r/20250626105440.1053139-1-suhui@nfschina.com
Signed-off-by: Su Hui <suhui@nfschina.com>
Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Suhui <suhui@nfschina.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/vmcore.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 10d01eb09c43..f188bd900eb2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1490,10 +1490,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
 		return -EINVAL;
 
 	dump = vzalloc(sizeof(*dump));
-	if (!dump) {
-		ret = -ENOMEM;
-		goto out_err;
-	}
+	if (!dump)
+		return -ENOMEM;
 
 	/* Keep size of the buffer page aligned so that it can be mmaped */
 	data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
@@ -1519,22 +1517,19 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
 	dump->size = data_size;
 
 	/* Add the dump to driver sysfs list and update the elfcore hdr */
-	mutex_lock(&vmcore_mutex);
-	if (vmcore_opened)
-		pr_warn_once("Unexpected adding of device dump\n");
-	if (vmcore_open) {
-		ret = -EBUSY;
-		goto unlock;
+	scoped_guard(mutex, &vmcore_mutex) {
+		if (vmcore_opened)
+			pr_warn_once("Unexpected adding of device dump\n");
+		if (vmcore_open) {
+			ret = -EBUSY;
+			goto out_err;
+		}
+
+		list_add_tail(&dump->list, &vmcoredd_list);
+		vmcoredd_update_size(data_size);
 	}
-
-	list_add_tail(&dump->list, &vmcoredd_list);
-	vmcoredd_update_size(data_size);
-	mutex_unlock(&vmcore_mutex);
 	return 0;
 
-unlock:
-	mutex_unlock(&vmcore_mutex);
-
 out_err:
 	vfree(buf);
 	vfree(dump);

From d0118d7d20bbf9a76f75840bc3b0da0f4d092da9 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 30 Jun 2025 19:21:31 +0900
Subject: [PATCH 32/80] ocfs2: update d_splice_alias() return code checking

When commit d3556babd7fa ("ocfs2: fix d_splice_alias() return code
checking") was merged into v3.18-rc3, d_splice_alias() was returning one
of a valid dentry, NULL or an ERR_PTR.

When commit b5ae6b15bd73 ("merge d_materialise_unique() into
d_splice_alias()") was merged into v3.19-rc1, d_splice_alias() started
returning -ELOOP as one of ERR_PTR values.

Now, when syzkaller mounts a crafted ocfs2 filesystem image that hits
d_splice_alias() == -ELOOP case from ocfs2_lookup(), ocfs2_lookup() fails
to handle -ELOOP case and generic_shutdown_super() hits "VFS: Busy inodes
after unmount" message.

Instead of calling ocfs2_dentry_attach_lock() or ocfs2_dentry_attach_gen()
when d_splice_alias() returned an ERR_PTR value, change ocfs2_lookup() to
bail out immediately.

Also, ocfs2_lookup() needs to call dupt() when ocfs2_dentry_attach_lock()
returned an ERR_PTR value.

Link: https://lkml.kernel.org/r/da5be67d-2a0b-4b93-85d6-42f3b7440135@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+1134d3a5b062e9665a7a@syzkaller.appspotmail.com>
Closes: https://syzkaller.appspot.com/bug?extid=1134d3a5b062e9665a7a
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tetsuo Handa <penguin-kernel@i-love-sakura.ne.jp>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/namei.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 26bc59c3a813..c90b254da75e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -142,6 +142,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 
 bail_add:
 	ret = d_splice_alias(inode, dentry);
+	if (IS_ERR(ret))
+		goto bail_unlock;
 
 	if (inode) {
 		/*
@@ -154,15 +156,16 @@ bail_add:
 		 * NOTE: This dentry already has ->d_op set from
 		 * ocfs2_get_parent() and ocfs2_get_dentry()
 		 */
-		if (!IS_ERR_OR_NULL(ret))
+		if (ret)
 			dentry = ret;
 
 		status = ocfs2_dentry_attach_lock(dentry, inode,
 						  OCFS2_I(dir)->ip_blkno);
 		if (status) {
 			mlog_errno(status);
+			if (ret)
+				dput(ret);
 			ret = ERR_PTR(status);
-			goto bail_unlock;
 		}
 	} else
 		ocfs2_dentry_attach_gen(dentry);

From 1f04e0e65209be3148a20b4b370e01d02b7ac445 Mon Sep 17 00:00:00 2001
From: Moon Hee Lee <moonhee.lee.ca@gmail.com>
Date: Mon, 23 Jun 2025 11:34:06 -0700
Subject: [PATCH 33/80] selftests: ptrace: add set_syscall_info to .gitignore

Add the set_syscall_info test binary to .gitignore to avoid tracking build
artifacts in the ptrace selftests directory.

Link: https://lkml.kernel.org/r/20250623183405.133434-2-moonhee.lee.ca@gmail.com
Signed-off-by: Moon Hee Lee <moonhee.lee.ca@gmail.com>
Cc: "Dmitry V. Levin" <ldv@strace.io>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/ptrace/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore
index b7dde152e75a..f6be8efd57ea 100644
--- a/tools/testing/selftests/ptrace/.gitignore
+++ b/tools/testing/selftests/ptrace/.gitignore
@@ -3,3 +3,4 @@ get_syscall_info
 get_set_sud
 peeksiginfo
 vmaccess
+set_syscall_info

From 22c2ed6996ac34df506040a069fac3e5100b5c0e Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris@chromium.org>
Date: Wed, 2 Jul 2025 16:52:00 -0700
Subject: [PATCH 34/80] checkpatch: check for missing sentinels in ID arrays

All of the ID tables based on <linux/mod_devicetable.h> (of_device_id,
pci_device_id, ...) require their arrays to end in an empty sentinel
value.  That's usually spelled with an empty initializer entry (e.g.,
"{}"), but also sometimes with explicit 0 entries, field initializers
(e.g., '.id = ""'), or even a macro entry (like PCMCIA_DEVICE_NULL).

Without a sentinel, device-matching code may read out of bounds.

I've found a number of such bugs in driver reviews, and we even
occasionally commit one to the tree.  See commit 5751eee5c620 ("i2c:
nomadik: Add missing sentinel to match table") for example.

Teach checkpatch to find these ID tables, and complain if it looks like
there wasn't a sentinel value.

Test output:

  $ git format-patch -1 a0d15cc47f29be6d --stdout | scripts/checkpatch.pl -
  ERROR: missing sentinel in ID array
  #57: FILE: drivers/i2c/busses/i2c-nomadik.c:1073:
  +static const struct of_device_id nmk_i2c_eyeq_match_table[] = {
   	{
   		.compatible = "XXXXXXXXXXXXXXXXXX",
   		.data = (void *)(NMK_I2C_EYEQ_FLAG_32B_BUS | NMK_I2C_EYEQ_FLAG_IS_EYEQ5),
   	},
   };

  total: 1 errors, 0 warnings, 66 lines checked

  NOTE: For some of the reported defects, checkpatch may be able to
        mechanically convert to the typical style using --fix or --fix-inplace.

  "[PATCH] i2c: nomadik: switch from of_device_is_compatible() to" has style problems, please review.

  NOTE: If any of the errors are false positives, please report
        them to the maintainer, see CHECKPATCH in MAINTAINERS.

When run across the entire tree (scripts/checkpatch.pl -q --types
MISSING_SENTINEL -f ...), false positives exist:

* where macros are used that hide the table from analysis
  (e.g., drivers/gpu/drm/radeon/radeon_drv.c / radeon_PCI_IDS).
  There are fewer than 5 of these.

* where such tables are processed correctly via ARRAY_SIZE() (fewer than
  5 instances). This is by far not the typical usage of *_device_id
  arrays.

* some odd parsing artifacts, where ctx_statement_block() seems to quit
  in the middle of a block due to #if/#else/#endif.

Also, not every "struct *_device_id" is in fact a sentinel-requiring
structure, but even with such types, false positives are very rare.

Link: https://lkml.kernel.org/r/20250702235245.1007351-1-briannorris@chromium.org
Signed-off-by: Brian Norris <briannorris@chromium.org>
Acked-by: Joe Perches <joe@perches.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Brian Norris <briannorris@chromium.org>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/checkpatch.pl | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 489b74d52abe..d4c24318548c 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -685,6 +685,9 @@ our $tracing_logging_tags = qr{(?xi:
 	[\.\!:\s]*
 )};
 
+# Device ID types like found in include/linux/mod_devicetable.h.
+our $dev_id_types = qr{\b[a-z]\w*_device_id\b};
+
 sub edit_distance_min {
 	my (@arr) = @_;
 	my $len = scalar @arr;
@@ -7679,6 +7682,31 @@ sub process {
 			WARN("DUPLICATED_SYSCTL_CONST",
 				"duplicated sysctl range checking value '$1', consider using the shared one in include/linux/sysctl.h\n" . $herecurr);
 		}
+
+# Check that *_device_id tables have sentinel entries.
+		if (defined $stat && $line =~ /struct\s+$dev_id_types\s+\w+\s*\[\s*\]\s*=\s*\{/) {
+			my $stripped = $stat;
+
+			# Strip diff line prefixes.
+			$stripped =~ s/(^|\n)./$1/g;
+			# Line continuations.
+			$stripped =~ s/\\\n/\n/g;
+			# Strip whitespace, empty strings, zeroes, and commas.
+			$stripped =~ s/""//g;
+			$stripped =~ s/0x0//g;
+			$stripped =~ s/[\s$;,0]//g;
+			# Strip field assignments.
+			$stripped =~ s/\.$Ident=//g;
+
+			if (!(substr($stripped, -4) eq "{}};" ||
+			      substr($stripped, -6) eq "{{}}};" ||
+			      $stripped =~ /ISAPNP_DEVICE_SINGLE_END}};$/ ||
+			      $stripped =~ /ISAPNP_CARD_END}};$/ ||
+			      $stripped =~ /NULL};$/ ||
+			      $stripped =~ /PCMCIA_DEVICE_NULL};$/)) {
+				ERROR("MISSING_SENTINEL", "missing sentinel in ID array\n" . "$here\n$stat\n");
+			}
+		}
 	}
 
 	# If we have no input at all, then there is nothing to report on

From 35c18f2933c596b4fd6a98baee36f3137d133a5f Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 12 Jun 2025 12:13:21 +0200
Subject: [PATCH 35/80] Add a new optional ",cma" suffix to the crashkernel=
 command line option

Patch series "kdump: crashkernel reservation from CMA", v5.

This series implements a way to reserve additional crash kernel memory
using CMA.

Currently, all the memory for the crash kernel is not usable by the 1st
(production) kernel.  It is also unmapped so that it can't be corrupted by
the fault that will eventually trigger the crash.  This makes sense for
the memory actually used by the kexec-loaded crash kernel image and initrd
and the data prepared during the load (vmcoreinfo, ...).  However, the
reserved space needs to be much larger than that to provide enough
run-time memory for the crash kernel and the kdump userspace.  Estimating
the amount of memory to reserve is difficult.  Being too careful makes
kdump likely to end in OOM, being too generous takes even more memory from
the production system.  Also, the reservation only allows reserving a
single contiguous block (or two with the "low" suffix).  I've seen systems
where this fails because the physical memory is fragmented.

By reserving additional crashkernel memory from CMA, the main crashkernel
reservation can be just large enough to fit the kernel and initrd image,
minimizing the memory taken away from the production system.  Most of the
run-time memory for the crash kernel will be memory previously available
to userspace in the production system.  As this memory is no longer
wasted, the reservation can be done with a generous margin, making kdump
more reliable.  Kernel memory that we need to preserve for dumping is
normally not allocated from CMA, unless it is explicitly allocated as
movable.  Currently this is only the case for memory ballooning and zswap.
Such movable memory will be missing from the vmcore.  User data is
typically not dumped by makedumpfile.  When dumping of user data is
intended this new CMA reservation cannot be used.

There are five patches in this series:

The first adds a new ",cma" suffix to the recenly introduced generic
crashkernel parsing code.  parse_crashkernel() takes one more argument to
store the cma reservation size.

The second patch implements reserve_crashkernel_cma() which performs the
reservation.  If the requested size is not available in a single range,
multiple smaller ranges will be reserved.

The third patch updates Documentation/, explicitly mentioning the
potential DMA corruption of the CMA-reserved memory.

The fourth patch adds a short delay before booting the kdump kernel,
allowing pending DMA transfers to finish.

The fifth patch enables the functionality for x86 as a proof of
concept. There are just three things every arch needs to do:
- call reserve_crashkernel_cma()
- include the CMA-reserved ranges in the physical memory map
- exclude the CMA-reserved ranges from the memory available
  through /proc/vmcore by excluding them from the vmcoreinfo
  PT_LOAD ranges.

Adding other architectures is easy and I can do that as soon as this
series is merged.

With this series applied, specifying
	crashkernel=100M craskhernel=1G,cma
on the command line will make a standard crashkernel reservation
of 100M, where kexec will load the kernel and initrd.

An additional 1G will be reserved from CMA, still usable by the production
system.  The crash kernel will have 1.1G memory available.  The 100M can
be reliably predicted based on the size of the kernel and initrd.

The new cma suffix is completely optional. When no
crashkernel=size,cma is specified, everything works as before.


This patch (of 5):

Add a new cma_size parameter to parse_crashkernel().  When not NULL, call
__parse_crashkernel to parse the CMA reservation size from
"crashkernel=size,cma" and store it in cma_size.

Set cma_size to NULL in all calls to parse_crashkernel().

Link: https://lkml.kernel.org/r/aEqnxxfLZMllMC8I@dwarf.suse.cz
Link: https://lkml.kernel.org/r/aEqoQckgoTQNULnh@dwarf.suse.cz
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Donald Dutile <ddutile@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Philipp Rudo <prudo@redhat.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Tao Liu <ltao@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/kernel/setup.c              |  2 +-
 arch/arm64/mm/init.c                 |  2 +-
 arch/loongarch/kernel/setup.c        |  2 +-
 arch/mips/kernel/setup.c             |  2 +-
 arch/powerpc/kernel/fadump.c         |  2 +-
 arch/powerpc/kexec/core.c            |  2 +-
 arch/powerpc/mm/nohash/kaslr_booke.c |  2 +-
 arch/riscv/mm/init.c                 |  2 +-
 arch/s390/kernel/setup.c             |  2 +-
 arch/sh/kernel/machine_kexec.c       |  2 +-
 arch/x86/kernel/setup.c              |  2 +-
 include/linux/crash_reserve.h        |  3 ++-
 kernel/crash_reserve.c               | 16 ++++++++++++++--
 13 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index a41c93988d2c..0bfd66c7ada0 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -1004,7 +1004,7 @@ static void __init reserve_crashkernel(void)
 	total_mem = get_total_mem();
 	ret = parse_crashkernel(boot_command_line, total_mem,
 				&crash_size, &crash_base,
-				NULL, NULL);
+				NULL, NULL, NULL);
 	/* invalid value specified or crashkernel=0 */
 	if (ret || !crash_size)
 		return;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 0c8c35dd645e..ea84a61ed508 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 				&crash_size, &crash_base,
-				&low_size, &high);
+				&low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index b99fbb388fe0..22b27cd447a1 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -265,7 +265,7 @@ static void __init arch_reserve_crashkernel(void)
 		return;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-				&crash_size, &crash_base, &low_size, &high);
+				&crash_size, &crash_base, &low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index fbfe0771317e..11b9b6b63e19 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -458,7 +458,7 @@ static void __init mips_parse_crashkernel(void)
 	total_mem = memblock_phys_mem_size();
 	ret = parse_crashkernel(boot_command_line, total_mem,
 				&crash_size, &crash_base,
-				NULL, NULL);
+				NULL, NULL, NULL);
 	if (ret != 0 || crash_size <= 0)
 		return;
 
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ca49e40c473..28cab25d5b33 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -333,7 +333,7 @@ static __init u64 fadump_calculate_reserve_size(void)
 	 * memory at a predefined offset.
 	 */
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-				&size, &base, NULL, NULL);
+				&size, &base, NULL, NULL, NULL);
 	if (ret == 0 && size > 0) {
 		unsigned long max_size;
 
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 00e9c267b912..d1a2d755381c 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -110,7 +110,7 @@ void __init arch_reserve_crashkernel(void)
 
 	/* use common parsing */
 	ret = parse_crashkernel(boot_command_line, total_mem_sz, &crash_size,
-				&crash_base, NULL, NULL);
+				&crash_base, NULL, NULL, NULL);
 
 	if (ret)
 		return;
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 5c8d1bb98b3e..5e4897daaaea 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size)
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, size, &crash_size,
-				&crash_base, NULL, NULL);
+				&crash_base, NULL, NULL, NULL);
 	if (ret != 0 || crash_size == 0)
 		return;
 	if (crash_base == 0)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 8d0374d7ce8e..15683ae13fa5 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1408,7 +1408,7 @@ static void __init arch_reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 				&crash_size, &crash_base,
-				&low_size, &high);
+				&low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index f244c5560e7f..b99aeb0db2ee 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -605,7 +605,7 @@ static void __init reserve_crashkernel(void)
 	int rc;
 
 	rc = parse_crashkernel(boot_command_line, ident_map_size,
-			       &crash_size, &crash_base, NULL, NULL);
+			       &crash_size, &crash_base, NULL, NULL, NULL);
 
 	crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
 	crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN);
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 8321b31d2e19..37073ca1e0ad 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -146,7 +146,7 @@ void __init reserve_crashkernel(void)
 		return;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-			&crash_size, &crash_base, NULL, NULL);
+			&crash_size, &crash_base, NULL, NULL, NULL);
 	if (ret == 0 && crash_size > 0) {
 		crashk_res.start = crash_base;
 		crashk_res.end = crash_base + crash_size - 1;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fb27be697128..c22dc630c297 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -608,7 +608,7 @@ static void __init arch_reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 				&crash_size, &crash_base,
-				&low_size, &high);
+				&low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
index 1fe7e7d1b214..e784aaff2f5a 100644
--- a/include/linux/crash_reserve.h
+++ b/include/linux/crash_reserve.h
@@ -16,7 +16,8 @@ extern struct resource crashk_low_res;
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
-		unsigned long long *low_size, bool *high);
+		unsigned long long *low_size, unsigned long long *cma_size,
+		bool *high);
 
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index acb6bf42e30d..86ae1365d04e 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -172,17 +172,19 @@ static int __init parse_crashkernel_simple(char *cmdline,
 
 #define SUFFIX_HIGH 0
 #define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
+#define SUFFIX_CMA  2
+#define SUFFIX_NULL 3
 static __initdata char *suffix_tbl[] = {
 	[SUFFIX_HIGH] = ",high",
 	[SUFFIX_LOW]  = ",low",
+	[SUFFIX_CMA]  = ",cma",
 	[SUFFIX_NULL] = NULL,
 };
 
 /*
  * That function parses "suffix"  crashkernel command lines like
  *
- *	crashkernel=size,[high|low]
+ *	crashkernel=size,[high|low|cma]
  *
  * It returns 0 on success and -EINVAL on failure.
  */
@@ -298,9 +300,11 @@ int __init parse_crashkernel(char *cmdline,
 			     unsigned long long *crash_size,
 			     unsigned long long *crash_base,
 			     unsigned long long *low_size,
+			     unsigned long long *cma_size,
 			     bool *high)
 {
 	int ret;
+	unsigned long long __always_unused cma_base;
 
 	/* crashkernel=X[@offset] */
 	ret = __parse_crashkernel(cmdline, system_ram, crash_size,
@@ -331,6 +335,14 @@ int __init parse_crashkernel(char *cmdline,
 
 		*high = true;
 	}
+
+	/*
+	 * optional CMA reservation
+	 * cma_base is ignored
+	 */
+	if (cma_size)
+		__parse_crashkernel(cmdline, 0, cma_size,
+			&cma_base, suffix_tbl[SUFFIX_CMA]);
 #endif
 	if (!*crash_size)
 		ret = -EINVAL;

From ab475510e0422bb5672d465f9d0f523d72fdb7f1 Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 12 Jun 2025 12:16:39 +0200
Subject: [PATCH 36/80] kdump: implement reserve_crashkernel_cma

reserve_crashkernel_cma() reserves CMA ranges for the crash kernel.  If
allocating the requested size fails, try to reserve in smaller blocks.

Store the reserved ranges in the crashk_cma_ranges array and the number of
ranges in crashk_cma_cnt.

Link: https://lkml.kernel.org/r/aEqpBwOy_ekm0gw9@dwarf.suse.cz
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Donald Dutile <ddutile@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Philipp Rudo <prudo@redhat.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Tao Liu <ltao@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_reserve.h | 12 ++++++++
 kernel/crash_reserve.c        | 52 +++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
index e784aaff2f5a..7b44b41d0a20 100644
--- a/include/linux/crash_reserve.h
+++ b/include/linux/crash_reserve.h
@@ -13,12 +13,24 @@
  */
 extern struct resource crashk_res;
 extern struct resource crashk_low_res;
+extern struct range crashk_cma_ranges[];
+#if defined(CONFIG_CMA) && defined(CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION)
+#define CRASHKERNEL_CMA
+#define CRASHKERNEL_CMA_RANGES_MAX 4
+extern int crashk_cma_cnt;
+#else
+#define crashk_cma_cnt 0
+#define CRASHKERNEL_CMA_RANGES_MAX 0
+#endif
+
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, unsigned long long *cma_size,
 		bool *high);
 
+void __init reserve_crashkernel_cma(unsigned long long cma_size);
+
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
 #define DEFAULT_CRASH_KERNEL_LOW_SIZE	(128UL << 20)
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 86ae1365d04e..87bf4d41eabb 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -14,6 +14,8 @@
 #include <linux/cpuhotplug.h>
 #include <linux/memblock.h>
 #include <linux/kmemleak.h>
+#include <linux/cma.h>
+#include <linux/crash_reserve.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -469,6 +471,56 @@ retry:
 #endif
 }
 
+struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX];
+#ifdef CRASHKERNEL_CMA
+int crashk_cma_cnt;
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+	unsigned long long request_size = roundup(cma_size, PAGE_SIZE);
+	unsigned long long reserved_size = 0;
+
+	if (!cma_size)
+		return;
+
+	while (cma_size > reserved_size &&
+	       crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) {
+
+		struct cma *res;
+
+		if (cma_declare_contiguous(0, request_size, 0, 0, 0, false,
+				       "crashkernel", &res)) {
+			/* reservation failed, try half-sized blocks */
+			if (request_size <= PAGE_SIZE)
+				break;
+
+			request_size = roundup(request_size / 2, PAGE_SIZE);
+			continue;
+		}
+
+		crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res);
+		crashk_cma_ranges[crashk_cma_cnt].end =
+			crashk_cma_ranges[crashk_cma_cnt].start +
+			cma_get_size(res) - 1;
+		++crashk_cma_cnt;
+		reserved_size += request_size;
+	}
+
+	if (cma_size > reserved_size)
+		pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n",
+			cma_size >> 20, reserved_size >> 20, crashk_cma_cnt);
+	else
+		pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n",
+			reserved_size >> 20, crashk_cma_cnt);
+}
+
+#else /* CRASHKERNEL_CMA */
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+	if (cma_size)
+		pr_warn("crashkernel CMA reservation not supported\n");
+}
+#endif
+
 #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
 static __init int insert_crashkernel_resources(void)
 {

From ce1bf19a34dfa1f418037cebe11f5d2c7adf9d1e Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 12 Jun 2025 12:17:39 +0200
Subject: [PATCH 37/80] kdump, documentation: describe craskernel CMA
 reservation

Describe the new crashkernel ",cma" suffix in Documentation/

Link: https://lkml.kernel.org/r/aEqpQwUy6gqSiUkV@dwarf.suse.cz
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Donald Dutile <ddutile@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Philipp Rudo <prudo@redhat.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Tao Liu <ltao@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kdump/kdump.rst     | 21 ++++++++++++++++++
 .../admin-guide/kernel-parameters.txt         | 22 +++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index 20fabdf6567e..9c6cd52f69cf 100644
--- a/Documentation/admin-guide/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -311,6 +311,27 @@ crashkernel syntax
 
             crashkernel=0,low
 
+4) crashkernel=size,cma
+
+	Reserve additional crash kernel memory from CMA. This reservation is
+	usable by the first system's userspace memory and kernel movable
+	allocations (memory balloon, zswap). Pages allocated from this memory
+	range will not be included in the vmcore so this should not be used if
+	dumping of userspace memory is intended and it has to be expected that
+	some movable kernel pages may be missing from the dump.
+
+	A standard crashkernel reservation, as described above, is still needed
+	to hold the crash kernel and initrd.
+
+	This option increases the risk of a kdump failure: DMA transfers
+	configured by the first kernel may end up corrupting the second
+	kernel's memory.
+
+	This reservation method is intended for systems that can't afford to
+	sacrifice enough memory for standard crashkernel reservation and where
+	less reliable and possibly incomplete kdump is preferable to no kdump at
+	all.
+
 Boot into System Kernel
 -----------------------
 1) Update the boot loader (such as grub, yaboot, or lilo) configuration
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f1f2c0874da9..ac4a239b9388 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -986,6 +986,28 @@
 			0: to disable low allocation.
 			It will be ignored when crashkernel=X,high is not used
 			or memory reserved is below 4G.
+	crashkernel=size[KMG],cma
+			[KNL, X86] Reserve additional crash kernel memory from
+			CMA. This reservation is usable by the first system's
+			userspace memory and kernel movable allocations (memory
+			balloon, zswap). Pages allocated from this memory range
+			will not be included in the vmcore so this should not
+			be used if dumping of userspace memory is intended and
+			it has to be expected that some movable kernel pages
+			may be missing from the dump.
+
+			A standard crashkernel reservation, as described above,
+			is still needed to hold the crash kernel and initrd.
+
+			This option increases the risk of a kdump failure: DMA
+			transfers configured by the first kernel may end up
+			corrupting the second kernel's memory.
+
+			This reservation method is intended for systems that
+			can't afford to sacrifice enough memory for standard
+			crashkernel reservation and where less reliable and
+			possibly incomplete kdump is preferable to no kdump at
+			all.
 
 	cryptomgr.notests
 			[KNL] Disable crypto self-tests

From e1280f3071f11abc1bacd84937ecf077dce449f3 Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 12 Jun 2025 12:18:40 +0200
Subject: [PATCH 38/80] kdump: wait for DMA to finish when using CMA

When re-using the CMA area for kdump there is a risk of pending DMA into
pinned user pages in the CMA area.

Pages residing in CMA areas can usually not get long-term pinned and are
instead migrated away from the CMA area, so long-term pinning is typically
not a concern.  (BUGs in the kernel might still lead to long-term pinning
of such pages if everything goes wrong.)

Pages pinned without FOLL_LONGTERM remain in the CMA and may possibly be
the source or destination of a pending DMA transfer.

Although there is no clear specification how long a page may be pinned
without FOLL_LONGTERM, pinning without the flag shows an intent of the
caller to only use the memory for short-lived DMA transfers, not a
transfer initiated by a device asynchronously at a random time in the
future.

Add a delay of CMA_DMA_TIMEOUT_SEC seconds before starting the kdump
kernel, giving such short-lived DMA transfers time to finish before the
CMA memory is re-used by the kdump kernel.

Set CMA_DMA_TIMEOUT_SEC to 10 seconds - chosen arbitrarily as both a huge
margin for a DMA transfer, yet not increasing the kdump time too
significantly.

Link: https://lkml.kernel.org/r/aEqpgDIBndZ5LXSo@dwarf.suse.cz
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Donald Dutile <ddutile@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Philipp Rudo <prudo@redhat.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Tao Liu <ltao@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 335b8425dd4b..a4ef79591eb2 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -21,6 +21,7 @@
 #include <linux/reboot.h>
 #include <linux/btf.h>
 #include <linux/objtool.h>
+#include <linux/delay.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -33,6 +34,11 @@
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
 
+/* time to wait for possible DMA to finish before starting the kdump kernel
+ * when a CMA reservation is used
+ */
+#define CMA_DMA_TIMEOUT_SEC 10
+
 #ifdef CONFIG_CRASH_DUMP
 
 int kimage_crash_copy_vmcoreinfo(struct kimage *image)
@@ -97,6 +103,14 @@ int kexec_crash_loaded(void)
 }
 EXPORT_SYMBOL_GPL(kexec_crash_loaded);
 
+static void crash_cma_clear_pending_dma(void)
+{
+	if (!crashk_cma_cnt)
+		return;
+
+	mdelay(CMA_DMA_TIMEOUT_SEC * 1000);
+}
+
 /*
  * No panic_cpu check version of crash_kexec().  This function is called
  * only when panic_cpu holds the current CPU number; this is the only CPU
@@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
 			crash_setup_regs(&fixed_regs, regs);
 			crash_save_vmcoreinfo();
 			machine_crash_shutdown(&fixed_regs);
+			crash_cma_clear_pending_dma();
 			machine_kexec(kexec_crash_image);
 		}
 		kexec_unlock();

From bf8be1c3610829056e5445282ca92ca7b7a4ba7b Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 12 Jun 2025 12:20:04 +0200
Subject: [PATCH 39/80] x86: implement crashkernel cma reservation

Implement the crashkernel CMA reservation for x86:
- enable parsing of the cma suffix by parse_crashkernel()
- reserve memory with reserve_crashkernel_cma()
- add the CMA-reserved ranges to the e820 map for the crash kernel
- exclude the CMA-reserved ranges from vmcore

Link: https://lkml.kernel.org/r/aEqp1LD2og4QeBw9@dwarf.suse.cz
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Donald Dutile <ddutile@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Philipp Rudo <prudo@redhat.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Tao Liu <ltao@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kernel/crash.c | 26 ++++++++++++++++++++++----
 arch/x86/kernel/setup.c |  5 +++--
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index bcb534688dfe..c6b12bed173d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -163,10 +163,10 @@ static struct crash_mem *fill_up_crash_elf_data(void)
 		return NULL;
 
 	/*
-	 * Exclusion of crash region and/or crashk_low_res may cause
-	 * another range split. So add extra two slots here.
+	 * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
+	 * may cause range splits. So add extra slots here.
 	 */
-	nr_ranges += 2;
+	nr_ranges += 2 + crashk_cma_cnt;
 	cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
 	if (!cmem)
 		return NULL;
@@ -184,6 +184,7 @@ static struct crash_mem *fill_up_crash_elf_data(void)
 static int elf_header_exclude_ranges(struct crash_mem *cmem)
 {
 	int ret = 0;
+	int i;
 
 	/* Exclude the low 1M because it is always reserved */
 	ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
@@ -198,8 +199,17 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
 	if (crashk_low_res.end)
 		ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
 					      crashk_low_res.end);
+	if (ret)
+		return ret;
 
-	return ret;
+	for (i = 0; i < crashk_cma_cnt; ++i) {
+		ret = crash_exclude_mem_range(cmem, crashk_cma_ranges[i].start,
+					      crashk_cma_ranges[i].end);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
 }
 
 static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
@@ -374,6 +384,14 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
 		add_e820_entry(params, &ei);
 	}
 
+	for (i = 0; i < crashk_cma_cnt; ++i) {
+		ei.addr = crashk_cma_ranges[i].start;
+		ei.size = crashk_cma_ranges[i].end -
+			  crashk_cma_ranges[i].start + 1;
+		ei.type = E820_TYPE_RAM;
+		add_e820_entry(params, &ei);
+	}
+
 out:
 	vfree(cmem);
 	return ret;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c22dc630c297..680d1b6dfea4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -599,7 +599,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 
 static void __init arch_reserve_crashkernel(void)
 {
-	unsigned long long crash_base, crash_size, low_size = 0;
+	unsigned long long crash_base, crash_size, low_size = 0, cma_size = 0;
 	bool high = false;
 	int ret;
 
@@ -608,7 +608,7 @@ static void __init arch_reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 				&crash_size, &crash_base,
-				&low_size, NULL, &high);
+				&low_size, &cma_size, &high);
 	if (ret)
 		return;
 
@@ -618,6 +618,7 @@ static void __init arch_reserve_crashkernel(void)
 	}
 
 	reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
+	reserve_crashkernel_cma(cma_size);
 }
 
 static struct resource standard_io_resources[] = {

From 261743b0135d1d578cab407ba0cf226df30b43d8 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@linux.alibaba.com>
Date: Thu, 3 Jul 2025 10:10:00 +0800
Subject: [PATCH 40/80] panic: clean up code for console replay

Patch series "generalize panic_print's dump function to be used by other
kernel parts", v3.

When working on kernel stability issues, panic, task-hung and
software/hardware lockup are frequently met.  And to debug them, user may
need lots of system information at that time, like task call stacks, lock
info, memory info etc.

panic case already has panic_print_sys_info() for this purpose, and has a
'panic_print' bitmask to control what kinds of information is needed,
which is also helpful to debug other task-hung and lockup cases.

So this patchset extracts the function out to a new file 'lib/sys_info.c',
and makes it available for other cases which also need to dump system info
for debugging.

Also as suggested by Petr Mladek, add 'panic_sys_info=' interface to take
human readable string like "tasks,mem,locks,timers,ftrace,....", and
eventually obsolete the current 'panic_print' bitmap interface.

In RFC and V1 version, hung_task and SW/HW watchdog modules are enabled
with the new sys_info dump interface.  In v2, they are kept out for better
review of current change, and will be posted later.

Locally these have been used in our bug chasing for stability issues and
was proven helpful.

Many thanks to Petr Mladek for great suggestions on both the code and
architectures!


This patch (of 5):

Currently the panic_print_sys_info() was called twice with different
parameters to handle console replay case, which is kind of confusing.

Add panic_console_replay() explicitly and rename
'PANIC_PRINT_ALL_PRINTK_MSG' to 'PANIC_CONSOLE_REPLAY', to make the code
straightforward.  The related kernel document is also updated.

Link: https://lkml.kernel.org/r/20250703021004.42328-1-feng.tang@linux.alibaba.com
Link: https://lkml.kernel.org/r/20250703021004.42328-2-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../admin-guide/kernel-parameters.txt          |  2 +-
 Documentation/admin-guide/sysctl/kernel.rst    |  2 +-
 kernel/panic.c                                 | 18 +++++++++---------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index ac4a239b9388..3780b7e6bfd5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4555,7 +4555,7 @@
 			bit 2: print timer info
 			bit 3: print locks info if CONFIG_LOCKDEP is on
 			bit 4: print ftrace buffer
-			bit 5: print all printk messages in buffer
+			bit 5: replay all messages on consoles at the end of panic
 			bit 6: print all CPUs backtrace (if available in the arch)
 			bit 7: print only tasks in uninterruptible (blocked) state
 			*Be aware* that this option may print a _lot_ of lines,
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index dd49a89a62d3..0d08b7a2db2d 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -889,7 +889,7 @@ bit 1  print system memory info
 bit 2  print timer info
 bit 3  print locks info if ``CONFIG_LOCKDEP`` is on
 bit 4  print ftrace buffer
-bit 5  print all printk messages in buffer
+bit 5  replay all messages on consoles at the end of panic
 bit 6  print all CPUs backtrace (if available in the arch)
 bit 7  print only tasks in uninterruptible (blocked) state
 =====  ============================================
diff --git a/kernel/panic.c b/kernel/panic.c
index b0b9a8bf4560..9b6c5dc28a65 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -74,7 +74,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
 #define PANIC_PRINT_TIMER_INFO		0x00000004
 #define PANIC_PRINT_LOCK_INFO		0x00000008
 #define PANIC_PRINT_FTRACE_INFO		0x00000010
-#define PANIC_PRINT_ALL_PRINTK_MSG	0x00000020
+#define PANIC_CONSOLE_REPLAY		0x00000020
 #define PANIC_PRINT_ALL_CPU_BT		0x00000040
 #define PANIC_PRINT_BLOCKED_TASKS	0x00000080
 unsigned long panic_print;
@@ -238,14 +238,14 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
 }
 EXPORT_SYMBOL(nmi_panic);
 
-static void panic_print_sys_info(bool console_flush)
+static void panic_console_replay(void)
 {
-	if (console_flush) {
-		if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
-			console_flush_on_panic(CONSOLE_REPLAY_ALL);
-		return;
-	}
+	if (panic_print & PANIC_CONSOLE_REPLAY)
+		console_flush_on_panic(CONSOLE_REPLAY_ALL);
+}
 
+static void panic_print_sys_info(void)
+{
 	if (panic_print & PANIC_PRINT_TASK_INFO)
 		show_state();
 
@@ -410,7 +410,7 @@ void panic(const char *fmt, ...)
 	 */
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
-	panic_print_sys_info(false);
+	panic_print_sys_info();
 
 	kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
 
@@ -439,7 +439,7 @@ void panic(const char *fmt, ...)
 	debug_locks_off();
 	console_flush_on_panic(CONSOLE_FLUSH_PENDING);
 
-	panic_print_sys_info(true);
+	panic_console_replay();
 
 	if (!panic_blink)
 		panic_blink = no_blink;

From b76e89e50fc3693b7b8a443ed906320d8ccb93fd Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@linux.alibaba.com>
Date: Thu, 3 Jul 2025 10:10:01 +0800
Subject: [PATCH 41/80] panic: generalize panic_print's function to show sys
 info

'panic_print' was introduced to help debugging kernel panic by dumping
different kinds of system information like tasks' call stack, memory,
ftrace buffer, etc.  Actually this function could also be used to help
debugging other cases like task-hung, soft/hard lockup, etc.  where user
may need the snapshot of system info at that time.

Extract system info dump function related code from panic.c to separate
file sys_info.[ch], for wider usage by other kernel parts for debugging.

Also modify the macro names about singulars/plurals.

Link: https://lkml.kernel.org/r/20250703021004.42328-3-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sys_info.h | 20 ++++++++++++++++++++
 kernel/panic.c           | 36 ++++--------------------------------
 lib/Makefile             |  2 +-
 lib/sys_info.c           | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+), 33 deletions(-)
 create mode 100644 include/linux/sys_info.h
 create mode 100644 lib/sys_info.c

diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h
new file mode 100644
index 000000000000..53b7e27dbf2a
--- /dev/null
+++ b/include/linux/sys_info.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SYS_INFO_H
+#define _LINUX_SYS_INFO_H
+
+/*
+ * SYS_INFO_PANIC_CONSOLE_REPLAY is for panic case only, as it needs special
+ * handling which only fits panic case.
+ */
+#define SYS_INFO_TASKS			0x00000001
+#define SYS_INFO_MEM			0x00000002
+#define SYS_INFO_TIMERS			0x00000004
+#define SYS_INFO_LOCKS			0x00000008
+#define SYS_INFO_FTRACE			0x00000010
+#define SYS_INFO_PANIC_CONSOLE_REPLAY	0x00000020
+#define SYS_INFO_ALL_CPU_BT		0x00000040
+#define SYS_INFO_BLOCKED_TASKS		0x00000080
+
+void sys_info(unsigned long si_mask);
+
+#endif	/* _LINUX_SYS_INFO_H */
diff --git a/kernel/panic.c b/kernel/panic.c
index 9b6c5dc28a65..cbb0681177b3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,6 +36,7 @@
 #include <linux/sysfs.h>
 #include <linux/context_tracking.h>
 #include <linux/seq_buf.h>
+#include <linux/sys_info.h>
 #include <trace/events/error_report.h>
 #include <asm/sections.h>
 
@@ -69,14 +70,6 @@ bool panic_triggering_all_cpu_backtrace;
 int panic_timeout = CONFIG_PANIC_TIMEOUT;
 EXPORT_SYMBOL_GPL(panic_timeout);
 
-#define PANIC_PRINT_TASK_INFO		0x00000001
-#define PANIC_PRINT_MEM_INFO		0x00000002
-#define PANIC_PRINT_TIMER_INFO		0x00000004
-#define PANIC_PRINT_LOCK_INFO		0x00000008
-#define PANIC_PRINT_FTRACE_INFO		0x00000010
-#define PANIC_CONSOLE_REPLAY		0x00000020
-#define PANIC_PRINT_ALL_CPU_BT		0x00000040
-#define PANIC_PRINT_BLOCKED_TASKS	0x00000080
 unsigned long panic_print;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -240,31 +233,10 @@ EXPORT_SYMBOL(nmi_panic);
 
 static void panic_console_replay(void)
 {
-	if (panic_print & PANIC_CONSOLE_REPLAY)
+	if (panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY)
 		console_flush_on_panic(CONSOLE_REPLAY_ALL);
 }
 
-static void panic_print_sys_info(void)
-{
-	if (panic_print & PANIC_PRINT_TASK_INFO)
-		show_state();
-
-	if (panic_print & PANIC_PRINT_MEM_INFO)
-		show_mem();
-
-	if (panic_print & PANIC_PRINT_TIMER_INFO)
-		sysrq_timer_list_show();
-
-	if (panic_print & PANIC_PRINT_LOCK_INFO)
-		debug_show_all_locks();
-
-	if (panic_print & PANIC_PRINT_FTRACE_INFO)
-		ftrace_dump(DUMP_ALL);
-
-	if (panic_print & PANIC_PRINT_BLOCKED_TASKS)
-		show_state_filter(TASK_UNINTERRUPTIBLE);
-}
-
 void check_panic_on_warn(const char *origin)
 {
 	unsigned int limit;
@@ -285,7 +257,7 @@ void check_panic_on_warn(const char *origin)
  */
 static void panic_other_cpus_shutdown(bool crash_kexec)
 {
-	if (panic_print & PANIC_PRINT_ALL_CPU_BT) {
+	if (panic_print & SYS_INFO_ALL_CPU_BT) {
 		/* Temporary allow non-panic CPUs to write their backtraces. */
 		panic_triggering_all_cpu_backtrace = true;
 		trigger_all_cpu_backtrace();
@@ -410,7 +382,7 @@ void panic(const char *fmt, ...)
 	 */
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
-	panic_print_sys_info();
+	sys_info(panic_print);
 
 	kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
 
diff --git a/lib/Makefile b/lib/Makefile
index c38582f187dd..88d6228089a8 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,7 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o win_minmax.o memcat_p.o \
-	 buildid.o objpool.o iomem_copy.o
+	 buildid.o objpool.o iomem_copy.o sys_info.o
 
 lib-$(CONFIG_UNION_FIND) += union_find.o
 lib-$(CONFIG_PRINTK) += dump_stack.o
diff --git a/lib/sys_info.c b/lib/sys_info.c
new file mode 100644
index 000000000000..53031e5cb98e
--- /dev/null
+++ b/lib/sys_info.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched/debug.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/nmi.h>
+
+#include <linux/sys_info.h>
+
+void sys_info(unsigned long si_mask)
+{
+	if (si_mask & SYS_INFO_TASKS)
+		show_state();
+
+	if (si_mask & SYS_INFO_MEM)
+		show_mem();
+
+	if (si_mask & SYS_INFO_TIMERS)
+		sysrq_timer_list_show();
+
+	if (si_mask & SYS_INFO_LOCKS)
+		debug_show_all_locks();
+
+	if (si_mask & SYS_INFO_FTRACE)
+		ftrace_dump(DUMP_ALL);
+
+	if (si_mask & SYS_INFO_ALL_CPU_BT)
+		trigger_all_cpu_backtrace();
+
+	if (si_mask & SYS_INFO_BLOCKED_TASKS)
+		show_state_filter(TASK_UNINTERRUPTIBLE);
+}

From d747755917bf8ae08f490c3fe7d8e321afab8127 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@linux.alibaba.com>
Date: Thu, 3 Jul 2025 10:10:02 +0800
Subject: [PATCH 42/80] panic: add 'panic_sys_info' sysctl to take human
 readable string parameter

Bitmap definition for 'panic_print' is hard to remember and decode.  Add
'panic_sys_info='sysctl to take human readable string like
"tasks,mem,timers,locks,ftrace,..." and translate it into bitmap.

The detailed mapping is:
	SYS_INFO_TASKS		"tasks"
	SYS_INFO_MEM		"mem"
	SYS_INFO_TIMERS		"timers"
	SYS_INFO_LOCKS		"locks"
	SYS_INFO_FTRACE		"ftrace"
	SYS_INFO_ALL_CPU_BT	"all_bt"
	SYS_INFO_BLOCKED_TASKS	"blocked_tasks"

[nathan@kernel.org: add __maybe_unused to sys_info_avail]
  Link: https://lkml.kernel.org/r/20250708-fix-clang-sys_info_avail-warning-v1-1-60d239eacd64@kernel.org
Link: https://lkml.kernel.org/r/20250703021004.42328-4-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/sysctl/kernel.rst | 18 +++++
 include/linux/sys_info.h                    |  8 ++
 kernel/panic.c                              |  7 ++
 lib/sys_info.c                              | 90 +++++++++++++++++++++
 4 files changed, 123 insertions(+)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 0d08b7a2db2d..cccb06d1a6bf 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -899,6 +899,24 @@ So for example to print tasks and memory info on panic, user can::
   echo 3 > /proc/sys/kernel/panic_print
 
 
+panic_sys_info
+==============
+
+A comma separated list of extra information to be dumped on panic,
+for example, "tasks,mem,timers,...".  It is a human readable alternative
+to 'panic_print'. Possible values are:
+
+=============   ===================================================
+tasks           print all tasks info
+mem             print system memory info
+timer           print timers info
+lock            print locks info if CONFIG_LOCKDEP is on
+ftrace          print ftrace buffer
+all_bt          print all CPUs backtrace (if available in the arch)
+blocked_tasks   print only tasks in uninterruptible (blocked) state
+=============   ===================================================
+
+
 panic_on_rcu_stall
 ==================
 
diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h
index 53b7e27dbf2a..89d77dc4f2ed 100644
--- a/include/linux/sys_info.h
+++ b/include/linux/sys_info.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_SYS_INFO_H
 #define _LINUX_SYS_INFO_H
 
+#include <linux/sysctl.h>
+
 /*
  * SYS_INFO_PANIC_CONSOLE_REPLAY is for panic case only, as it needs special
  * handling which only fits panic case.
@@ -16,5 +18,11 @@
 #define SYS_INFO_BLOCKED_TASKS		0x00000080
 
 void sys_info(unsigned long si_mask);
+unsigned long sys_info_parse_param(char *str);
 
+#ifdef CONFIG_SYSCTL
+int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write,
+					  void *buffer, size_t *lenp,
+					  loff_t *ppos);
+#endif
 #endif	/* _LINUX_SYS_INFO_H */
diff --git a/kernel/panic.c b/kernel/panic.c
index cbb0681177b3..d7aa427dc23c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -126,6 +126,13 @@ static const struct ctl_table kern_panic_table[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_douintvec,
 	},
+	{
+		.procname	= "panic_sys_info",
+		.data		= &panic_print,
+		.maxlen         = sizeof(panic_print),
+		.mode		= 0644,
+		.proc_handler	= sysctl_sys_info_handler,
+	},
 };
 
 static __init int kernel_panic_sysctls_init(void)
diff --git a/lib/sys_info.c b/lib/sys_info.c
index 53031e5cb98e..5bf503fd7ec1 100644
--- a/lib/sys_info.c
+++ b/lib/sys_info.c
@@ -3,10 +3,100 @@
 #include <linux/console.h>
 #include <linux/kernel.h>
 #include <linux/ftrace.h>
+#include <linux/sysctl.h>
 #include <linux/nmi.h>
 
 #include <linux/sys_info.h>
 
+struct sys_info_name {
+	unsigned long bit;
+	const char *name;
+};
+
+/*
+ * When 'si_names' gets updated,  please make sure the 'sys_info_avail'
+ * below is updated accordingly.
+ */
+static const struct sys_info_name  si_names[] = {
+	{ SYS_INFO_TASKS,		"tasks" },
+	{ SYS_INFO_MEM,			"mem" },
+	{ SYS_INFO_TIMERS,		"timers" },
+	{ SYS_INFO_LOCKS,		"locks" },
+	{ SYS_INFO_FTRACE,		"ftrace" },
+	{ SYS_INFO_ALL_CPU_BT,		"all_bt" },
+	{ SYS_INFO_BLOCKED_TASKS,	"blocked_tasks" },
+};
+
+/* Expecting string like "xxx_sys_info=tasks,mem,timers,locks,ftrace,..." */
+unsigned long sys_info_parse_param(char *str)
+{
+	unsigned long si_bits = 0;
+	char *s, *name;
+	int i;
+
+	s = str;
+	while ((name = strsep(&s, ",")) && *name) {
+		for (i = 0; i < ARRAY_SIZE(si_names); i++) {
+			if (!strcmp(name, si_names[i].name)) {
+				si_bits |= si_names[i].bit;
+				break;
+			}
+		}
+	}
+
+	return si_bits;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static const char sys_info_avail[] __maybe_unused = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks";
+
+int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write,
+					  void *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	char names[sizeof(sys_info_avail) + 1];
+	struct ctl_table table;
+	unsigned long *si_bits_global;
+
+	si_bits_global = ro_table->data;
+
+	if (write) {
+		unsigned long si_bits;
+		int ret;
+
+		table = *ro_table;
+		table.data = names;
+		table.maxlen = sizeof(names);
+		ret = proc_dostring(&table, write, buffer, lenp, ppos);
+		if (ret)
+			return ret;
+
+		si_bits = sys_info_parse_param(names);
+		/* The access to the global value is not synchronized. */
+		WRITE_ONCE(*si_bits_global, si_bits);
+		return 0;
+	} else {
+		/* for 'read' operation */
+		char *delim = "";
+		int i, len = 0;
+
+		for (i = 0; i < ARRAY_SIZE(si_names); i++) {
+			if (*si_bits_global & si_names[i].bit) {
+				len += scnprintf(names + len, sizeof(names) - len,
+					"%s%s", delim, si_names[i].name);
+				delim = ",";
+			}
+		}
+
+		table = *ro_table;
+		table.data = names;
+		table.maxlen = sizeof(names);
+		return proc_dostring(&table, write, buffer, lenp, ppos);
+	}
+}
+#endif
+
 void sys_info(unsigned long si_mask)
 {
 	if (si_mask & SYS_INFO_TASKS)

From 9743d12d0c63968320ece31e2e48723f3235be6d Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@linux.alibaba.com>
Date: Thu, 3 Jul 2025 10:10:03 +0800
Subject: [PATCH 43/80] panic: add 'panic_sys_info=' setup option for kernel
 cmdline

'panic_sys_info=' sysctl interface is already added for runtime setting.
Add counterpart kernel cmdline option for boottime setting.

Link: https://lkml.kernel.org/r/20250703021004.42328-5-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 15 +++++++++++++++
 kernel/panic.c                                  |  9 +++++++++
 2 files changed, 24 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3780b7e6bfd5..55a887d6309c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4563,6 +4563,21 @@
 			Use this option carefully, maybe worth to setup a
 			bigger log buffer with "log_buf_len" along with this.
 
+	panic_sys_info= A comma separated list of extra information to be dumped
+                        on panic.
+                        Format: val[,val...]
+                        Where @val can be any of the following:
+
+                        tasks:          print all tasks info
+                        mem:            print system memory info
+			timers:         print timers info
+                        locks:          print locks info if CONFIG_LOCKDEP is on
+                        ftrace:         print ftrace buffer
+                        all_bt:         print all CPUs backtrace (if available in the arch)
+                        blocked_tasks:  print only tasks in uninterruptible (blocked) state
+
+                        This is a human readable alternative to the 'panic_print' option.
+
 	parkbd.port=	[HW] Parallel port number the keyboard adapter is
 			connected to, default is 0.
 			Format: <parport#>
diff --git a/kernel/panic.c b/kernel/panic.c
index d7aa427dc23c..d9d4fcd5e318 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -143,6 +143,15 @@ static __init int kernel_panic_sysctls_init(void)
 late_initcall(kernel_panic_sysctls_init);
 #endif
 
+/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */
+static int __init setup_panic_sys_info(char *buf)
+{
+	/* There is no risk of race in kernel boot phase */
+	panic_print = sys_info_parse_param(buf);
+	return 1;
+}
+__setup("panic_sys_info=", setup_panic_sys_info);
+
 static atomic_t warn_count = ATOMIC_INIT(0);
 
 #ifdef CONFIG_SYSFS

From ee13240cd78b68430eb50af4721b3f18dd08af29 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@linux.alibaba.com>
Date: Thu, 3 Jul 2025 10:10:04 +0800
Subject: [PATCH 44/80] panic: add note that panic_print sysctl interface is
 deprecated

Add a dedicated core parameter 'panic_console_replay' for controlling
console replay, and add note that 'panic_print' sysctl interface will be
obsoleted by 'panic_sys_info' and 'panic_console_replay'.  When it
happens, the SYS_INFO_PANIC_CONSOLE_REPLAY can be removed as well.

Link: https://lkml.kernel.org/r/20250703021004.42328-6-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../admin-guide/kernel-parameters.txt         |  4 ++++
 kernel/panic.c                                | 21 ++++++++++++-------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 55a887d6309c..3d1e55ed4382 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4578,6 +4578,10 @@
 
                         This is a human readable alternative to the 'panic_print' option.
 
+	panic_console_replay
+			When panic happens, replay all kernel messages on
+			consoles at the end of panic.
+
 	parkbd.port=	[HW] Parallel port number the keyboard adapter is
 			connected to, default is 0.
 			Format: <parport#>
diff --git a/kernel/panic.c b/kernel/panic.c
index d9d4fcd5e318..bb16f254cd02 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -64,6 +64,7 @@ int panic_on_warn __read_mostly;
 unsigned long panic_on_taint;
 bool panic_on_taint_nousertaint = false;
 static unsigned int warn_limit __read_mostly;
+static bool panic_console_replay;
 
 bool panic_triggering_all_cpu_backtrace;
 
@@ -77,6 +78,13 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 EXPORT_SYMBOL(panic_notifier_list);
 
 #ifdef CONFIG_SYSCTL
+static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
+			   void *buffer, size_t *lenp, loff_t *ppos)
+{
+	pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n");
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+
 static const struct ctl_table kern_panic_table[] = {
 #ifdef CONFIG_SMP
 	{
@@ -108,7 +116,7 @@ static const struct ctl_table kern_panic_table[] = {
 		.data		= &panic_print,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.proc_handler	= sysctl_panic_print_handler,
 	},
 	{
 		.procname	= "panic_on_warn",
@@ -247,12 +255,6 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
 }
 EXPORT_SYMBOL(nmi_panic);
 
-static void panic_console_replay(void)
-{
-	if (panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY)
-		console_flush_on_panic(CONSOLE_REPLAY_ALL);
-}
-
 void check_panic_on_warn(const char *origin)
 {
 	unsigned int limit;
@@ -427,7 +429,9 @@ void panic(const char *fmt, ...)
 	debug_locks_off();
 	console_flush_on_panic(CONSOLE_FLUSH_PENDING);
 
-	panic_console_replay();
+	if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) ||
+		panic_console_replay)
+		console_flush_on_panic(CONSOLE_REPLAY_ALL);
 
 	if (!panic_blink)
 		panic_blink = no_blink;
@@ -869,6 +873,7 @@ core_param(panic_print, panic_print, ulong, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
 core_param(panic_on_warn, panic_on_warn, int, 0644);
 core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
+core_param(panic_console_replay, panic_console_replay, bool, 0644);
 
 static int __init oops_setup(char *s)
 {

From 249e7ced7271d8a7e733b1fe7ea7a221eb46698f Mon Sep 17 00:00:00 2001
From: Easwar Hariharan <eahariha@linux.microsoft.com>
Date: Thu, 3 Jul 2025 15:51:32 -0700
Subject: [PATCH 45/80] coccinelle: misc: secs_to_jiffies: implement context
 and report modes

As requested by Ricardo and Jakub, implement report and context modes for
the secs_to_jiffies Coccinelle script.  While here, add the option to look
for opportunities to use secs_to_jiffies() in headers.

Link: https://lkml.kernel.org/r/20250703225145.152288-1-eahariha@linux.microsoft.com
Signed-off-by: Easwar Hariharan <eahariha@linux.microsoft.com>
Closes: https://lore.kernel.org/all/20250129-secs_to_jiffles-v1-1-35a5e16b9f03@chromium.org/
Closes: https://lore.kernel.org/all/20250221162107.409ae333@kernel.org/
Tested-by: Ricardo Ribalda <ribalda@chromium.org>
Cc: Julia Lawall <julia.lawall@inria.fr>
Cc: Nicolas Palix <nicolas.palix@imag.fr>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Ricardo Ribalda <ribalda@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/coccinelle/misc/secs_to_jiffies.cocci | 49 +++++++++++++++++--
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/scripts/coccinelle/misc/secs_to_jiffies.cocci b/scripts/coccinelle/misc/secs_to_jiffies.cocci
index 416f348174ca..f3241ce75a7b 100644
--- a/scripts/coccinelle/misc/secs_to_jiffies.cocci
+++ b/scripts/coccinelle/misc/secs_to_jiffies.cocci
@@ -7,26 +7,65 @@
 // Confidence: High
 // Copyright: (C) 2024 Easwar Hariharan, Microsoft
 // Keywords: secs, seconds, jiffies
-//
+// Options: --include-headers
 
 virtual patch
+virtual report
+virtual context
 
-@depends on patch@ constant C; @@
+@pconst depends on patch@ constant C; @@
 
 - msecs_to_jiffies(C * 1000)
 + secs_to_jiffies(C)
 
-@depends on patch@ constant C; @@
+@pconstms depends on patch@ constant C; @@
 
 - msecs_to_jiffies(C * MSEC_PER_SEC)
 + secs_to_jiffies(C)
 
-@depends on patch@ expression E; @@
+@pexpr depends on patch@ expression E; @@
 
 - msecs_to_jiffies(E * 1000)
 + secs_to_jiffies(E)
 
-@depends on patch@ expression E; @@
+@pexprms depends on patch@ expression E; @@
 
 - msecs_to_jiffies(E * MSEC_PER_SEC)
 + secs_to_jiffies(E)
+
+@r depends on report && !patch@
+constant C;
+expression E;
+position p;
+@@
+
+(
+  msecs_to_jiffies(C@p * 1000)
+|
+  msecs_to_jiffies(C@p * MSEC_PER_SEC)
+|
+  msecs_to_jiffies(E@p * 1000)
+|
+  msecs_to_jiffies(E@p * MSEC_PER_SEC)
+)
+
+@c depends on context && !patch@
+constant C;
+expression E;
+@@
+
+(
+* msecs_to_jiffies(C * 1000)
+|
+* msecs_to_jiffies(C * MSEC_PER_SEC)
+|
+* msecs_to_jiffies(E * 1000)
+|
+* msecs_to_jiffies(E * MSEC_PER_SEC)
+)
+
+@script:python depends on report@
+p << r.p;
+@@
+
+coccilib.report.print_report(p[0], "WARNING opportunity for secs_to_jiffies()")

From ae2da51def76020fa16f53cd3446c00cafe41008 Mon Sep 17 00:00:00 2001
From: Lance Yang <lance.yang@linux.dev>
Date: Fri, 27 Jun 2025 15:29:22 +0800
Subject: [PATCH 46/80] locking/rwsem: make owner helpers globally available

Patch series "extend hung task blocker tracking to rwsems".

Inspired by mutex blocker tracking[1], and having already extended it to
semaphores, let's now add support for reader-writer semaphores (rwsems).

The approach is simple: when a task enters TASK_UNINTERRUPTIBLE while
waiting for an rwsem, we just call hung_task_set_blocker().  The hung task
detector can then query the rwsem's owner to identify the lock holder.

Tracking works reliably for writers, as there can only be a single writer
holding the lock, and its task struct is stored in the owner field.

The main challenge lies with readers.  The owner field points to only one
of many concurrent readers, so we might lose track of the blocker if that
specific reader unlocks, even while others remain.  This is not a
significant issue, however.  In practice, long-lasting lock contention is
almost always caused by a writer.  Therefore, reliably tracking the writer
is the primary goal of this patch series ;)

With this change, the hung task detector can now show blocker task's info
like below:

[Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked for more than 122 seconds.
[Fri Jun 27 15:21:34 2025]       Tainted: G S                  6.16.0-rc3 #8
[Fri Jun 27 15:21:34 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Fri Jun 27 15:21:34 2025] task:cat             state:D stack:0     pid:28631 tgid:28631 ppid:28501  task_flags:0x400000 flags:0x00004000
[Fri Jun 27 15:21:34 2025] Call Trace:
[Fri Jun 27 15:21:34 2025]  <TASK>
[Fri Jun 27 15:21:34 2025]  __schedule+0x7c7/0x1930
[Fri Jun 27 15:21:34 2025]  ? __pfx___schedule+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? policy_nodemask+0x215/0x340
[Fri Jun 27 15:21:34 2025]  ? _raw_spin_lock_irq+0x8a/0xe0
[Fri Jun 27 15:21:34 2025]  ? __pfx__raw_spin_lock_irq+0x10/0x10
[Fri Jun 27 15:21:34 2025]  schedule+0x6a/0x180
[Fri Jun 27 15:21:34 2025]  schedule_preempt_disabled+0x15/0x30
[Fri Jun 27 15:21:34 2025]  rwsem_down_read_slowpath+0x55e/0xe10
[Fri Jun 27 15:21:34 2025]  ? __pfx_rwsem_down_read_slowpath+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __pfx___might_resched+0x10/0x10
[Fri Jun 27 15:21:34 2025]  down_read+0xc9/0x230
[Fri Jun 27 15:21:34 2025]  ? __pfx_down_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __debugfs_file_get+0x14d/0x700
[Fri Jun 27 15:21:34 2025]  ? __pfx___debugfs_file_get+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? handle_pte_fault+0x52a/0x710
[Fri Jun 27 15:21:34 2025]  ? selinux_file_permission+0x3a9/0x590
[Fri Jun 27 15:21:34 2025]  read_dummy_rwsem_read+0x4a/0x90
[Fri Jun 27 15:21:34 2025]  full_proxy_read+0xff/0x1c0
[Fri Jun 27 15:21:34 2025]  ? rw_verify_area+0x6d/0x410
[Fri Jun 27 15:21:34 2025]  vfs_read+0x177/0xa50
[Fri Jun 27 15:21:34 2025]  ? __pfx_vfs_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? fdget_pos+0x1cf/0x4c0
[Fri Jun 27 15:21:34 2025]  ksys_read+0xfc/0x1d0
[Fri Jun 27 15:21:34 2025]  ? __pfx_ksys_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  do_syscall_64+0x66/0x2d0
[Fri Jun 27 15:21:34 2025]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f3f8faefb40
[Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffdeda5ab98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f3f8faefb40
[Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 00000000010fa000 RDI: 0000000000000003
[Fri Jun 27 15:21:34 2025] RBP: 00000000010fa000 R08: 0000000000000000 R09: 0000000000010fff
[Fri Jun 27 15:21:34 2025] R10: 00007ffdeda59fe0 R11: 0000000000000246 R12: 00000000010fa000
[Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[Fri Jun 27 15:21:34 2025]  </TASK>
[Fri Jun 27 15:21:34 2025] INFO: task cat:28631 <reader> blocked on an rw-semaphore likely owned by task cat:28630 <writer>
[Fri Jun 27 15:21:34 2025] task:cat             state:S stack:0     pid:28630 tgid:28630 ppid:28501  task_flags:0x400000 flags:0x00004000
[Fri Jun 27 15:21:34 2025] Call Trace:
[Fri Jun 27 15:21:34 2025]  <TASK>
[Fri Jun 27 15:21:34 2025]  __schedule+0x7c7/0x1930
[Fri Jun 27 15:21:34 2025]  ? __pfx___schedule+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __mod_timer+0x304/0xa80
[Fri Jun 27 15:21:34 2025]  schedule+0x6a/0x180
[Fri Jun 27 15:21:34 2025]  schedule_timeout+0xfb/0x230
[Fri Jun 27 15:21:34 2025]  ? __pfx_schedule_timeout+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __pfx_process_timeout+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? down_write+0xc4/0x140
[Fri Jun 27 15:21:34 2025]  msleep_interruptible+0xbe/0x150
[Fri Jun 27 15:21:34 2025]  read_dummy_rwsem_write+0x54/0x90
[Fri Jun 27 15:21:34 2025]  full_proxy_read+0xff/0x1c0
[Fri Jun 27 15:21:34 2025]  ? rw_verify_area+0x6d/0x410
[Fri Jun 27 15:21:34 2025]  vfs_read+0x177/0xa50
[Fri Jun 27 15:21:34 2025]  ? __pfx_vfs_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? fdget_pos+0x1cf/0x4c0
[Fri Jun 27 15:21:34 2025]  ksys_read+0xfc/0x1d0
[Fri Jun 27 15:21:34 2025]  ? __pfx_ksys_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  do_syscall_64+0x66/0x2d0
[Fri Jun 27 15:21:34 2025]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f8f288efb40
[Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffffb631038 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f8f288efb40
[Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 000000002a4b5000 RDI: 0000000000000003
[Fri Jun 27 15:21:34 2025] RBP: 000000002a4b5000 R08: 0000000000000000 R09: 0000000000010fff
[Fri Jun 27 15:21:34 2025] R10: 00007ffffb630460 R11: 0000000000000246 R12: 000000002a4b5000
[Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[Fri Jun 27 15:21:34 2025]  </TASK>


This patch (of 3):

In preparation for extending blocker tracking to support rwsems, make the
rwsem_owner() and is_rwsem_reader_owned() helpers globally available for
determining if the blocker is a writer or one of the readers.

Additionally, a stale owner pointer in a reader-owned rwsem can lead to
false positives in blocker tracking when CONFIG_DETECT_HUNG_TASK_BLOCKER
is enabled.  To mitigate this, clear the owner field on the reader unlock
path, similar to what CONFIG_DEBUG_RWSEMS does.  A NULL owner is better
than a stale one for diagnostics.

Link: https://lkml.kernel.org/r/20250627072924.36567-1-lance.yang@linux.dev
Link: https://lkml.kernel.org/r/20250627072924.36567-2-lance.yang@linux.dev
Link: https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com/ [1]
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: John Stultz <jstultz@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Mingzhe Yang <mingzhe.yang@ly.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yongliang Gao <leonylgao@tencent.com>
Cc: Zi Li <zi.li@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rwsem.h  | 12 ++++++++++++
 kernel/locking/rwsem.c | 14 +++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index c8b543d428b0..544853bed5b9 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -132,6 +132,18 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
 	return !list_empty(&sem->wait_list);
 }
 
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
+/*
+ * Return just the real task structure pointer of the owner
+ */
+extern struct task_struct *rwsem_owner(struct rw_semaphore *sem);
+
+/*
+ * Return true if the rwsem is owned by a reader.
+ */
+extern bool is_rwsem_reader_owned(struct rw_semaphore *sem);
+#endif
+
 #else /* !CONFIG_PREEMPT_RT */
 
 #include <linux/rwbase_rt.h>
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2ddb827e3bea..a310eb9896de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -181,11 +181,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 	__rwsem_set_reader_owned(sem, current);
 }
 
-#ifdef CONFIG_DEBUG_RWSEMS
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
 /*
  * Return just the real task structure pointer of the owner
  */
-static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+struct task_struct *rwsem_owner(struct rw_semaphore *sem)
 {
 	return (struct task_struct *)
 		(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
@@ -194,7 +194,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
 /*
  * Return true if the rwsem is owned by a reader.
  */
-static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+bool is_rwsem_reader_owned(struct rw_semaphore *sem)
 {
 	/*
 	 * Check the count to see if it is write-locked.
@@ -207,10 +207,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
 }
 
 /*
- * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
- * is a task pointer in owner of a reader-owned rwsem, it will be the
- * real owner or one of the real owners. The only exception is when the
- * unlock is done by up_read_non_owner().
+ * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
+ * it will make sure that the owner field of a reader-owned rwsem either
+ * points to a real reader-owner(s) or gets cleared. The only exception is
+ * when the unlock is done by up_read_non_owner().
  */
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {

From 77da18de55ac6417e48905bec8b3c66f023b15a9 Mon Sep 17 00:00:00 2001
From: Lance Yang <lance.yang@linux.dev>
Date: Fri, 27 Jun 2025 15:29:23 +0800
Subject: [PATCH 47/80] hung_task: extend hung task blocker tracking to rwsems

Inspired by mutex blocker tracking[1], and having already extended it to
semaphores, let's now add support for reader-writer semaphores (rwsems).

The approach is simple: when a task enters TASK_UNINTERRUPTIBLE while
waiting for an rwsem, we just call hung_task_set_blocker().  The hung task
detector can then query the rwsem's owner to identify the lock holder.

Tracking works reliably for writers, as there can only be a single writer
holding the lock, and its task struct is stored in the owner field.

The main challenge lies with readers.  The owner field points to only one
of many concurrent readers, so we might lose track of the blocker if that
specific reader unlocks, even while others remain.  This is not a
significant issue, however.  In practice, long-lasting lock contention is
almost always caused by a writer.  Therefore, reliably tracking the writer
is the primary goal of this patch series ;)

With this change, the hung task detector can now show blocker task's info
like below:

[Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked for more than 122 seconds.
[Fri Jun 27 15:21:34 2025]       Tainted: G S                  6.16.0-rc3 #8
[Fri Jun 27 15:21:34 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Fri Jun 27 15:21:34 2025] task:cat             state:D stack:0     pid:28631 tgid:28631 ppid:28501  task_flags:0x400000 flags:0x00004000
[Fri Jun 27 15:21:34 2025] Call Trace:
[Fri Jun 27 15:21:34 2025]  <TASK>
[Fri Jun 27 15:21:34 2025]  __schedule+0x7c7/0x1930
[Fri Jun 27 15:21:34 2025]  ? __pfx___schedule+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? policy_nodemask+0x215/0x340
[Fri Jun 27 15:21:34 2025]  ? _raw_spin_lock_irq+0x8a/0xe0
[Fri Jun 27 15:21:34 2025]  ? __pfx__raw_spin_lock_irq+0x10/0x10
[Fri Jun 27 15:21:34 2025]  schedule+0x6a/0x180
[Fri Jun 27 15:21:34 2025]  schedule_preempt_disabled+0x15/0x30
[Fri Jun 27 15:21:34 2025]  rwsem_down_read_slowpath+0x55e/0xe10
[Fri Jun 27 15:21:34 2025]  ? __pfx_rwsem_down_read_slowpath+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __pfx___might_resched+0x10/0x10
[Fri Jun 27 15:21:34 2025]  down_read+0xc9/0x230
[Fri Jun 27 15:21:34 2025]  ? __pfx_down_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __debugfs_file_get+0x14d/0x700
[Fri Jun 27 15:21:34 2025]  ? __pfx___debugfs_file_get+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? handle_pte_fault+0x52a/0x710
[Fri Jun 27 15:21:34 2025]  ? selinux_file_permission+0x3a9/0x590
[Fri Jun 27 15:21:34 2025]  read_dummy_rwsem_read+0x4a/0x90
[Fri Jun 27 15:21:34 2025]  full_proxy_read+0xff/0x1c0
[Fri Jun 27 15:21:34 2025]  ? rw_verify_area+0x6d/0x410
[Fri Jun 27 15:21:34 2025]  vfs_read+0x177/0xa50
[Fri Jun 27 15:21:34 2025]  ? __pfx_vfs_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? fdget_pos+0x1cf/0x4c0
[Fri Jun 27 15:21:34 2025]  ksys_read+0xfc/0x1d0
[Fri Jun 27 15:21:34 2025]  ? __pfx_ksys_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  do_syscall_64+0x66/0x2d0
[Fri Jun 27 15:21:34 2025]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f3f8faefb40
[Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffdeda5ab98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f3f8faefb40
[Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 00000000010fa000 RDI: 0000000000000003
[Fri Jun 27 15:21:34 2025] RBP: 00000000010fa000 R08: 0000000000000000 R09: 0000000000010fff
[Fri Jun 27 15:21:34 2025] R10: 00007ffdeda59fe0 R11: 0000000000000246 R12: 00000000010fa000
[Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[Fri Jun 27 15:21:34 2025]  </TASK>
[Fri Jun 27 15:21:34 2025] INFO: task cat:28631 <reader> blocked on an rw-semaphore likely owned by task cat:28630 <writer>
[Fri Jun 27 15:21:34 2025] task:cat             state:S stack:0     pid:28630 tgid:28630 ppid:28501  task_flags:0x400000 flags:0x00004000
[Fri Jun 27 15:21:34 2025] Call Trace:
[Fri Jun 27 15:21:34 2025]  <TASK>
[Fri Jun 27 15:21:34 2025]  __schedule+0x7c7/0x1930
[Fri Jun 27 15:21:34 2025]  ? __pfx___schedule+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __mod_timer+0x304/0xa80
[Fri Jun 27 15:21:34 2025]  schedule+0x6a/0x180
[Fri Jun 27 15:21:34 2025]  schedule_timeout+0xfb/0x230
[Fri Jun 27 15:21:34 2025]  ? __pfx_schedule_timeout+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __pfx_process_timeout+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? down_write+0xc4/0x140
[Fri Jun 27 15:21:34 2025]  msleep_interruptible+0xbe/0x150
[Fri Jun 27 15:21:34 2025]  read_dummy_rwsem_write+0x54/0x90
[Fri Jun 27 15:21:34 2025]  full_proxy_read+0xff/0x1c0
[Fri Jun 27 15:21:34 2025]  ? rw_verify_area+0x6d/0x410
[Fri Jun 27 15:21:34 2025]  vfs_read+0x177/0xa50
[Fri Jun 27 15:21:34 2025]  ? __pfx_vfs_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? fdget_pos+0x1cf/0x4c0
[Fri Jun 27 15:21:34 2025]  ksys_read+0xfc/0x1d0
[Fri Jun 27 15:21:34 2025]  ? __pfx_ksys_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  do_syscall_64+0x66/0x2d0
[Fri Jun 27 15:21:34 2025]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f8f288efb40
[Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffffb631038 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f8f288efb40
[Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 000000002a4b5000 RDI: 0000000000000003
[Fri Jun 27 15:21:34 2025] RBP: 000000002a4b5000 R08: 0000000000000000 R09: 0000000000010fff
[Fri Jun 27 15:21:34 2025] R10: 00007ffffb630460 R11: 0000000000000246 R12: 000000002a4b5000
[Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[Fri Jun 27 15:21:34 2025]  </TASK>

[1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com/

Link: https://lkml.kernel.org/r/20250627072924.36567-3-lance.yang@linux.dev
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Suggested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: John Stultz <jstultz@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Mingzhe Yang <mingzhe.yang@ly.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yongliang Gao <leonylgao@tencent.com>
Cc: Zi Li <zi.li@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hung_task.h | 18 +++++++++---------
 kernel/hung_task.c        | 29 +++++++++++++++++++++++++----
 kernel/locking/rwsem.c    | 17 ++++++++++++++++-
 3 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h
index 1bc2b3244613..34e615c76ca5 100644
--- a/include/linux/hung_task.h
+++ b/include/linux/hung_task.h
@@ -21,17 +21,17 @@
  * type.
  *
  * Type encoding:
- * 00 - Blocked on mutex        (BLOCKER_TYPE_MUTEX)
- * 01 - Blocked on semaphore    (BLOCKER_TYPE_SEM)
- * 10 - Blocked on rt-mutex     (BLOCKER_TYPE_RTMUTEX)
- * 11 - Blocked on rw-semaphore (BLOCKER_TYPE_RWSEM)
+ * 00 - Blocked on mutex			(BLOCKER_TYPE_MUTEX)
+ * 01 - Blocked on semaphore			(BLOCKER_TYPE_SEM)
+ * 10 - Blocked on rw-semaphore as READER	(BLOCKER_TYPE_RWSEM_READER)
+ * 11 - Blocked on rw-semaphore as WRITER	(BLOCKER_TYPE_RWSEM_WRITER)
  */
-#define BLOCKER_TYPE_MUTEX      0x00UL
-#define BLOCKER_TYPE_SEM        0x01UL
-#define BLOCKER_TYPE_RTMUTEX    0x02UL
-#define BLOCKER_TYPE_RWSEM      0x03UL
+#define BLOCKER_TYPE_MUTEX		0x00UL
+#define BLOCKER_TYPE_SEM		0x01UL
+#define BLOCKER_TYPE_RWSEM_READER	0x02UL
+#define BLOCKER_TYPE_RWSEM_WRITER	0x03UL
 
-#define BLOCKER_TYPE_MASK       0x03UL
+#define BLOCKER_TYPE_MASK		0x03UL
 
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 static inline void hung_task_set_blocker(void *lock, unsigned long type)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d2432df2b905..8708a1205f82 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -23,6 +23,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/sysctl.h>
 #include <linux/hung_task.h>
+#include <linux/rwsem.h>
 
 #include <trace/events/sched.h>
 
@@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task)
 {
 	struct task_struct *g, *t;
 	unsigned long owner, blocker, blocker_type;
+	const char *rwsem_blocked_by, *rwsem_blocked_as;
 
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
 
@@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task)
 
 	switch (blocker_type) {
 	case BLOCKER_TYPE_MUTEX:
-		owner = mutex_get_owner(
-			(struct mutex *)hung_task_blocker_to_lock(blocker));
+		owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
 		break;
 	case BLOCKER_TYPE_SEM:
-		owner = sem_last_holder(
-			(struct semaphore *)hung_task_blocker_to_lock(blocker));
+		owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
+		break;
+	case BLOCKER_TYPE_RWSEM_READER:
+	case BLOCKER_TYPE_RWSEM_WRITER:
+		owner = (unsigned long)rwsem_owner(
+					hung_task_blocker_to_lock(blocker));
+		rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
+					"reader" : "writer";
+		rwsem_blocked_by = is_rwsem_reader_owned(
+					hung_task_blocker_to_lock(blocker)) ?
+					"reader" : "writer";
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task)
 			pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
 			       task->comm, task->pid);
 			break;
+		case BLOCKER_TYPE_RWSEM_READER:
+		case BLOCKER_TYPE_RWSEM_WRITER:
+			pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
+			       task->comm, task->pid);
+			break;
 		}
 		return;
 	}
@@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task)
 			pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
 			       task->comm, task->pid, t->comm, t->pid);
 			break;
+		case BLOCKER_TYPE_RWSEM_READER:
+		case BLOCKER_TYPE_RWSEM_WRITER:
+			pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
+			       task->comm, task->pid, rwsem_blocked_as, t->comm,
+			       t->pid, rwsem_blocked_by);
+			break;
 		}
 		sched_show_task(t);
 		return;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index a310eb9896de..92c6332da401 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,7 @@
 #include <linux/export.h>
 #include <linux/rwsem.h>
 #include <linux/atomic.h>
+#include <linux/hung_task.h>
 #include <trace/events/lock.h>
 
 #ifndef CONFIG_PREEMPT_RT
@@ -1065,10 +1066,13 @@ queue:
 		wake_up_q(&wake_q);
 
 	trace_contention_begin(sem, LCB_F_READ);
+	set_current_state(state);
+
+	if (state == TASK_UNINTERRUPTIBLE)
+		hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
 
 	/* wait to be given the lock */
 	for (;;) {
-		set_current_state(state);
 		if (!smp_load_acquire(&waiter.task)) {
 			/* Matches rwsem_mark_wake()'s smp_store_release(). */
 			break;
@@ -1083,8 +1087,12 @@ queue:
 		}
 		schedule_preempt_disabled();
 		lockevent_inc(rwsem_sleep_reader);
+		set_current_state(state);
 	}
 
+	if (state == TASK_UNINTERRUPTIBLE)
+		hung_task_clear_blocker();
+
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock);
 	trace_contention_end(sem, 0);
@@ -1146,6 +1154,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	set_current_state(state);
 	trace_contention_begin(sem, LCB_F_WRITE);
 
+	if (state == TASK_UNINTERRUPTIBLE)
+		hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+
 	for (;;) {
 		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1179,6 +1190,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 trylock_again:
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
+
+	if (state == TASK_UNINTERRUPTIBLE)
+		hung_task_clear_blocker();
+
 	__set_current_state(TASK_RUNNING);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);

From 4efec6c0919d9081a52be50d5b5bfa32bf489c75 Mon Sep 17 00:00:00 2001
From: Zi Li <zi.li@linux.dev>
Date: Fri, 27 Jun 2025 15:29:24 +0800
Subject: [PATCH 48/80] samples: enhance hung_task detector test with
 read-write semaphore support

Extend the hung_task detector test module to include read-write semaphore
support alongside existing mutex and semaphore tests.  This module now
creates additional debugfs files under <debugfs>/hung_task, namely
'rw_semaphore_read' and 'rw_semaphore_write', in addition to 'mutex' and
'semaphore'.  Reading these files with multiple processes triggers a
prolonged sleep (256 seconds) while holding the respective lock, enabling
hung_task detector testing for various locking mechanisms.

This change builds on the extensible hung_task_tests module, adding
read-write semaphore functionality to improve test coverage for kernel
locking primitives.  The implementation ensures proper lock handling and
includes checks to prevent redundant data reads.

Usage is:

        > cd /sys/kernel/debug/hung_task
        > cat mutex & cat mutex           # Test mutex blocking
        > cat semaphore & cat semaphore   # Test semaphore blocking
        > cat rw_semaphore_write \
            & cat rw_semaphore_read       # Test rwsem blocking
        > cat rw_semaphore_write \
            & cat rw_semaphore_write      # Test rwsem blocking

Update the Kconfig description to reflect the addition of read-write
semaphore debugfs files.

Link: https://lkml.kernel.org/r/20250627072924.36567-4-lance.yang@linux.dev
Signed-off-by: Zi Li <zi.li@linux.dev>
Suggested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: John Stultz <jstultz@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Mingzhe Yang <mingzhe.yang@ly.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yongliang Gao <leonylgao@tencent.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 samples/Kconfig                     |  7 ++-
 samples/hung_task/hung_task_tests.c | 81 ++++++++++++++++++++++++++---
 2 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/samples/Kconfig b/samples/Kconfig
index ffef99950206..a8880c62d4c8 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -316,10 +316,9 @@ config SAMPLE_HUNG_TASK
 	depends on DETECT_HUNG_TASK && DEBUG_FS
 	help
 	  Build a module that provides debugfs files (e.g., mutex, semaphore,
-	  etc.) under <debugfs>/hung_task. If user reads one of these files,
-	  it will sleep long time (256 seconds) with holding a lock. Thus,
-	  if 2 or more processes read the same file concurrently, it will
-	  be detected by the hung_task watchdog.
+	  rw_semaphore_read, rw_semaphore_write) under <debugfs>/hung_task.
+	  Reading these files with multiple processes triggers hung task
+	  detection by holding locks for a long time (256 seconds).
 
 source "samples/rust/Kconfig"
 
diff --git a/samples/hung_task/hung_task_tests.c b/samples/hung_task/hung_task_tests.c
index a5c09bd3a47d..0360ec916890 100644
--- a/samples/hung_task/hung_task_tests.c
+++ b/samples/hung_task/hung_task_tests.c
@@ -4,11 +4,12 @@
  * semaphore, etc.
  *
  * Usage: Load this module and read `<debugfs>/hung_task/mutex`,
- *        `<debugfs>/hung_task/semaphore`, etc., with 2 or more processes.
+ *        `<debugfs>/hung_task/semaphore`, `<debugfs>/hung_task/rw_semaphore_read`,
+ *        `<debugfs>/hung_task/rw_semaphore_write`, etc., with 2 or more processes.
  *
  * This is for testing kernel hung_task error messages with various locking
- * mechanisms (e.g., mutex, semaphore, etc.). Note that this may freeze
- * your system or cause a panic. Use only for testing purposes.
+ * mechanisms (e.g., mutex, semaphore, rw_semaphore_read, rw_semaphore_write, etc.).
+ * Note that this may freeze your system or cause a panic. Use only for testing purposes.
  */
 
 #include <linux/debugfs.h>
@@ -17,21 +18,29 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/semaphore.h>
+#include <linux/rwsem.h>
 
-#define HUNG_TASK_DIR		"hung_task"
-#define HUNG_TASK_MUTEX_FILE	"mutex"
-#define HUNG_TASK_SEM_FILE	"semaphore"
-#define SLEEP_SECOND		256
+#define HUNG_TASK_DIR			"hung_task"
+#define HUNG_TASK_MUTEX_FILE		"mutex"
+#define HUNG_TASK_SEM_FILE		"semaphore"
+#define HUNG_TASK_RWSEM_READ_FILE	"rw_semaphore_read"
+#define HUNG_TASK_RWSEM_WRITE_FILE	"rw_semaphore_write"
+#define SLEEP_SECOND			256
 
 static const char dummy_string[] = "This is a dummy string.";
 static DEFINE_MUTEX(dummy_mutex);
 static DEFINE_SEMAPHORE(dummy_sem, 1);
+static DECLARE_RWSEM(dummy_rwsem);
 static struct dentry *hung_task_dir;
 
 /* Mutex-based read function */
 static ssize_t read_dummy_mutex(struct file *file, char __user *user_buf,
 				size_t count, loff_t *ppos)
 {
+	/* Check if data is already read */
+	if (*ppos >= sizeof(dummy_string))
+		return 0;
+
 	/* Second task waits on mutex, entering uninterruptible sleep */
 	guard(mutex)(&dummy_mutex);
 
@@ -46,6 +55,10 @@ static ssize_t read_dummy_mutex(struct file *file, char __user *user_buf,
 static ssize_t read_dummy_semaphore(struct file *file, char __user *user_buf,
 				    size_t count, loff_t *ppos)
 {
+	/* Check if data is already read */
+	if (*ppos >= sizeof(dummy_string))
+		return 0;
+
 	/* Second task waits on semaphore, entering uninterruptible sleep */
 	down(&dummy_sem);
 
@@ -58,6 +71,46 @@ static ssize_t read_dummy_semaphore(struct file *file, char __user *user_buf,
 				       sizeof(dummy_string));
 }
 
+/* Read-write semaphore read function */
+static ssize_t read_dummy_rwsem_read(struct file *file, char __user *user_buf,
+				     size_t count, loff_t *ppos)
+{
+	/* Check if data is already read */
+	if (*ppos >= sizeof(dummy_string))
+		return 0;
+
+	/* Acquires read lock, allowing concurrent readers but blocks if write lock is held */
+	down_read(&dummy_rwsem);
+
+	/* Sleeps here, potentially triggering hung task detection if lock is held too long */
+	msleep_interruptible(SLEEP_SECOND * 1000);
+
+	up_read(&dummy_rwsem);
+
+	return simple_read_from_buffer(user_buf, count, ppos, dummy_string,
+				       sizeof(dummy_string));
+}
+
+/* Read-write semaphore write function */
+static ssize_t read_dummy_rwsem_write(struct file *file, char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	/* Check if data is already read */
+	if (*ppos >= sizeof(dummy_string))
+		return 0;
+
+	/* Acquires exclusive write lock, blocking all other readers and writers */
+	down_write(&dummy_rwsem);
+
+	/* Sleeps here, potentially triggering hung task detection if lock is held too long */
+	msleep_interruptible(SLEEP_SECOND * 1000);
+
+	up_write(&dummy_rwsem);
+
+	return simple_read_from_buffer(user_buf, count, ppos, dummy_string,
+				       sizeof(dummy_string));
+}
+
 /* File operations for mutex */
 static const struct file_operations hung_task_mutex_fops = {
 	.read = read_dummy_mutex,
@@ -68,6 +121,16 @@ static const struct file_operations hung_task_sem_fops = {
 	.read = read_dummy_semaphore,
 };
 
+/* File operations for rw_semaphore read */
+static const struct file_operations hung_task_rwsem_read_fops = {
+	.read = read_dummy_rwsem_read,
+};
+
+/* File operations for rw_semaphore write */
+static const struct file_operations hung_task_rwsem_write_fops = {
+	.read = read_dummy_rwsem_write,
+};
+
 static int __init hung_task_tests_init(void)
 {
 	hung_task_dir = debugfs_create_dir(HUNG_TASK_DIR, NULL);
@@ -79,6 +142,10 @@ static int __init hung_task_tests_init(void)
 			    &hung_task_mutex_fops);
 	debugfs_create_file(HUNG_TASK_SEM_FILE, 0400, hung_task_dir, NULL,
 			    &hung_task_sem_fops);
+	debugfs_create_file(HUNG_TASK_RWSEM_READ_FILE, 0400, hung_task_dir, NULL,
+			    &hung_task_rwsem_read_fops);
+	debugfs_create_file(HUNG_TASK_RWSEM_WRITE_FILE, 0400, hung_task_dir, NULL,
+			    &hung_task_rwsem_write_fops);
 
 	return 0;
 }

From 98aa4d5d242d3a73388271e00e11ce91d8c3c3e1 Mon Sep 17 00:00:00 2001
From: Lillian Berry <lillian@star-ark.net>
Date: Mon, 7 Jul 2025 09:14:11 +0000
Subject: [PATCH 49/80] init/main.c: add warning when file specified in rdinit
 is inaccessible

Avoid silently ignoring the initramfs when the file specified in rdinit is
not usable.  This prints an error that clearly explains the issue (file
was not found, vs initramfs was not found).

Link: https://lkml.kernel.org/r/20250707091411.1412681-1-lillian@star-ark.net
Signed-off-by: Lillian Berry <lillian@star-ark.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 init/main.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/init/main.c b/init/main.c
index 225a58279acd..e47984871775 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1592,7 +1592,11 @@ static noinline void __init kernel_init_freeable(void)
 	 * check if there is an early userspace init.  If yes, let it do all
 	 * the work
 	 */
-	if (init_eaccess(ramdisk_execute_command) != 0) {
+	int ramdisk_command_access;
+	ramdisk_command_access = init_eaccess(ramdisk_execute_command);
+	if (ramdisk_command_access != 0) {
+		pr_warn("check access for rdinit=%s failed: %i, ignoring\n",
+			ramdisk_execute_command, ramdisk_command_access);
 		ramdisk_execute_command = NULL;
 		prepare_namespace();
 	}

From 988f451ecb17c359cb34e360fce82039942de7bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ahelenia=20Ziemia=C5=84ska?=
 <nabijaczleweli@nabijaczleweli.xyz>
Date: Thu, 3 Jul 2025 20:21:27 +0200
Subject: [PATCH 50/80] ocfs2/dlm: fix "take a while" typo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmrecovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 67fc62a49a76..00f52812dbb0 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2632,7 +2632,7 @@ again:
 					 dlm_reco_master_ready(dlm),
 					 msecs_to_jiffies(1000));
 		if (!dlm_reco_master_ready(dlm)) {
-			mlog(0, "%s: reco master taking awhile\n",
+			mlog(0, "%s: reco master taking a while\n",
 			     dlm->name);
 			goto again;
 		}

From 44acc46d182ff36d40cea69db3875440fab72ba5 Mon Sep 17 00:00:00 2001
From: Ivan Pravdin <ipravdin.official@gmail.com>
Date: Mon, 7 Jul 2025 20:10:09 -0400
Subject: [PATCH 51/80] ocfs2: avoid NULL pointer dereference in
 dx_dir_lookup_rec()

When a directory entry is not found, ocfs2_dx_dir_lookup_rec() prints an
error message that unconditionally dereferences the 'rec' pointer.
However, if 'rec' is NULL, this leads to a NULL pointer dereference and a
kernel panic.

Add an explicit check empty extent list to avoid dereferencing NULL
'rec' pointer.

Link: https://lkml.kernel.org/r/20250708001009.372263-1-ipravdin.official@gmail.com
Reported-by: syzbot+20282c1b2184a857ac4c@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67cd7e29.050a0220.e1a89.0007.GAE@google.com/
Signed-off-by: Ivan Pravdin <ipravdin.official@gmail.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/dir.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 7799f4d16ce9..8c9c4825f984 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -798,6 +798,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
 		}
 	}
 
+	if (le16_to_cpu(el->l_next_free_rec) == 0) {
+		ret = ocfs2_error(inode->i_sb,
+				  "Inode %lu has empty extent list at depth %u\n",
+				  inode->i_ino,
+				  le16_to_cpu(el->l_tree_depth));
+		goto out;
+	}
+
 	found = 0;
 	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
 		rec = &el->l_recs[i];

From c0f98be69f4b550b19f9517157a30f33877bb14d Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Tue, 8 Jul 2025 12:49:00 +0100
Subject: [PATCH 52/80] squashfs: replace ;; with ; and end of fi declaration

There is an extraneous ; after a declaration, remove it.

Link: https://lkml.kernel.org/r/20250708114900.1883130-1-colin.i.king@gmail.com
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Reviewed-by: Phillip Lougher <phillip@squashfs.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 296c5a0fcc40..b3ae3b1cc0e5 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -83,7 +83,7 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
 	struct folio *head_to_cache = NULL, *tail_to_cache = NULL;
 	struct block_device *bdev = fullbio->bi_bdev;
 	int start_idx = 0, end_idx = 0;
-	struct folio_iter fi;;
+	struct folio_iter fi;
 	struct bio *bio = NULL;
 	int idx = 0;
 	int err = 0;

From 97103dcec292b8688de142f7a48bd0d46038d3f6 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Tue, 8 Jul 2025 15:26:04 +0100
Subject: [PATCH 53/80] squashfs: fix incorrect argument to sizeof in
 kmalloc_array call

The sizeof(void *) is the incorrect argument in the kmalloc_array call, it
best to fix this by using sizeof(*cache_folios) instead.

Fortunately the sizes of void* and folio* happen to be the same, so this
has not shown up as a run time issue.

[akpm@linux-foundation.org: fix build]
Link: https://lkml.kernel.org/r/20250708142604.1891156-1-colin.i.king@gmail.com
Fixes: 2e227ff5e272 ("squashfs: add optional full compressed block caching")
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Cc: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Chanho Min <chanho.min@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/squashfs/block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index b3ae3b1cc0e5..b69c294e3ef0 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -89,7 +89,7 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
 	int err = 0;
 #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
 	struct folio **cache_folios = kmalloc_array(page_count,
-			sizeof(void *), GFP_KERNEL | __GFP_ZERO);
+			sizeof(*cache_folios), GFP_KERNEL | __GFP_ZERO);
 #endif
 
 	bio_for_each_folio_all(fi, fullbio) {

From 08eabe4b9e98d940d2dd6cdb70c7a9187ca54aca Mon Sep 17 00:00:00 2001
From: Ivan Pravdin <ipravdin.official@gmail.com>
Date: Mon, 7 Jul 2025 22:06:40 -0400
Subject: [PATCH 54/80] ocfs2: avoid potential ABBA deadlock by reordering
 tl_inode lock

In ocfs2_move_extent(), tl_inode is currently locked after the global
bitmap inode.  However, in ocfs2_flush_truncate_log(), the lock order is
reversed: tl_inode is locked first, followed by the global bitmap inode.

This creates a classic ABBA deadlock scenario if two threads attempt these
operations concurrently and acquire the locks in different orders.

To prevent this, move the tl_inode locking earlier in ocfs2_move_extent(),
so that it always precedes the global bitmap inode lock.

No functional changes beyond lock ordering.

Link: https://lkml.kernel.org/r/20250708020640.387741-1-ipravdin.official@gmail.com
Reported-by: syzbot+6bf948e47f9bac7aacfa@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67d5645c.050a0220.1dc86f.0004.GAE@google.com/
Signed-off-by: Ivan Pravdin <ipravdin.official@gmail.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/move_extents.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 369c7d27befd..cbe2f8ed8897 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -617,6 +617,8 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 	 */
 	credits += OCFS2_INODE_UPDATE_CREDITS + 1;
 
+	inode_lock(tl_inode);
+
 	/*
 	 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
 	 * logic, while we still need to lock the global_bitmap.
@@ -626,7 +628,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 	if (!gb_inode) {
 		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
 		ret = -EIO;
-		goto out;
+		goto out_unlock_tl_inode;
 	}
 
 	inode_lock(gb_inode);
@@ -634,16 +636,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_unlock_gb_mutex;
+		goto out_unlock_gb_inode;
 	}
 
-	inode_lock(tl_inode);
-
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto out_unlock_tl_inode;
+		goto out_unlock;
 	}
 
 	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
@@ -703,15 +703,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 	brelse(gd_bh);
-
-out_unlock_tl_inode:
-	inode_unlock(tl_inode);
-
+out_unlock:
 	ocfs2_inode_unlock(gb_inode, 1);
-out_unlock_gb_mutex:
+out_unlock_gb_inode:
 	inode_unlock(gb_inode);
 	brelse(gb_bh);
 	iput(gb_inode);
+out_unlock_tl_inode:
+	inode_unlock(tl_inode);
 
 out:
 	if (context->meta_ac) {

From b3d5fd6f82dde8c906dc2a587003a44252ae5eae Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Fri, 6 Jun 2025 21:47:56 +0800
Subject: [PATCH 55/80] lib/math/gcd: use static key to select implementation
 at runtime

Patch series "Optimize GCD performance on RISC-V by selecting
implementation at runtime", v3.

The current implementation of gcd() selects between the binary GCD and the
odd-even GCD algorithm at compile time, depending on whether
CONFIG_CPU_NO_EFFICIENT_FFS is set.  On platforms like RISC-V, however,
this compile-time decision can be misleading: even when the compiler emits
ctz instructions based on the assumption that they are efficient (as is
the case when CONFIG_RISCV_ISA_ZBB is enabled), the actual hardware may
lack support for the Zbb extension.  In such cases, ffs() falls back to a
software implementation at runtime, making the binary GCD algorithm
significantly slower than the odd-even variant.

To address this, we introduce a static key to allow runtime selection
between the binary and odd-even GCD implementations.  On RISC-V, the
kernel now checks for Zbb support during boot.  If Zbb is unavailable, the
static key is disabled so that gcd() consistently uses the more efficient
odd-even algorithm in that scenario.  Additionally, to further reduce code
size, we select CONFIG_CPU_NO_EFFICIENT_FFS automatically when
CONFIG_RISCV_ISA_ZBB is not enabled, avoiding compilation of the unused
binary GCD implementation entirely on systems where it would never be
executed.

This series ensures that the most efficient GCD algorithm is used in
practice and avoids compiling unnecessary code based on hardware
capabilities and kernel configuration.


This patch (of 3):

On platforms like RISC-V, the compiler may generate hardware FFS
instructions even if the underlying CPU does not actually support them.
Currently, the GCD implementation is chosen at compile time based on
CONFIG_CPU_NO_EFFICIENT_FFS, which can result in suboptimal behavior on
such systems.

Introduce a static key, efficient_ffs_key, to enable runtime selection
between the binary GCD (using ffs) and the odd-even GCD implementation.
This allows the kernel to default to the faster binary GCD when FFS is
efficient, while retaining the ability to fall back when needed.

Link: https://lkml.kernel.org/r/20250606134758.1308400-1-visitorckw@gmail.com
Link: https://lkml.kernel.org/r/20250606134758.1308400-2-visitorckw@gmail.com
Co-developed-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gcd.h |  3 +++
 lib/math/gcd.c      | 27 +++++++++++++++------------
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/include/linux/gcd.h b/include/linux/gcd.h
index cb572677fd7f..616e81a7f7e3 100644
--- a/include/linux/gcd.h
+++ b/include/linux/gcd.h
@@ -3,6 +3,9 @@
 #define _GCD_H
 
 #include <linux/compiler.h>
+#include <linux/jump_label.h>
+
+DECLARE_STATIC_KEY_TRUE(efficient_ffs_key);
 
 unsigned long gcd(unsigned long a, unsigned long b) __attribute_const__;
 
diff --git a/lib/math/gcd.c b/lib/math/gcd.c
index e3b042214d1b..62efca6787ae 100644
--- a/lib/math/gcd.c
+++ b/lib/math/gcd.c
@@ -11,22 +11,16 @@
  * has decent hardware division.
  */
 
+DEFINE_STATIC_KEY_TRUE(efficient_ffs_key);
+
 #if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
 
 /* If __ffs is available, the even/odd algorithm benchmarks slower. */
 
-/**
- * gcd - calculate and return the greatest common divisor of 2 unsigned longs
- * @a: first value
- * @b: second value
- */
-unsigned long gcd(unsigned long a, unsigned long b)
+static unsigned long binary_gcd(unsigned long a, unsigned long b)
 {
 	unsigned long r = a | b;
 
-	if (!a || !b)
-		return r;
-
 	b >>= __ffs(b);
 	if (b == 1)
 		return r & -r;
@@ -44,9 +38,15 @@ unsigned long gcd(unsigned long a, unsigned long b)
 	}
 }
 
-#else
+#endif
 
 /* If normalization is done by loops, the even/odd algorithm is a win. */
+
+/**
+ * gcd - calculate and return the greatest common divisor of 2 unsigned longs
+ * @a: first value
+ * @b: second value
+ */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
 	unsigned long r = a | b;
@@ -54,6 +54,11 @@ unsigned long gcd(unsigned long a, unsigned long b)
 	if (!a || !b)
 		return r;
 
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+	if (static_branch_likely(&efficient_ffs_key))
+		return binary_gcd(a, b);
+#endif
+
 	/* Isolate lsbit of r */
 	r &= -r;
 
@@ -80,6 +85,4 @@ unsigned long gcd(unsigned long a, unsigned long b)
 	}
 }
 
-#endif
-
 EXPORT_SYMBOL_GPL(gcd);

From 26b537edc533058c48f6351569d676703d7d1af3 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Fri, 6 Jun 2025 21:47:57 +0800
Subject: [PATCH 56/80] riscv: optimize gcd() code size when
 CONFIG_RISCV_ISA_ZBB is disabled

The binary GCD implementation depends on efficient ffs(), which on RISC-V
requires hardware support for the Zbb extension.  When
CONFIG_RISCV_ISA_ZBB is not enabled, the kernel will never use binary GCD,
as runtime logic will always fall back to the odd-even implementation.

To avoid compiling unused code and reduce code size, select
CONFIG_CPU_NO_EFFICIENT_FFS when CONFIG_RISCV_ISA_ZBB is not set.

$ ./scripts/bloat-o-meter ./lib/math/gcd.o.old ./lib/math/gcd.o.new
add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-274 (-274)
Function                                     old     new   delta
gcd                                          360      86    -274
Total: Before=384, After=110, chg -71.35%

Link: https://lkml.kernel.org/r/20250606134758.1308400-3-visitorckw@gmail.com
Co-developed-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Acked-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index d71ea0f4466f..1736768426ec 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -97,6 +97,7 @@ config RISCV
 	select CLINT_TIMER if RISCV_M_MODE
 	select CLONE_BACKWARDS
 	select COMMON_CLK
+	select CPU_NO_EFFICIENT_FFS if !RISCV_ISA_ZBB
 	select CPU_PM if CPU_IDLE || HIBERNATION || SUSPEND
 	select EDAC_SUPPORT
 	select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE)

From 36e22416872114cae812cdcdd84a5b99ef30b3de Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Fri, 6 Jun 2025 21:47:58 +0800
Subject: [PATCH 57/80] riscv: optimize gcd() performance on RISC-V without Zbb
 extension

The binary GCD implementation uses FFS (find first set), which benefits
from hardware support for the ctz instruction, provided by the Zbb
extension on RISC-V.  Without Zbb, this results in slower
software-emulated behavior.

Previously, RISC-V always used the binary GCD, regardless of actual
hardware support.  This patch improves runtime efficiency by disabling the
efficient_ffs_key static branch when Zbb is either not enabled in the
kernel (config) or not supported on the executing CPU.  This selects the
odd-even GCD implementation, which is faster in the absence of efficient
FFS.

This change ensures the most suitable GCD algorithm is chosen dynamically
based on actual hardware capabilities.

Link: https://lkml.kernel.org/r/20250606134758.1308400-4-visitorckw@gmail.com
Co-developed-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Acked-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/kernel/setup.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 14888e5ea19a..f90cce7a3ace 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -21,6 +21,8 @@
 #include <linux/efi.h>
 #include <linux/crash_dump.h>
 #include <linux/panic_notifier.h>
+#include <linux/jump_label.h>
+#include <linux/gcd.h>
 
 #include <asm/acpi.h>
 #include <asm/alternative.h>
@@ -362,6 +364,9 @@ void __init setup_arch(char **cmdline_p)
 
 	riscv_user_isa_enable();
 	riscv_spinlock_init();
+
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB) || !riscv_isa_extension_available(NULL, ZBB))
+		static_branch_disable(&efficient_ffs_key);
 }
 
 bool arch_cpu_is_hotpluggable(int cpu)

From 813b46808822db6838c43e92ba21ce013d23fcdc Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Thu, 10 Jul 2025 21:04:12 +0800
Subject: [PATCH 58/80] selftests/thermal: remove duplicate sprintf() call in
 workload_hint_test

Remove redundant sprintf() call that was duplicating the same operation of
formatting delay_str with argv[1].

Link: https://lkml.kernel.org/r/6338CD0E839B770B+20250710130412.284531-1-wangyuli@uniontech.com
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Cc: Guan Wentao <guanwentao@uniontech.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../selftests/thermal/intel/workload_hint/workload_hint_test.c  | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
index a40097232967..bda006af8b1b 100644
--- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
+++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
@@ -67,8 +67,6 @@ int main(int argc, char **argv)
 		if (delay < 0)
 			exit(1);
 
-		sprintf(delay_str, "%s\n", argv[1]);
-
 		sprintf(delay_str, "%s\n", argv[1]);
 		fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR);
 		if (fd < 0) {

From 599579e857ab8d8f97409f631090e08018f8343b Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Thu, 10 Jul 2025 21:47:51 +0800
Subject: [PATCH 59/80] selftests/thermal: remove duplicate newlines in perror
 calls

perror() automatically appends a newline character, so the explicit '\n'
in the format strings is redundant and results in duplicate newlines in
the output.

Remove the redundant '\n' characters from perror() calls in
workload_hint_test.c to fix the formatting.

Link: https://lkml.kernel.org/r/F482FB1EC020000C+20250710134751.306096-1-wangyuli@uniontech.com
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Cc: Guan Wentao <guanwentao@uniontech.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../intel/workload_hint/workload_hint_test.c       | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
index bda006af8b1b..ba58589a1145 100644
--- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
+++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
@@ -32,12 +32,12 @@ void workload_hint_exit(int signum)
 
 	fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR);
 	if (fd < 0) {
-		perror("Unable to open workload type feature enable file\n");
+		perror("Unable to open workload type feature enable file");
 		exit(1);
 	}
 
 	if (write(fd, "0\n", 2) < 0) {
-		perror("Can't disable workload hints\n");
+		perror("Can't disable workload hints");
 		exit(1);
 	}
 
@@ -70,12 +70,12 @@ int main(int argc, char **argv)
 		sprintf(delay_str, "%s\n", argv[1]);
 		fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR);
 		if (fd < 0) {
-			perror("Unable to open workload notification delay\n");
+			perror("Unable to open workload notification delay");
 			exit(1);
 		}
 
 		if (write(fd, delay_str, strlen(delay_str)) < 0) {
-			perror("Can't set delay\n");
+			perror("Can't set delay");
 			exit(1);
 		}
 
@@ -92,12 +92,12 @@ int main(int argc, char **argv)
 	/* Enable feature via sysfs knob */
 	fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR);
 	if (fd < 0) {
-		perror("Unable to open workload type feature enable file\n");
+		perror("Unable to open workload type feature enable file");
 		exit(1);
 	}
 
 	if (write(fd, "1\n", 2) < 0) {
-		perror("Can't enable workload hints\n");
+		perror("Can't enable workload hints");
 		exit(1);
 	}
 
@@ -108,7 +108,7 @@ int main(int argc, char **argv)
 	while (1) {
 		fd = open(WORKLOAD_TYPE_INDEX_ATTRIBUTE, O_RDONLY);
 		if (fd < 0) {
-			perror("Unable to open workload type file\n");
+			perror("Unable to open workload type file");
 			exit(1);
 		}
 

From 6b47c9f8ee3960d4b46bda4de63429fdfe468989 Mon Sep 17 00:00:00 2001
From: Wang Yaxin <wang.yaxin@zte.com.cn>
Date: Thu, 10 Jul 2025 13:54:51 +0800
Subject: [PATCH 60/80] delaytop: add psi info to show system delay

Support showing whole delay of system by reading PSI, just like the first
few lines of information output by the top command.  the output of
delaytop includes both system-wide delay and delay of individual tasks,
providing a more comprehensive reflection of system latency status.

Use case
========
bash# ./delaytop
System Pressure Information: (avg10/avg60/avg300/total)
CPU:    full:    0.0%/   0.0%/   0.0%/0           some:    0.1%/   0.0%/   0.0%/14216596
Memory: full:    0.0%/   0.0%/   0.0%/34010659    some:    0.0%/   0.0%/   0.0%/35406492
IO:     full:    0.1%/   0.0%/   0.0%/51029453    some:    0.1%/   0.0%/   0.0%/55330465
IRQ:    full:    0.0%/   0.0%/   0.0%/0

Top 20 processes (sorted by CPU delay):

  PID   TGID  COMMAND            CPU(ms)  IO(ms)        SWAP(ms) RCL(ms) THR(ms)  CMP(ms)  WP(ms)  IRQ(ms)
---------------------------------------------------------------------------------------------
   32     32  kworker/2:0H-sy   23.65     0.00     0.00     0.00    0.00     0.00     0.00     0.00
  497    497  kworker/R-scsi_    1.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
  495    495  kworker/R-scsi_    1.13     0.00     0.00     0.00    0.00     0.00     0.00     0.00
  494    494  scsi_eh_0          1.12     0.00     0.00     0.00    0.00     0.00     0.00     0.00
  485    485  kworker/R-ata_s    0.90     0.00     0.00     0.00    0.00     0.00     0.00     0.00
  574    574  kworker/R-kdmfl    0.36     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   34     34  idle_inject/3      0.33     0.00     0.00     0.00    0.00     0.00     0.00     0.00
 1123   1123  nde-netfilter      0.28     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   60     60  ksoftirqd/7        0.25     0.00     0.00     0.00    0.00     0.00     0.00     0.00
  114    114  kworker/0:2-cgr    0.25     0.00     0.00     0.00    0.00     0.00     0.00     0.00
  496    496  scsi_eh_1          0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   51     51  cpuhp/6            0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
 1667   1667  atd                0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   45     45  cpuhp/5            0.23     0.00     0.00     0.00    0.00     0.00     0.00     0.00
 1102   1102  nde-backupservi    0.22     0.00     0.00     0.00    0.00     0.00     0.00     0.00
 1098   1098  systemsettings     0.21     0.00     0.00     0.00    0.00     0.00     0.00     0.00
 1100   1100  audit-monitor      0.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   53     53  migration/6        0.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
 1482   1482  sshd               0.19     0.00     0.00     0.00    0.00     0.00     0.00     0.00
   39     39  cpuhp/4            0.19     0.00     0.00     0.00    0.00     0.00     0.00     0.00

Link: https://lkml.kernel.org/r/20250710135451340_5pOgpIFi0M5AE7H44W1D@zte.com.cn
Co-developed-by: Fan Yu <fan.yu9@zte.com.cn>
Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Signed-off-by: Jiang Kun <jiang.kun2@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Peilin He <he.peilin@zte.com.cn>
Cc: Qiang Tu <tu.qiang35@zte.com.cn>
Cc: wangyong <wang.yong12@zte.com.cn>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Cc: Yunkai Zhang <zhang.yunkai@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/accounting/delaytop.c | 161 +++++++++++++++++++++++++++++++++---
 1 file changed, 148 insertions(+), 13 deletions(-)

diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c
index 23e38f39e97d..cd848af9a856 100644
--- a/tools/accounting/delaytop.c
+++ b/tools/accounting/delaytop.c
@@ -10,9 +10,9 @@
  * individual tasks (PIDs).
  *
  * Key features:
- *   - Collects per-task delay accounting statistics via taskstats.
- *   - Supports sorting, filtering.
- *   - Supports both interactive (screen refresh).
+ *	- Collects per-task delay accounting statistics via taskstats.
+ *	- Supports sorting, filtering.
+ *	- Supports both interactive (screen refresh).
  *
  * Copyright (C) Fan Yu, ZTE Corp. 2025
  * Copyright (C) Wang Yaxin, ZTE Corp. 2025
@@ -43,6 +43,14 @@
 #include <linux/cgroupstats.h>
 #include <ncurses.h>
 
+#define PSI_CPU_SOME "/proc/pressure/cpu"
+#define PSI_CPU_FULL	"/proc/pressure/cpu"
+#define PSI_MEMORY_SOME "/proc/pressure/memory"
+#define PSI_MEMORY_FULL "/proc/pressure/memory"
+#define PSI_IO_SOME "/proc/pressure/io"
+#define PSI_IO_FULL "/proc/pressure/io"
+#define PSI_IRQ_FULL	"/proc/pressure/irq"
+
 #define NLA_NEXT(na)			((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
 #define NLA_DATA(na)			((void *)((char *)(na) + NLA_HDRLEN))
 #define NLA_PAYLOAD(len)		(len - NLA_HDRLEN)
@@ -66,6 +74,24 @@ struct config {
 	char *container_path;	/* Path to container cgroup */
 };
 
+/* PSI statistics structure */
+struct psi_stats {
+	double cpu_some_avg10, cpu_some_avg60, cpu_some_avg300;
+	unsigned long long cpu_some_total;
+	double cpu_full_avg10, cpu_full_avg60, cpu_full_avg300;
+	unsigned long long cpu_full_total;
+	double memory_some_avg10, memory_some_avg60, memory_some_avg300;
+	unsigned long long memory_some_total;
+	double memory_full_avg10, memory_full_avg60, memory_full_avg300;
+	unsigned long long memory_full_total;
+	double io_some_avg10, io_some_avg60, io_some_avg300;
+	unsigned long long io_some_total;
+	double io_full_avg10, io_full_avg60, io_full_avg300;
+	unsigned long long io_full_total;
+	double irq_full_avg10, irq_full_avg60, irq_full_avg300;
+	unsigned long long irq_full_total;
+};
+
 /* Task delay information structure */
 struct task_info {
 	int pid;
@@ -100,6 +126,7 @@ struct container_stats {
 
 /* Global variables */
 static struct config cfg;
+static struct psi_stats psi;
 static struct task_info tasks[MAX_TASKS];
 static int task_count;
 static int running = 1;
@@ -130,13 +157,13 @@ static void usage(void)
 {
 	printf("Usage: delaytop [Options]\n"
 	"Options:\n"
-	"  -h, --help               Show this help message and exit\n"
-	"  -d, --delay=SECONDS      Set refresh interval (default: 2 seconds, min: 1)\n"
-	"  -n, --iterations=COUNT   Set number of updates (default: 0 = infinite)\n"
-	"  -P, --processes=NUMBER   Set maximum number of processes to show (default: 20, max: 1000)\n"
-	"  -o, --once               Display once and exit\n"
-	"  -p, --pid=PID            Monitor only the specified PID\n"
-	"  -C, --container=PATH     Monitor the container at specified cgroup path\n");
+	"  -h, --help				Show this help message and exit\n"
+	"  -d, --delay=SECONDS	  Set refresh interval (default: 2 seconds, min: 1)\n"
+	"  -n, --iterations=COUNT	Set number of updates (default: 0 = infinite)\n"
+	"  -P, --processes=NUMBER	Set maximum number of processes to show (default: 20, max: 1000)\n"
+	"  -o, --once				Display once and exit\n"
+	"  -p, --pid=PID			Monitor only the specified PID\n"
+	"  -C, --container=PATH	 Monitor the container at specified cgroup path\n");
 	exit(0);
 }
 
@@ -276,7 +303,7 @@ static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
 	memset(&nladdr, 0, sizeof(nladdr));
 	nladdr.nl_family = AF_NETLINK;
 	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
-				   sizeof(nladdr))) < buflen) {
+					sizeof(nladdr))) < buflen) {
 		if (r > 0) {
 			buf += r;
 			buflen -= r;
@@ -320,6 +347,89 @@ static int get_family_id(int sd)
 	return id;
 }
 
+static void read_psi_stats(void)
+{
+	FILE *fp;
+	char line[256];
+	int ret = 0;
+	/* Zero all fields */
+	memset(&psi, 0, sizeof(psi));
+	/* CPU pressure */
+	fp = fopen(PSI_CPU_SOME, "r");
+	if (fp) {
+		while (fgets(line, sizeof(line), fp)) {
+			if (strncmp(line, "some", 4) == 0) {
+				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
+							&psi.cpu_some_avg10, &psi.cpu_some_avg60,
+							&psi.cpu_some_avg300, &psi.cpu_some_total);
+				if (ret != 4)
+					fprintf(stderr, "Failed to parse CPU some PSI data\n");
+			} else if (strncmp(line, "full", 4) == 0) {
+				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
+						&psi.cpu_full_avg10, &psi.cpu_full_avg60,
+						&psi.cpu_full_avg300, &psi.cpu_full_total);
+				if (ret != 4)
+					fprintf(stderr, "Failed to parse CPU full PSI data\n");
+			}
+		}
+		fclose(fp);
+	}
+	/* Memory pressure */
+	fp = fopen(PSI_MEMORY_SOME, "r");
+	if (fp) {
+		while (fgets(line, sizeof(line), fp)) {
+			if (strncmp(line, "some", 4) == 0) {
+				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
+						&psi.memory_some_avg10, &psi.memory_some_avg60,
+						&psi.memory_some_avg300, &psi.memory_some_total);
+				if (ret != 4)
+					fprintf(stderr, "Failed to parse Memory some PSI data\n");
+			} else if (strncmp(line, "full", 4) == 0) {
+				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
+						&psi.memory_full_avg10, &psi.memory_full_avg60,
+						&psi.memory_full_avg300, &psi.memory_full_total);
+			}
+				if (ret != 4)
+					fprintf(stderr, "Failed to parse Memory full PSI data\n");
+		}
+		fclose(fp);
+	}
+	/* IO pressure */
+	fp = fopen(PSI_IO_SOME, "r");
+	if (fp) {
+		while (fgets(line, sizeof(line), fp)) {
+			if (strncmp(line, "some", 4) == 0) {
+				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
+						&psi.io_some_avg10, &psi.io_some_avg60,
+						&psi.io_some_avg300, &psi.io_some_total);
+				if (ret != 4)
+					fprintf(stderr, "Failed to parse IO some PSI data\n");
+			} else if (strncmp(line, "full", 4) == 0) {
+				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
+						&psi.io_full_avg10, &psi.io_full_avg60,
+						&psi.io_full_avg300, &psi.io_full_total);
+				if (ret != 4)
+					fprintf(stderr, "Failed to parse IO full PSI data\n");
+			}
+		}
+		fclose(fp);
+	}
+	/* IRQ pressure (only full) */
+	fp = fopen(PSI_IRQ_FULL, "r");
+	if (fp) {
+		while (fgets(line, sizeof(line), fp)) {
+			if (strncmp(line, "full", 4) == 0) {
+				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
+						&psi.irq_full_avg10, &psi.irq_full_avg60,
+						&psi.irq_full_avg300, &psi.irq_full_total);
+				if (ret != 4)
+					fprintf(stderr, "Failed to parse IRQ full PSI data\n");
+			}
+		}
+		fclose(fp);
+	}
+}
+
 static int read_comm(int pid, char *comm_buf, size_t buf_size)
 {
 	char path[64];
@@ -549,7 +659,29 @@ static void display_results(void)
 	FILE *out = stdout;
 
 	fprintf(out, "\033[H\033[J");
+	/* PSI output (one-line, no cat style) */
+	fprintf(out, "System Pressure Information: ");
+	fprintf(out, "(avg10/avg60/avg300/total)\n");
+	fprintf(out, "CPU:");
+	fprintf(out, "	full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu", psi.cpu_full_avg10,
+			psi.cpu_full_avg60, psi.cpu_full_avg300, psi.cpu_full_total);
+	fprintf(out, "  some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n", psi.cpu_some_avg10,
+			psi.cpu_some_avg60, psi.cpu_some_avg300, psi.cpu_some_total);
 
+	fprintf(out, "Memory:");
+	fprintf(out, " full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu", psi.memory_full_avg10,
+			psi.memory_full_avg60, psi.memory_full_avg300, psi.memory_full_total);
+	fprintf(out, "  some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n", psi.memory_some_avg10,
+			psi.memory_some_avg60, psi.memory_some_avg300, psi.memory_some_total);
+
+	fprintf(out, "IO:");
+	fprintf(out, "	full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu", psi.io_full_avg10,
+			psi.io_full_avg60, psi.io_full_avg300, psi.io_full_total);
+	fprintf(out, "  some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n", psi.io_some_avg10,
+			psi.io_some_avg60, psi.io_some_avg300, psi.io_some_total);
+	fprintf(out, "IRQ:");
+	fprintf(out, "	full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n\n", psi.irq_full_avg10,
+			psi.irq_full_avg60, psi.irq_full_avg300, psi.irq_full_total);
 	if (cfg.container_path) {
 		fprintf(out, "Container Information (%s):\n", cfg.container_path);
 		fprintf(out, "Processes: running=%d, sleeping=%d, ",
@@ -559,8 +691,8 @@ static void display_results(void)
 			container_stats.nr_io_wait);
 	}
 	fprintf(out, "Top %d processes (sorted by CPU delay):\n\n",
-		   cfg.max_processes);
-	fprintf(out, "  PID	TGID  COMMAND		 CPU(ms)  IO(ms)   ");
+			cfg.max_processes);
+	fprintf(out, "  PID	TGID  COMMAND		 CPU(ms)  IO(ms)	");
 	fprintf(out, "SWAP(ms) RCL(ms) THR(ms)  CMP(ms)  WP(ms)  IRQ(ms)\n");
 	fprintf(out, "-----------------------------------------------");
 	fprintf(out, "----------------------------------------------\n");
@@ -616,6 +748,9 @@ int main(int argc, char **argv)
 
 	/* Main loop */
 	while (running) {
+		/* Read PSI statistics */
+		read_psi_stats();
+
 		/* Get container stats if container path provided */
 		if (cfg.container_path)
 			get_container_stats();

From 320bf1709a473403840eba60c432a9f9b7d824d5 Mon Sep 17 00:00:00 2001
From: Wang Yaxin <wang.yaxin@zte.com.cn>
Date: Thu, 10 Jul 2025 13:51:41 +0800
Subject: [PATCH 61/80] docs: update docs after introducing delaytop

The "getdelays" can only display the latency of a single task by
specifying a PID, we introduce the "delaytop" with the following
capabilities:

1. system view: monitors latency metrics (CPU, I/O, memory, IRQ, etc.)
   for all system processes

2. supports field-based sorting (e.g., default sort by CPU latency in
   descending order)

3. dynamic interactive interface: focus on specific processes with
   --pid; limit displayed entries with --processes 20; control monitoring
   duration with --iterations;

Link: https://lkml.kernel.org/r/2025071013514177028RdjISjqeIOnTCRvGAwy@zte.com.cn
Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Signed-off-by: Jiang Kun <jiang.kun2@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Peilin He <he.peilin@zte.com.cn>
Cc: Qiang Tu <tu.qiang35@zte.com.cn>
Cc: wangyong <wang.yong12@zte.com.cn>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Cc: Yunkai Zhang <zhang.yunkai@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/accounting/delay-accounting.rst | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 210c194d4a7b..664950328fb7 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -131,3 +131,50 @@ Get IO accounting for pid 1, it works only with -p::
 	linuxrc: read=65536, write=0, cancelled_write=0
 
 The above command can be used with -v to get more debug information.
+
+After the system starts, use `delaytop` to get the Top-N high-latency tasks.
+this tool supports sorting by CPU latency in descending order by default,
+displays the top 20 high-latency tasks by default, and refreshes the latency
+data every 2 seconds by default.
+
+Get Top-N tasks delay, since system boot::
+
+	bash# ./delaytop
+	Top 20 processes (sorted by CPU delay):
+
+	  PID   TGID  COMMAND            CPU(ms)  IO(ms)        SWAP(ms) RCL(ms) THR(ms)  CMP(ms)  WP(ms)  IRQ(ms)
+	---------------------------------------------------------------------------------------------
+	   32     32  kworker/2:0H-sy   23.65     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	  497    497  kworker/R-scsi_    1.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	  495    495  kworker/R-scsi_    1.13     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	  494    494  scsi_eh_0          1.12     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	  485    485  kworker/R-ata_s    0.90     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	  574    574  kworker/R-kdmfl    0.36     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	   34     34  idle_inject/3      0.33     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	 1123   1123  nde-netfilter      0.28     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	   60     60  ksoftirqd/7        0.25     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	  114    114  kworker/0:2-cgr    0.25     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	  496    496  scsi_eh_1          0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	   51     51  cpuhp/6            0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	 1667   1667  atd                0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	   45     45  cpuhp/5            0.23     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	 1102   1102  nde-backupservi    0.22     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	 1098   1098  systemsettings     0.21     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	 1100   1100  audit-monitor      0.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	   53     53  migration/6        0.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	 1482   1482  sshd               0.19     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	   39     39  cpuhp/4            0.19     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+
+Dynamic interactive interface of delaytop::
+
+	# ./delaytop -p pid
+	Print delayacct stats
+
+	# ./delaytop -P num
+	Display the top N tasks
+
+	# ./delaytop -n num
+	Set delaytop refresh frequency (num times)
+
+	# ./delaytop -d secs
+	Specify refresh interval as secs

From a9ed4422adacce848fd368c3a7076368ead7fc18 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 23 Jun 2025 16:47:25 +0800
Subject: [PATCH 62/80] lib/raid6: update recov_rvv.c zero page usage

Update lib/raid6/recov_rvv.c, for 1857fcc84744 ("lib/raid6: replace custom
zero page with ZERO_PAGE"), per Klara.

Link: https://lkml.kernel.org/r/aFkUnXWtxcgOTVkw@gondor.apana.org.au
Fixes: 1857fcc84744 ("lib/raid6: replace custom zero page with ZERO_PAGE")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Song Liu <song@kernel.org>
Cc: Yu Kuai <yukuai3@huawei.com>
Cc: Klara Modin <klarasmodin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/raid6/recov_rvv.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
index f29303795ccf..5d54c4b437df 100644
--- a/lib/raid6/recov_rvv.c
+++ b/lib/raid6/recov_rvv.c
@@ -165,10 +165,10 @@ static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
 	 * delta p and delta q
 	 */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -203,7 +203,7 @@ static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);

From fefbeed8c6f62dc10f80a6b1787e75de2c64ad0d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 14 Jul 2025 17:20:02 -0700
Subject: [PATCH 63/80] init/Kconfig: restore CONFIG_BROKEN help text

Linus added it in 2003, it later was removed.  Put it back.

Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 init/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index 666783eb50ab..c66a33865f1f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -169,6 +169,10 @@ menu "General setup"
 
 config BROKEN
 	bool
+	help
+	  This option allows you to choose whether you want to try to
+	  compile (and fix) old drivers that haven't been updated to
+	  new infrastructure.
 
 config BROKEN_ON_SMP
 	bool

From 6c6d8f8ba7789c221a2e4c43a0ed982c7a41f428 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Wed, 16 Jul 2025 14:32:45 +0100
Subject: [PATCH 64/80] lib/xxhash: remove unused functions

xxh32_digest() and xxh32_update() were added in 2017 in the original
xxhash commit, but have remained unused.

Remove them.

Link: https://lkml.kernel.org/r/20250716133245.243363-1-linux@treblig.org
Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Gilbert <linux@treblig.org>
Cc: Nick Terrell <terrelln@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xxhash.h |  26 ----------
 lib/xxhash.c           | 107 -----------------------------------------
 2 files changed, 133 deletions(-)

diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h
index df42511438d0..27f57eca8cb1 100644
--- a/include/linux/xxhash.h
+++ b/include/linux/xxhash.h
@@ -177,32 +177,6 @@ struct xxh64_state {
  */
 void xxh32_reset(struct xxh32_state *state, uint32_t seed);
 
-/**
- * xxh32_update() - hash the data given and update the xxh32 state
- *
- * @state:  The xxh32 state to update.
- * @input:  The data to hash.
- * @length: The length of the data to hash.
- *
- * After calling xxh32_reset() call xxh32_update() as many times as necessary.
- *
- * Return:  Zero on success, otherwise an error code.
- */
-int xxh32_update(struct xxh32_state *state, const void *input, size_t length);
-
-/**
- * xxh32_digest() - produce the current xxh32 hash
- *
- * @state: Produce the current xxh32 hash of this state.
- *
- * A hash value can be produced at any time. It is still possible to continue
- * inserting input into the hash state after a call to xxh32_digest(), and
- * generate new hashes later on, by calling xxh32_digest() again.
- *
- * Return: The xxh32 hash stored in the state.
- */
-uint32_t xxh32_digest(const struct xxh32_state *state);
-
 /**
  * xxh64_reset() - reset the xxh64 state to start a new hashing operation
  *
diff --git a/lib/xxhash.c b/lib/xxhash.c
index b5bd567aa6b3..cf629766f376 100644
--- a/lib/xxhash.c
+++ b/lib/xxhash.c
@@ -267,113 +267,6 @@ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed)
 }
 EXPORT_SYMBOL(xxh64_reset);
 
-int xxh32_update(struct xxh32_state *state, const void *input, const size_t len)
-{
-	const uint8_t *p = (const uint8_t *)input;
-	const uint8_t *const b_end = p + len;
-
-	if (input == NULL)
-		return -EINVAL;
-
-	state->total_len_32 += (uint32_t)len;
-	state->large_len |= (len >= 16) | (state->total_len_32 >= 16);
-
-	if (state->memsize + len < 16) { /* fill in tmp buffer */
-		memcpy((uint8_t *)(state->mem32) + state->memsize, input, len);
-		state->memsize += (uint32_t)len;
-		return 0;
-	}
-
-	if (state->memsize) { /* some data left from previous update */
-		const uint32_t *p32 = state->mem32;
-
-		memcpy((uint8_t *)(state->mem32) + state->memsize, input,
-			16 - state->memsize);
-
-		state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32));
-		p32++;
-		state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32));
-		p32++;
-		state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32));
-		p32++;
-		state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32));
-		p32++;
-
-		p += 16-state->memsize;
-		state->memsize = 0;
-	}
-
-	if (p <= b_end - 16) {
-		const uint8_t *const limit = b_end - 16;
-		uint32_t v1 = state->v1;
-		uint32_t v2 = state->v2;
-		uint32_t v3 = state->v3;
-		uint32_t v4 = state->v4;
-
-		do {
-			v1 = xxh32_round(v1, get_unaligned_le32(p));
-			p += 4;
-			v2 = xxh32_round(v2, get_unaligned_le32(p));
-			p += 4;
-			v3 = xxh32_round(v3, get_unaligned_le32(p));
-			p += 4;
-			v4 = xxh32_round(v4, get_unaligned_le32(p));
-			p += 4;
-		} while (p <= limit);
-
-		state->v1 = v1;
-		state->v2 = v2;
-		state->v3 = v3;
-		state->v4 = v4;
-	}
-
-	if (p < b_end) {
-		memcpy(state->mem32, p, (size_t)(b_end-p));
-		state->memsize = (uint32_t)(b_end-p);
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(xxh32_update);
-
-uint32_t xxh32_digest(const struct xxh32_state *state)
-{
-	const uint8_t *p = (const uint8_t *)state->mem32;
-	const uint8_t *const b_end = (const uint8_t *)(state->mem32) +
-		state->memsize;
-	uint32_t h32;
-
-	if (state->large_len) {
-		h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) +
-			xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18);
-	} else {
-		h32 = state->v3 /* == seed */ + PRIME32_5;
-	}
-
-	h32 += state->total_len_32;
-
-	while (p + 4 <= b_end) {
-		h32 += get_unaligned_le32(p) * PRIME32_3;
-		h32 = xxh_rotl32(h32, 17) * PRIME32_4;
-		p += 4;
-	}
-
-	while (p < b_end) {
-		h32 += (*p) * PRIME32_5;
-		h32 = xxh_rotl32(h32, 11) * PRIME32_1;
-		p++;
-	}
-
-	h32 ^= h32 >> 15;
-	h32 *= PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= PRIME32_3;
-	h32 ^= h32 >> 16;
-
-	return h32;
-}
-EXPORT_SYMBOL(xxh32_digest);
-
 int xxh64_update(struct xxh64_state *state, const void *input, const size_t len)
 {
 	const uint8_t *p = (const uint8_t *)input;

From ed4f142f72a9191b8236778093074c277435bf8a Mon Sep 17 00:00:00 2001
From: Matt Fleming <mfleming@cloudflare.com>
Date: Fri, 18 Jul 2025 16:39:28 +0100
Subject: [PATCH 65/80] stackdepot: make max number of pools boot-time
 configurable

We're hitting the WARN in depot_init_pool() about reaching the stack depot
limit because we have long stacks that don't dedup very well.

Introduce a new start-up parameter to allow users to set the number of
maximum stack depot pools.

Link: https://lkml.kernel.org/r/20250718153928.94229-1-matt@readmodwrite.com
Signed-off-by: Matt Fleming <mfleming@cloudflare.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../admin-guide/kernel-parameters.txt         |  5 ++
 lib/stackdepot.c                              | 67 ++++++++++++++++---
 2 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3d1e55ed4382..1673e803c47f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7029,6 +7029,11 @@
 			consumed by the stack hash table. By default this is set
 			to false.
 
+	stack_depot_max_pools= [KNL,EARLY]
+			Specify the maximum number of pools to use for storing
+			stack traces. Pools are allocated on-demand up to this
+			limit. Default value is 8191 pools.
+
 	stacktrace	[FTRACE]
 			Enabled the stack tracer on boot up.
 
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 73d7b50924ef..de0b0025af2b 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -36,11 +36,11 @@
 #include <linux/memblock.h>
 #include <linux/kasan-enabled.h>
 
-#define DEPOT_POOLS_CAP 8192
-/* The pool_index is offset by 1 so the first record does not have a 0 handle. */
-#define DEPOT_MAX_POOLS \
-	(((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \
-	 (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP)
+/*
+ * The pool_index is offset by 1 so the first record does not have a 0 handle.
+ */
+static unsigned int stack_max_pools __read_mostly =
+	MIN((1LL << DEPOT_POOL_INDEX_BITS) - 1, 8192);
 
 static bool stack_depot_disabled;
 static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
@@ -62,7 +62,7 @@ static unsigned int stack_bucket_number_order;
 static unsigned int stack_hash_mask;
 
 /* Array of memory regions that store stack records. */
-static void *stack_pools[DEPOT_MAX_POOLS];
+static void **stack_pools;
 /* Newly allocated pool that is not yet added to stack_pools. */
 static void *new_pool;
 /* Number of pools in stack_pools. */
@@ -101,6 +101,34 @@ static int __init disable_stack_depot(char *str)
 }
 early_param("stack_depot_disable", disable_stack_depot);
 
+static int __init parse_max_pools(char *str)
+{
+	const long long limit = (1LL << (DEPOT_POOL_INDEX_BITS)) - 1;
+	unsigned int max_pools;
+	int rv;
+
+	rv = kstrtouint(str, 0, &max_pools);
+	if (rv)
+		return rv;
+
+	if (max_pools < 1024) {
+		pr_err("stack_depot_max_pools below 1024, using default of %u\n",
+		       stack_max_pools);
+		goto out;
+	}
+
+	if (max_pools > limit) {
+		pr_err("stack_depot_max_pools exceeds %lld, using default of %u\n",
+		       limit, stack_max_pools);
+		goto out;
+	}
+
+	stack_max_pools = max_pools;
+out:
+	return 0;
+}
+early_param("stack_depot_max_pools", parse_max_pools);
+
 void __init stack_depot_request_early_init(void)
 {
 	/* Too late to request early init now. */
@@ -182,6 +210,17 @@ int __init stack_depot_early_init(void)
 	}
 	init_stack_table(entries);
 
+	pr_info("allocating space for %u stack pools via memblock\n",
+		stack_max_pools);
+	stack_pools =
+		memblock_alloc(stack_max_pools * sizeof(void *), PAGE_SIZE);
+	if (!stack_pools) {
+		pr_err("stack pools allocation failed, disabling\n");
+		memblock_free(stack_table, entries * sizeof(struct list_head));
+		stack_depot_disabled = true;
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
@@ -231,6 +270,16 @@ int stack_depot_init(void)
 	stack_hash_mask = entries - 1;
 	init_stack_table(entries);
 
+	pr_info("allocating space for %u stack pools via kvcalloc\n",
+		stack_max_pools);
+	stack_pools = kvcalloc(stack_max_pools, sizeof(void *), GFP_KERNEL);
+	if (!stack_pools) {
+		pr_err("stack pools allocation failed, disabling\n");
+		kvfree(stack_table);
+		stack_depot_disabled = true;
+		ret = -ENOMEM;
+	}
+
 out_unlock:
 	mutex_unlock(&stack_depot_init_mutex);
 
@@ -245,9 +294,9 @@ static bool depot_init_pool(void **prealloc)
 {
 	lockdep_assert_held(&pool_lock);
 
-	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
+	if (unlikely(pools_num >= stack_max_pools)) {
 		/* Bail out if we reached the pool limit. */
-		WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
+		WARN_ON_ONCE(pools_num > stack_max_pools); /* should never happen */
 		WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
 		WARN_ONCE(1, "Stack depot reached limit capacity");
 		return false;
@@ -273,7 +322,7 @@ static bool depot_init_pool(void **prealloc)
 	 * NULL; do not reset to NULL if we have reached the maximum number of
 	 * pools.
 	 */
-	if (pools_num < DEPOT_MAX_POOLS)
+	if (pools_num < stack_max_pools)
 		WRITE_ONCE(new_pool, NULL);
 	else
 		WRITE_ONCE(new_pool, STACK_DEPOT_POISON);

From 07d24902977e4704fab8472981e73a0ad6dfa1fd Mon Sep 17 00:00:00 2001
From: Alexander Graf <graf@amazon.com>
Date: Tue, 10 Jun 2025 08:53:27 +0000
Subject: [PATCH 66/80] kexec: enable CMA based contiguous allocation

When booting a new kernel with kexec_file, the kernel picks a target
location that the kernel should live at, then allocates random pages,
checks whether any of those patches magically happens to coincide with a
target address range and if so, uses them for that range.

For every page allocated this way, it then creates a page list that the
relocation code - code that executes while all CPUs are off and we are
just about to jump into the new kernel - copies to their final memory
location.  We can not put them there before, because chances are pretty
good that at least some page in the target range is already in use by the
currently running Linux environment.  Copying is happening from a single
CPU at RAM rate, which takes around 4-50 ms per 100 MiB.

All of this is inefficient and error prone.

To successfully kexec, we need to quiesce all devices of the outgoing
kernel so they don't scribble over the new kernel's memory.  We have seen
cases where that does not happen properly (*cough* GIC *cough*) and hence
the new kernel was corrupted.  This started a month long journey to root
cause failing kexecs to eventually see memory corruption, because the new
kernel was corrupted severely enough that it could not emit output to tell
us about the fact that it was corrupted.  By allocating memory for the
next kernel from a memory range that is guaranteed scribbling free, we can
boot the next kernel up to a point where it is at least able to detect
corruption and maybe even stop it before it becomes severe.  This
increases the chance for successful kexecs.

Since kexec got introduced, Linux has gained the CMA framework which can
perform physically contiguous memory mappings, while keeping that memory
available for movable memory when it is not needed for contiguous
allocations.  The default CMA allocator is for DMA allocations.

This patch adds logic to the kexec file loader to attempt to place the
target payload at a location allocated from CMA.  If successful, it uses
that memory range directly instead of creating copy instructions during
the hot phase.  To ensure that there is a safety net in case anything goes
wrong with the CMA allocation, it also adds a flag for user space to force
disable CMA allocations.

Using CMA allocations has two advantages:

  1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the
     hot phase.
  2) More robust. Even if by accident some page is still in use for DMA,
     the new kernel image will be safe from that access because it resides
     in a memory region that is considered allocated in the old kernel and
     has a chance to reinitialize that component.

Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com
Signed-off-by: Alexander Graf <graf@amazon.com>
Acked-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Zhongkun He <hezhongkun.hzk@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/kernel/kexec_elf.c |   1 +
 include/linux/kexec.h         |  10 ++++
 include/uapi/linux/kexec.h    |   1 +
 kernel/kexec.c                |   2 +-
 kernel/kexec_core.c           | 100 +++++++++++++++++++++++++++++++---
 kernel/kexec_file.c           |  51 ++++++++++++++++-
 kernel/kexec_internal.h       |   2 +-
 7 files changed, 156 insertions(+), 11 deletions(-)

diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c
index f4755d49b89e..56444c7bd34e 100644
--- a/arch/riscv/kernel/kexec_elf.c
+++ b/arch/riscv/kernel/kexec_elf.c
@@ -95,6 +95,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len,
 	kbuf.buf_align = PMD_SIZE;
 	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 	kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE);
+	kbuf.cma = NULL;
 	kbuf.top_down = false;
 	ret = arch_kexec_locate_mem_hole(&kbuf);
 	if (!ret) {
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 03f85ad03025..1b10a5d84b68 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -79,6 +79,12 @@ extern note_buf_t __percpu *crash_notes;
 
 typedef unsigned long kimage_entry_t;
 
+/*
+ * This is a copy of the UAPI struct kexec_segment and must be identical
+ * to it because it gets copied straight from user space into kernel
+ * memory. Do not modify this structure unless you change the way segments
+ * get ingested from user space.
+ */
 struct kexec_segment {
 	/*
 	 * This pointer can point to user memory if kexec_load() system
@@ -172,6 +178,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image);
  * @buf_align:	Minimum alignment needed.
  * @buf_min:	The buffer can't be placed below this address.
  * @buf_max:	The buffer can't be placed above this address.
+ * @cma:	CMA page if the buffer is backed by CMA.
  * @top_down:	Allocate from top of memory.
  * @random:	Place the buffer at a random position.
  */
@@ -184,6 +191,7 @@ struct kexec_buf {
 	unsigned long buf_align;
 	unsigned long buf_min;
 	unsigned long buf_max;
+	struct page *cma;
 	bool top_down;
 #ifdef CONFIG_CRASH_DUMP
 	bool random;
@@ -340,6 +348,7 @@ struct kimage {
 
 	unsigned long nr_segments;
 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+	struct page *segment_cma[KEXEC_SEGMENT_MAX];
 
 	struct list_head control_pages;
 	struct list_head dest_pages;
@@ -361,6 +370,7 @@ struct kimage {
 	 */
 	unsigned int hotplug_support:1;
 #endif
+	unsigned int no_cma:1;
 
 #ifdef ARCH_HAS_KIMAGE_ARCH
 	struct kimage_arch arch;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 5ae1741ea8ea..8958ebfcff94 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -27,6 +27,7 @@
 #define KEXEC_FILE_ON_CRASH	0x00000002
 #define KEXEC_FILE_NO_INITRAMFS	0x00000004
 #define KEXEC_FILE_DEBUG	0x00000008
+#define KEXEC_FILE_NO_CMA	0x00000010
 
 /* These values match the ELF architecture values.
  * Unless there is a good reason that should continue to be the case.
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a6b3f96bb50c..28008e3d462e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 		goto out;
 
 	for (i = 0; i < nr_segments; i++) {
-		ret = kimage_load_segment(image, &image->segment[i]);
+		ret = kimage_load_segment(image, i);
 		if (ret)
 			goto out;
 	}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 3a9a9f240dbc..e390c0df6d55 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -40,6 +40,7 @@
 #include <linux/hugetlb.h>
 #include <linux/objtool.h>
 #include <linux/kmsg_dump.h>
+#include <linux/dma-map-ops.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry)
 	kimage_free_pages(page);
 }
 
+static void kimage_free_cma(struct kimage *image)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		struct page *cma = image->segment_cma[i];
+		u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT;
+
+		if (!cma)
+			continue;
+
+		arch_kexec_pre_free_pages(page_address(cma), nr_pages);
+		dma_release_from_contiguous(NULL, cma, nr_pages);
+		image->segment_cma[i] = NULL;
+	}
+
+}
+
 void kimage_free(struct kimage *image)
 {
 	kimage_entry_t *ptr, entry;
@@ -591,6 +610,9 @@ void kimage_free(struct kimage *image)
 	/* Free the kexec control pages... */
 	kimage_free_page_list(&image->control_pages);
 
+	/* Free CMA allocations */
+	kimage_free_cma(image);
+
 	/*
 	 * Free up any temporary buffers allocated. This might hit if
 	 * error occurred much later after buffer allocation.
@@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image,
 	return page;
 }
 
-static int kimage_load_normal_segment(struct kimage *image,
-					 struct kexec_segment *segment)
+static int kimage_load_cma_segment(struct kimage *image, int idx)
 {
+	struct kexec_segment *segment = &image->segment[idx];
+	struct page *cma = image->segment_cma[idx];
+	char *ptr = page_address(cma);
+	unsigned long maddr;
+	size_t ubytes, mbytes;
+	int result = 0;
+	unsigned char __user *buf = NULL;
+	unsigned char *kbuf = NULL;
+
+	if (image->file_mode)
+		kbuf = segment->kbuf;
+	else
+		buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+
+	/* Then copy from source buffer to the CMA one */
+	while (mbytes) {
+		size_t uchunk, mchunk;
+
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = min_t(size_t, mbytes,
+				PAGE_SIZE - (maddr & ~PAGE_MASK));
+		uchunk = min(ubytes, mchunk);
+
+		if (uchunk) {
+			/* For file based kexec, source pages are in kernel memory */
+			if (image->file_mode)
+				memcpy(ptr, kbuf, uchunk);
+			else
+				result = copy_from_user(ptr, buf, uchunk);
+			ubytes -= uchunk;
+			if (image->file_mode)
+				kbuf += uchunk;
+			else
+				buf += uchunk;
+		}
+
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+
+		ptr    += mchunk;
+		maddr  += mchunk;
+		mbytes -= mchunk;
+
+		cond_resched();
+	}
+
+	/* Clear any remainder */
+	memset(ptr, 0, mbytes);
+
+out:
+	return result;
+}
+
+static int kimage_load_normal_segment(struct kimage *image, int idx)
+{
+	struct kexec_segment *segment = &image->segment[idx];
 	unsigned long maddr;
 	size_t ubytes, mbytes;
 	int result;
@@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image,
 	mbytes = segment->memsz;
 	maddr = segment->mem;
 
+	if (image->segment_cma[idx])
+		return kimage_load_cma_segment(image, idx);
+
 	result = kimage_set_destination(image, maddr);
 	if (result < 0)
 		goto out;
@@ -787,13 +872,13 @@ out:
 }
 
 #ifdef CONFIG_CRASH_DUMP
-static int kimage_load_crash_segment(struct kimage *image,
-					struct kexec_segment *segment)
+static int kimage_load_crash_segment(struct kimage *image, int idx)
 {
 	/* For crash dumps kernels we simply copy the data from
 	 * user space to it's destination.
 	 * We do things a page at a time for the sake of kmap.
 	 */
+	struct kexec_segment *segment = &image->segment[idx];
 	unsigned long maddr;
 	size_t ubytes, mbytes;
 	int result;
@@ -858,18 +943,17 @@ out:
 }
 #endif
 
-int kimage_load_segment(struct kimage *image,
-				struct kexec_segment *segment)
+int kimage_load_segment(struct kimage *image, int idx)
 {
 	int result = -ENOMEM;
 
 	switch (image->type) {
 	case KEXEC_TYPE_DEFAULT:
-		result = kimage_load_normal_segment(image, segment);
+		result = kimage_load_normal_segment(image, idx);
 		break;
 #ifdef CONFIG_CRASH_DUMP
 	case KEXEC_TYPE_CRASH:
-		result = kimage_load_crash_segment(image, segment);
+		result = kimage_load_crash_segment(image, idx);
 		break;
 #endif
 	}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 69fe76fd9233..41271eee0f99 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,6 +26,7 @@
 #include <linux/kernel_read_file.h>
 #include <linux/syscalls.h>
 #include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
 #include "kexec_internal.h"
 
 #ifdef CONFIG_KEXEC_SIG
@@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 		ret = 0;
 	}
 
+	image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+
 	if (cmdline_len) {
 		image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
 		if (IS_ERR(image->cmdline_buf)) {
@@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 			      i, ksegment->buf, ksegment->bufsz, ksegment->mem,
 			      ksegment->memsz);
 
-		ret = kimage_load_segment(image, &image->segment[i]);
+		ret = kimage_load_segment(image, i);
 		if (ret)
 			goto out;
 	}
@@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
 		return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
 }
 
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+	size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;
+	unsigned long mem;
+	struct page *p;
+
+	/* User space disabled CMA allocations, bail out. */
+	if (kbuf->image->no_cma)
+		return -EPERM;
+
+	/* Skip CMA logic for crash kernel */
+	if (kbuf->image->type == KEXEC_TYPE_CRASH)
+		return -EPERM;
+
+	p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true);
+	if (!p)
+		return -ENOMEM;
+
+	pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p));
+
+	mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+	if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+		/* Our region is already in use by a statically defined one. Bail out. */
+		pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+		dma_release_from_contiguous(NULL, p, nr_pages);
+		return -EBUSY;
+	}
+
+	kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+	kbuf->cma = p;
+
+	arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0);
+
+	return 0;
+}
+
 /**
  * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
  * @kbuf:	Parameters for the memory search.
@@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
 	if (ret <= 0)
 		return ret;
 
+	/*
+	 * Try to find a free physically contiguous block of memory first. With that, we
+	 * can avoid any copying at kexec time.
+	 */
+	if (!kexec_alloc_contig(kbuf))
+		return 0;
+
 	if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
 		ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
 	else
@@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	/* Ensure minimum alignment needed for segments. */
 	kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
 	kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
+	kbuf->cma = NULL;
 
 	/* Walk the RAM ranges and allocate a suitable range for the buffer */
 	ret = arch_kexec_locate_mem_hole(kbuf);
@@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	ksegment->bufsz = kbuf->bufsz;
 	ksegment->mem = kbuf->mem;
 	ksegment->memsz = kbuf->memsz;
+	kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma;
 	kbuf->image->nr_segments++;
 	return 0;
 }
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 30a733a55a67..228bb88c018b 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void);
 int sanity_check_segment_list(struct kimage *image);
 void kimage_free_page_list(struct list_head *list);
 void kimage_free(struct kimage *image);
-int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+int kimage_load_segment(struct kimage *image, int idx);
 void kimage_terminate(struct kimage *image);
 int kimage_is_destination_range(struct kimage *image,
 				unsigned long start, unsigned long end);

From f8cd9193b62e92ad25def5370ca8ea2bc7585381 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 21 Jul 2025 19:45:57 +0200
Subject: [PATCH 67/80] ucount: fix atomic_long_inc_below() argument type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The type of u argument of atomic_long_inc_below() should be long to avoid
unwanted truncation to int.

The patch fixes the wrong argument type of an internal function to
prevent unwanted argument truncation.  It fixes an internal locking
primitive; it should not have any direct effect on userspace.

Mark said

: AFAICT there's no problem in practice because atomic_long_inc_below()
: is only used by inc_ucount(), and it looks like the value is
: constrained between 0 and INT_MAX.
:
: In inc_ucount() the limit value is taken from
: user_namespace::ucount_max[], and AFAICT that's only written by
: sysctls, to the table setup by setup_userns_sysctls(), where
: UCOUNT_ENTRY() limits the value between 0 and INT_MAX.
:
: This is certainly a cleanup, but there might be no functional issue in
: practice as above.

Link: https://lkml.kernel.org/r/20250721174610.28361-1-ubizjak@gmail.com
Fixes: f9c82a4ea89c ("Increase size of ucounts to atomic_long_t")
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Alexey Gladkov <legion@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: MengEn Sun <mengensun@tencent.com>
Cc: "Thomas Weißschuh" <linux@weissschuh.net>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/ucount.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8686e329b8f2..f629db485a07 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -199,7 +199,7 @@ void put_ucounts(struct ucounts *ucounts)
 	}
 }
 
-static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
 {
 	long c, old;
 	c = atomic_long_read(v);

From 58b4fba81a2e400a47ddbe7c1dc0a2bc038313b7 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 21 Jul 2025 19:45:58 +0200
Subject: [PATCH 68/80] ucount: use atomic_long_try_cmpxchg() in
 atomic_long_inc_below()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use atomic_long_try_cmpxchg() instead of
atomic_long_cmpxchg (*ptr, old, new) == old in atomic_long_inc_below().
x86 CMPXCHG instruction returns success in ZF flag, so this change saves
a compare after cmpxchg (and related move instruction in front of cmpxchg).

Also, atomic_long_try_cmpxchg implicitly assigns old *ptr value to "old"
when cmpxchg fails, enabling further code simplifications.

No functional change intended.

Link: https://lkml.kernel.org/r/20250721174610.28361-2-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Reviewed-by: Alexey Gladkov <legion@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Alexey Gladkov <legion@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: MengEn Sun <mengensun@tencent.com>
Cc: "Thomas Weißschuh" <linux@weissschuh.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/ucount.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/ucount.c b/kernel/ucount.c
index f629db485a07..586af49fc03e 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -201,16 +201,14 @@ void put_ucounts(struct ucounts *ucounts)
 
 static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
 {
-	long c, old;
-	c = atomic_long_read(v);
-	for (;;) {
+	long c = atomic_long_read(v);
+
+	do {
 		if (unlikely(c >= u))
 			return false;
-		old = atomic_long_cmpxchg(v, c, c+1);
-		if (likely(old == c))
-			return true;
-		c = old;
-	}
+	} while (!atomic_long_try_cmpxchg(v, &c, c+1));
+
+	return true;
 }
 
 struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,

From 1f03d55e5ef0b041bd66fbf7803952c901a93fcb Mon Sep 17 00:00:00 2001
From: Wang Yaxin <wang.yaxin@zte.com.cn>
Date: Mon, 21 Jul 2025 09:40:49 +0800
Subject: [PATCH 69/80] MAINTAINERS: add maintainers for delaytop

The delaytop tool supports showing system delays and task-level delays,
effectively identifying the top-n tasks with high latency in the system,
which is highly beneficial for improving system performance.  Wang Yaxin
and her colleague Fan Yu focus on locating system delay issues.  To
promote the thriving development of delaytop, we hope to serve as
maintainers to continuously improve it, aiming to provide a more effective
solution for system latency issues in the future.

Link: https://lkml.kernel.org/r/20250721094049958ImB8XG_imntcPqpQn1KfG@zte.com.cn
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Reviewed-by: Yang Yang <yang.yang29@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 947ec6bf5b95..87897a285bc1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19462,6 +19462,16 @@ S:	Maintained
 F:	include/linux/delayacct.h
 F:	kernel/delayacct.c
 
+TASK DELAY MONITORING TOOLS
+M:	Andrew Morton <akpm@linux-foundation.org>
+M:	Wang Yaxin <wang.yaxin@zte.com.cn>
+M:	Fan Yu <fan.yu9@zte.com.cn>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	Documentation/accounting/delay-accounting.rst
+F:	tools/accounting/delaytop.c
+F:	tools/accounting/getdelays.c
+
 PERFORMANCE EVENTS SUBSYSTEM
 M:	Peter Zijlstra <peterz@infradead.org>
 M:	Ingo Molnar <mingo@redhat.com>

From a30469cac8ce6555284948dab30066ce1ea43548 Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Tue, 22 Jul 2025 15:34:24 +0800
Subject: [PATCH 70/80] KVM: x86: fix typo "notifer"

Patch series "treewide: Fix typo "notifer"", v3.

There are some spelling mistakes of 'notifer' in comments which
should be 'notifier'.

Fix them and add it to scripts/spelling.txt.


This patch (of 8):

There are some spelling mistakes of 'notifer' which should be 'notifier'.

Link: https://lkml.kernel.org/r/576F0D85F6853074+20250722072734.19367-1-wangyuli@uniontech.com
Link: https://lkml.kernel.org/r/7F05778C3A1A9F8B+20250722073431.21983-1-wangyuli@uniontech.com
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kvm/i8254.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 739aa6c0d0c3..9ff55112900a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -641,7 +641,7 @@ static void kvm_pit_reset(struct kvm_pit *pit)
 	kvm_pit_reset_reinject(pit);
 }
 
-static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
+static void pit_mask_notifier(struct kvm_irq_mask_notifier *kimn, bool mask)
 {
 	struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
 
@@ -694,7 +694,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
 	pit_state->irq_ack_notifier.gsi = 0;
 	pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
-	pit->mask_notifier.func = pit_mask_notifer;
+	pit->mask_notifier.func = pit_mask_notifier;
 
 	kvm_pit_reset(pit);
 

From fbedfb051a4c74854c23f9c898fc6b29fab7be60 Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Tue, 22 Jul 2025 15:34:25 +0800
Subject: [PATCH 71/80] cxl: mce: fix typo "notifer"

According to the context, "mce_notifer" should be "mce_notifier".

Link: https://lkml.kernel.org/r/E1EB1BA9FDF07D53+20250722073431.21983-2-wangyuli@uniontech.com
Fixes: 516e5bd0b6bf ("cxl: Add mce notifier to emit aliased address for extended linear cache")
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/cxl/core/mce.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h
index ace73424eeb6..ca272e8db6c7 100644
--- a/drivers/cxl/core/mce.h
+++ b/drivers/cxl/core/mce.h
@@ -7,7 +7,7 @@
 
 #ifdef CONFIG_CXL_MCE
 int devm_cxl_register_mce_notifier(struct device *dev,
-				   struct notifier_block *mce_notifer);
+				   struct notifier_block *mce_notifier);
 #else
 static inline int
 devm_cxl_register_mce_notifier(struct device *dev,

From 26197b0fd220ceb2b26f2ea2948c00fdd9855fae Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Tue, 22 Jul 2025 15:34:26 +0800
Subject: [PATCH 72/80] drm/xe: fix typo "notifer"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a spelling mistake of 'notifer' in the comment which
should be 'notifier'.

Link: https://lkml.kernel.org/r/94190C5F54A19F3E+20250722073431.21983-3-wangyuli@uniontech.com
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/gpu/drm/xe/xe_vm_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 1979e9bdbdf3..0ca27579fd1f 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -259,7 +259,7 @@ struct xe_vm {
 		 * up for revalidation. Protected from access with the
 		 * @invalidated_lock. Removing items from the list
 		 * additionally requires @lock in write mode, and adding
-		 * items to the list requires either the @userptr.notifer_lock in
+		 * items to the list requires either the @userptr.notifier_lock in
 		 * write mode, OR @lock in write mode.
 		 */
 		struct list_head invalidated;

From 545040384e78d6eaabb20e1f4baa85ace864dcfc Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Tue, 22 Jul 2025 15:34:27 +0800
Subject: [PATCH 73/80] net: mvneta: fix typo "notifer"

There is a spelling mistake of 'notifer' in the comment which
should be 'notifier'.

Link: https://lkml.kernel.org/r/0CB4300CB6F49007+20250722073431.21983-4-wangyuli@uniontech.com
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/net/ethernet/marvell/mvneta.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 147571fdada3..ee4696600146 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4610,7 +4610,7 @@ static int mvneta_stop(struct net_device *dev)
 		/* Inform that we are stopping so we don't want to setup the
 		 * driver for new CPUs in the notifiers. The code of the
 		 * notifier for CPU online is protected by the same spinlock,
-		 * so when we get the lock, the notifer work is done.
+		 * so when we get the lock, the notifier work is done.
 		 */
 		spin_lock(&pp->lock);
 		pp->is_stopped = true;

From 004f42dd90b7ef542a51983bdaa5b2ef621ed41d Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Tue, 22 Jul 2025 15:34:30 +0800
Subject: [PATCH 74/80] xen/xenbus: fix typo "notifer"

There is a spelling mistake of 'notifer' in the comment which
should be 'notifier'.

Link: https://lkml.kernel.org/r/C6633C66376C709A+20250722073431.21983-7-wangyuli@uniontech.com
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/xen/xenbus.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 3f90bdd387b6..00b84f2e402b 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -180,7 +180,7 @@ int xenbus_printf(struct xenbus_transaction t,
  * sprintf-style type string, and pointer. Returns 0 or errno.*/
 int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
 
-/* notifer routines for when the xenstore comes up */
+/* notifier routines for when the xenstore comes up */
 extern int xenstored_ready;
 int register_xenstore_notifier(struct notifier_block *nb);
 void unregister_xenstore_notifier(struct notifier_block *nb);

From 53f433891e698e76aaf01b84b30a17a79a53535c Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Tue, 22 Jul 2025 15:34:31 +0800
Subject: [PATCH 75/80] scripts/spelling.txt: add notifer||notifier to
 spelling.txt

This typo was not listed in scripts/spelling.txt, thus it was more
difficult to detect. Add it for convenience.

Link: https://lkml.kernel.org/r/02153C05ED7B49B7+20250722073431.21983-8-wangyuli@uniontech.com
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 scripts/spelling.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index ac94fa1c2415..1e89b92c2f9a 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -1099,6 +1099,7 @@ notication||notification
 notications||notifications
 notifcations||notifications
 notifed||notified
+notifer||notifier
 notity||notify
 notfify||notify
 nubmer||number

From fb0e9db99eefc17cb8693ce93afe5c5dbc5148a5 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 24 Jul 2025 16:42:10 +0900
Subject: [PATCH 76/80] fat: fix too many log in fat_chain_add()

This log was excessive for a serial console.  So use the ratelimited
version instead.

Link: https://lkml.kernel.org/r/87qzy611d9.fsf@mail.parknet.co.jp
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Reported-by: syzbot+fa7ef54f66c189c04b73@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=fa7ef54f66c189c04b73
Cc: Namjae Jeon <linkinjeon@kernel.org>
Cc: Sungjong Seo <sj1557.seo@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fat/misc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index c7a2d27120ba..950da09f0961 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -158,9 +158,9 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 			mark_inode_dirty(inode);
 	}
 	if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
-		fat_fs_error(sb, "clusters badly computed (%d != %llu)",
-			     new_fclus,
-			     (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
+		fat_fs_error_ratelimit(
+			sb, "clusters badly computed (%d != %llu)", new_fclus,
+			(llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
 		fat_cache_inval_inode(inode);
 	}
 	inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9);

From 8c54f7e3e0eab0174683a562051417317c4ea297 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Thu, 24 Jul 2025 12:17:15 +0100
Subject: [PATCH 77/80] samples: Kconfig: fix spelling mistake "instancess" ->
 "instances"

There is a spelling mistake in the SAMPLE_TRACE_ARRAY config. Fix it.

Link: https://lkml.kernel.org/r/20250724111715.141826-1-colin.i.king@gmail.com
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 samples/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/Kconfig b/samples/Kconfig
index a8880c62d4c8..6e072a5f1ed8 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -54,7 +54,7 @@ config SAMPLE_FTRACE_OPS
 	  measures the time taken to invoke one function a number of times.
 
 config SAMPLE_TRACE_ARRAY
-        tristate "Build sample module for kernel access to Ftrace instancess"
+        tristate "Build sample module for kernel access to Ftrace instances"
 	depends on EVENT_TRACING && m
 	help
 	 This builds a module that demonstrates the use of various APIs to

From d92dccd05a20b7a9c2836d4e46e22128f5b73367 Mon Sep 17 00:00:00 2001
From: "fan.yu9@zte.com.cn" <fan.yu9@zte.com.cn>
Date: Mon, 28 Jul 2025 16:28:34 +0800
Subject: [PATCH 78/80] delaytop: enhance error logging and add PSI feature
 description

This patch improves error diagnostics and documentation for delaytop:

1) Enhanced error logging:
   - Added explicit error messages in critical failure paths
   - Implemented BOOL_FPRINT macro for robust output handling

2) PSI feature documentation:
   - Updated header comment to reflect PSI monitoring capability
   - Improved output formatting for PSI information

System Pressure Information: (avg10/avg60/avg300/total)
CPU some:       0.0%/   0.0%/   0.0%/     345(ms)
CPU full:       0.0%/   0.0%/   0.0%/       0(ms)
Memory full:    0.0%/   0.0%/   0.0%/       0(ms)
Memory some:    0.0%/   0.0%/   0.0%/       0(ms)
IO full:        0.0%/   0.0%/   0.0%/      65(ms)
IO some:        0.0%/   0.0%/   0.0%/      79(ms)
IRQ full:       0.0%/   0.0%/   0.0%/       0(ms)

Link: https://lkml.kernel.org/r/202507281628341752gMXCMN7S-Vz_LHYHum9r@zte.com.cn
Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Acked-by: Yang Yang <yang.yang29@zte.com.cn>
Cc: Fan Yu <fan.yu9@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/accounting/delay-accounting.rst |  61 ++++---
 tools/accounting/delaytop.c                   | 162 ++++++++++++------
 2 files changed, 143 insertions(+), 80 deletions(-)

diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 664950328fb7..8ccc5af5ea1e 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -132,38 +132,47 @@ Get IO accounting for pid 1, it works only with -p::
 
 The above command can be used with -v to get more debug information.
 
-After the system starts, use `delaytop` to get the Top-N high-latency tasks.
-this tool supports sorting by CPU latency in descending order by default,
+After the system starts, use `delaytop` to get the system-wide delay information,
+which includes system-wide PSI information and Top-N high-latency tasks.
+
+`delaytop` supports sorting by CPU latency in descending order by default,
 displays the top 20 high-latency tasks by default, and refreshes the latency
 data every 2 seconds by default.
 
-Get Top-N tasks delay, since system boot::
+Get PSI information and Top-N tasks delay, since system boot::
 
 	bash# ./delaytop
+	System Pressure Information: (avg10/avg60/avg300/total)
+	CPU some:       0.0%/   0.0%/   0.0%/     345(ms)
+	CPU full:       0.0%/   0.0%/   0.0%/       0(ms)
+	Memory full:    0.0%/   0.0%/   0.0%/       0(ms)
+	Memory some:    0.0%/   0.0%/   0.0%/       0(ms)
+	IO full:        0.0%/   0.0%/   0.0%/      65(ms)
+	IO some:        0.0%/   0.0%/   0.0%/      79(ms)
+	IRQ full:       0.0%/   0.0%/   0.0%/       0(ms)
 	Top 20 processes (sorted by CPU delay):
-
-	  PID   TGID  COMMAND            CPU(ms)  IO(ms)        SWAP(ms) RCL(ms) THR(ms)  CMP(ms)  WP(ms)  IRQ(ms)
-	---------------------------------------------------------------------------------------------
-	   32     32  kworker/2:0H-sy   23.65     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	  497    497  kworker/R-scsi_    1.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	  495    495  kworker/R-scsi_    1.13     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	  494    494  scsi_eh_0          1.12     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	  485    485  kworker/R-ata_s    0.90     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	  574    574  kworker/R-kdmfl    0.36     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	   34     34  idle_inject/3      0.33     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	 1123   1123  nde-netfilter      0.28     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	   60     60  ksoftirqd/7        0.25     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	  114    114  kworker/0:2-cgr    0.25     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	  496    496  scsi_eh_1          0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	   51     51  cpuhp/6            0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	 1667   1667  atd                0.24     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	   45     45  cpuhp/5            0.23     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	 1102   1102  nde-backupservi    0.22     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	 1098   1098  systemsettings     0.21     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	 1100   1100  audit-monitor      0.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	   53     53  migration/6        0.20     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	 1482   1482  sshd               0.19     0.00     0.00     0.00    0.00     0.00     0.00     0.00
-	   39     39  cpuhp/4            0.19     0.00     0.00     0.00    0.00     0.00     0.00     0.00
+	  PID   TGID  COMMAND          CPU(ms)  IO(ms) SWAP(ms) RCL(ms) THR(ms) CMP(ms)  WP(ms) IRQ(ms)
+	----------------------------------------------------------------------------------------------
+	  161    161  zombie_memcg_re   1.40    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	  130    130  blkcg_punt_bio    1.37    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	  444    444  scsi_tmf_0        0.73    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	 1280   1280  rsyslogd          0.53    0.04    0.00    0.00    0.00    0.00    0.00    0.00
+	   12     12  ksoftirqd/0       0.47    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	 1277   1277  nbd-server        0.44    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	  308    308  kworker/2:2-sys   0.41    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	   55     55  netns             0.36    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	 1187   1187  acpid             0.31    0.03    0.00    0.00    0.00    0.00    0.00    0.00
+	 6184   6184  kworker/1:2-sys   0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	  186    186  kaluad            0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	   18     18  ksoftirqd/1       0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	  185    185  kmpath_rdacd      0.23    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	  190    190  kstrp             0.23    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	 2759   2759  agetty            0.20    0.03    0.00    0.00    0.00    0.00    0.00    0.00
+	 1190   1190  kworker/0:3-sys   0.19    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	 1272   1272  sshd              0.15    0.04    0.00    0.00    0.00    0.00    0.00    0.00
+	 1156   1156  license           0.15    0.11    0.00    0.00    0.00    0.00    0.00    0.00
+	  134    134  md                0.13    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+	 6142   6142  kworker/3:2-xfs   0.13    0.00    0.00    0.00    0.00    0.00    0.00    0.00
 
 Dynamic interactive interface of delaytop::
 
diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c
index cd848af9a856..9afb1ffc00ba 100644
--- a/tools/accounting/delaytop.c
+++ b/tools/accounting/delaytop.c
@@ -1,16 +1,16 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * delaytop.c - task delay monitoring tool.
+ * delaytop.c - system-wide delay monitoring tool.
  *
  * This tool provides real-time monitoring and statistics of
  * system, container, and task-level delays, including CPU,
- * memory, IO, and IRQ and delay accounting. It supports both
- * interactive (top-like), and can output delay information
- * for the whole system, specific containers (cgroups), or
- * individual tasks (PIDs).
+ * memory, IO, and IRQ. It supports both interactive (top-like),
+ * and can output delay information for the whole system, specific
+ * containers (cgroups), or individual tasks (PIDs).
  *
  * Key features:
  *	- Collects per-task delay accounting statistics via taskstats.
+ *	- Collects system-wide PSI information.
  *	- Supports sorting, filtering.
  *	- Supports both interactive (screen refresh).
  *
@@ -32,6 +32,7 @@
 #include <time.h>
 #include <dirent.h>
 #include <ctype.h>
+#include <stdbool.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/socket.h>
@@ -41,7 +42,6 @@
 #include <linux/genetlink.h>
 #include <linux/taskstats.h>
 #include <linux/cgroupstats.h>
-#include <ncurses.h>
 
 #define PSI_CPU_SOME "/proc/pressure/cpu"
 #define PSI_CPU_FULL	"/proc/pressure/cpu"
@@ -62,6 +62,12 @@
 #define MAX_MSG_SIZE	1024
 #define MAX_TASKS		1000
 #define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field
+#define BOOL_FPRINT(stream, fmt, ...) \
+({ \
+	int ret = fprintf(stream, fmt, ##__VA_ARGS__); \
+	ret >= 0; \
+})
+#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
 
 /* Program settings structure */
 struct config {
@@ -262,6 +268,7 @@ static int create_nl_socket(void)
 	local.nl_family = AF_NETLINK;
 
 	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) {
+		fprintf(stderr, "Failed to bind socket when create nl_socket\n");
 		close(fd);
 		return -1;
 	}
@@ -332,13 +339,17 @@ static int get_family_id(int sd)
 	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
 			CTRL_ATTR_FAMILY_NAME, (void *)name,
 			strlen(TASKSTATS_GENL_NAME)+1);
-	if (rc < 0)
+	if (rc < 0) {
+		fprintf(stderr, "Failed to send cmd for family id\n");
 		return 0;
+	}
 
 	rep_len = recv(sd, &ans, sizeof(ans), 0);
 	if (ans.n.nlmsg_type == NLMSG_ERROR ||
-		(rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
+		(rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) {
+		fprintf(stderr, "Failed to receive response for family id\n");
 		return 0;
+	}
 
 	na = (struct nlattr *) GENLMSG_DATA(&ans);
 	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
@@ -433,26 +444,30 @@ static void read_psi_stats(void)
 static int read_comm(int pid, char *comm_buf, size_t buf_size)
 {
 	char path[64];
+	int ret = -1;
 	size_t len;
 	FILE *fp;
 
 	snprintf(path, sizeof(path), "/proc/%d/comm", pid);
 	fp = fopen(path, "r");
-	if (!fp)
-		return -1;
+	if (!fp) {
+		fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
+		return ret;
+	}
+
 	if (fgets(comm_buf, buf_size, fp)) {
 		len = strlen(comm_buf);
 		if (len > 0 && comm_buf[len - 1] == '\n')
 			comm_buf[len - 1] = '\0';
-	} else {
-		fclose(fp);
-		return -1;
+		ret = 0;
 	}
+
 	fclose(fp);
-	return 0;
+
+	return ret;
 }
 
-static int fetch_and_fill_task_info(int pid, const char *comm)
+static void fetch_and_fill_task_info(int pid, const char *comm)
 {
 	struct {
 		struct nlmsghdr n;
@@ -466,13 +481,21 @@ static int fetch_and_fill_task_info(int pid, const char *comm)
 	int nl_len;
 	int rc;
 
+	/* Send request for task stats */
 	if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
 				 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
-		return -1;
+		fprintf(stderr, "Failed to send request for task stats\n");
+		return;
 	}
+
+	/* Receive response */
 	rc = recv(nl_sd, &resp, sizeof(resp), 0);
-	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR)
-		return -1;
+	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
+		fprintf(stderr, "Failed to receive response for task stats\n");
+		return;
+	}
+
+	/* Parse response */
 	nl_len = GENLMSG_PAYLOAD(&resp.n);
 	na = (struct nlattr *) GENLMSG_DATA(&resp);
 	while (nl_len > 0) {
@@ -515,7 +538,7 @@ static int fetch_and_fill_task_info(int pid, const char *comm)
 		nl_len -= NLA_ALIGN(na->nla_len);
 		na = NLA_NEXT(na);
 	}
-	return 0;
+	return;
 }
 
 static void get_task_delays(void)
@@ -654,54 +677,82 @@ static void display_results(void)
 {
 	time_t now = time(NULL);
 	struct tm *tm_now = localtime(&now);
-	char timestamp[32];
-	int i, count;
 	FILE *out = stdout;
+	char timestamp[32];
+	bool suc = true;
+	int i, count;
+
+	/* Clear terminal screen */
+	suc &= BOOL_FPRINT(out, "\033[H\033[J");
 
-	fprintf(out, "\033[H\033[J");
 	/* PSI output (one-line, no cat style) */
-	fprintf(out, "System Pressure Information: ");
-	fprintf(out, "(avg10/avg60/avg300/total)\n");
-	fprintf(out, "CPU:");
-	fprintf(out, "	full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu", psi.cpu_full_avg10,
-			psi.cpu_full_avg60, psi.cpu_full_avg300, psi.cpu_full_total);
-	fprintf(out, "  some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n", psi.cpu_some_avg10,
-			psi.cpu_some_avg60, psi.cpu_some_avg300, psi.cpu_some_total);
+	suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n");
+	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+		"CPU some:",
+		psi.cpu_some_avg10,
+		psi.cpu_some_avg60,
+		psi.cpu_some_avg300,
+		psi.cpu_some_total / 1000);
+	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+		"CPU full:",
+		psi.cpu_full_avg10,
+		psi.cpu_full_avg60,
+		psi.cpu_full_avg300,
+		psi.cpu_full_total / 1000);
+	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+		"Memory full:",
+		psi.memory_full_avg10,
+		psi.memory_full_avg60,
+		psi.memory_full_avg300,
+		psi.memory_full_total / 1000);
+	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+		"Memory some:",
+		psi.memory_some_avg10,
+		psi.memory_some_avg60,
+		psi.memory_some_avg300,
+		psi.memory_some_total / 1000);
+	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+		"IO full:",
+		psi.io_full_avg10,
+		psi.io_full_avg60,
+		psi.io_full_avg300,
+		psi.io_full_total / 1000);
+	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+		"IO some:",
+		psi.io_some_avg10,
+		psi.io_some_avg60,
+		psi.io_some_avg300,
+		psi.io_some_total / 1000);
+	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+		"IRQ full:",
+		psi.irq_full_avg10,
+		psi.irq_full_avg60,
+		psi.irq_full_avg300,
+		psi.irq_full_total / 1000);
 
-	fprintf(out, "Memory:");
-	fprintf(out, " full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu", psi.memory_full_avg10,
-			psi.memory_full_avg60, psi.memory_full_avg300, psi.memory_full_total);
-	fprintf(out, "  some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n", psi.memory_some_avg10,
-			psi.memory_some_avg60, psi.memory_some_avg300, psi.memory_some_total);
-
-	fprintf(out, "IO:");
-	fprintf(out, "	full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu", psi.io_full_avg10,
-			psi.io_full_avg60, psi.io_full_avg300, psi.io_full_total);
-	fprintf(out, "  some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n", psi.io_some_avg10,
-			psi.io_some_avg60, psi.io_some_avg300, psi.io_some_total);
-	fprintf(out, "IRQ:");
-	fprintf(out, "	full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n\n", psi.irq_full_avg10,
-			psi.irq_full_avg60, psi.irq_full_avg300, psi.irq_full_total);
 	if (cfg.container_path) {
-		fprintf(out, "Container Information (%s):\n", cfg.container_path);
-		fprintf(out, "Processes: running=%d, sleeping=%d, ",
+		suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
+		suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ",
 			container_stats.nr_running, container_stats.nr_sleeping);
-		fprintf(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
+		suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
 			container_stats.nr_stopped, container_stats.nr_uninterruptible,
 			container_stats.nr_io_wait);
 	}
-	fprintf(out, "Top %d processes (sorted by CPU delay):\n\n",
+	suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n",
 			cfg.max_processes);
-	fprintf(out, "  PID	TGID  COMMAND		 CPU(ms)  IO(ms)	");
-	fprintf(out, "SWAP(ms) RCL(ms) THR(ms)  CMP(ms)  WP(ms)  IRQ(ms)\n");
-	fprintf(out, "-----------------------------------------------");
-	fprintf(out, "----------------------------------------------\n");
+	suc &= BOOL_FPRINT(out, "%5s  %5s  %-17s", "PID", "TGID", "COMMAND");
+	suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n",
+		"CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)",
+		"THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)");
+
+	suc &= BOOL_FPRINT(out, "-----------------------------------------------");
+	suc &= BOOL_FPRINT(out, "----------------------------------------------\n");
 	count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
 
 	for (i = 0; i < count; i++) {
-		fprintf(out, "%5d  %5d  %-15s ",
+		suc &= BOOL_FPRINT(out, "%5d  %5d  %-15s",
 			tasks[i].pid, tasks[i].tgid, tasks[i].command);
-		fprintf(out, "%7.2f  %7.2f  %7.2f  %7.2f %7.2f  %7.2f  %7.2f  %7.2f\n",
+		suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n",
 			average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count),
 			average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count),
 			average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count),
@@ -712,7 +763,10 @@ static void display_results(void)
 			average_ms(tasks[i].irq_delay_total, tasks[i].irq_count));
 	}
 
-	fprintf(out, "\n");
+	suc &= BOOL_FPRINT(out, "\n");
+
+	if (!suc)
+		perror("Error writing to output");
 }
 
 /* Main function */

From b753522bed0b7e388a643f58d91bd81d8849ba43 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Sun, 27 Jul 2025 11:37:33 +0300
Subject: [PATCH 79/80] kho: add test for kexec handover

Testing kexec handover requires a kernel driver that will generate some
data and preserve it with KHO on the first boot and then restore that data
and verify it was preserved properly after kexec.

To facilitate such test, along with the kernel driver responsible for data
generation, preservation and restoration add a script that runs a kernel
in a VM with a minimal /init.  The /init enables KHO, loads a kernel image
for kexec and runs kexec reboot.  After the boot of the kexeced kernel,
the driver verifies that the data was properly preserved.

[rppt@kernel.org: fix section mismatch]
  Link: https://lkml.kernel.org/r/aIiRC8fXiOXKbPM_@kernel.org
Link: https://lkml.kernel.org/r/20250727083733.2590139-1-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS                            |   1 +
 lib/Kconfig.debug                      |  21 ++
 lib/Makefile                           |   1 +
 lib/test_kho.c                         | 305 +++++++++++++++++++++++++
 tools/testing/selftests/kho/arm64.conf |   9 +
 tools/testing/selftests/kho/init.c     | 100 ++++++++
 tools/testing/selftests/kho/vmtest.sh  | 183 +++++++++++++++
 tools/testing/selftests/kho/x86.conf   |   7 +
 8 files changed, 627 insertions(+)
 create mode 100644 lib/test_kho.c
 create mode 100644 tools/testing/selftests/kho/arm64.conf
 create mode 100644 tools/testing/selftests/kho/init.c
 create mode 100755 tools/testing/selftests/kho/vmtest.sh
 create mode 100644 tools/testing/selftests/kho/x86.conf

diff --git a/MAINTAINERS b/MAINTAINERS
index 87897a285bc1..d16d76f24c6e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13353,6 +13353,7 @@ F:	Documentation/admin-guide/mm/kho.rst
 F:	Documentation/core-api/kho/*
 F:	include/linux/kexec_handover.h
 F:	kernel/kexec_handover.c
+F:	tools/testing/selftests/kho/
 
 KEYS-ENCRYPTED
 M:	Mimi Zohar <zohar@linux.ibm.com>
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ebe33181b6e6..4f82d38e3c45 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -3225,6 +3225,27 @@ config TEST_OBJPOOL
 
 	  If unsure, say N.
 
+config TEST_KEXEC_HANDOVER
+	bool "Test for Kexec HandOver"
+	default n
+	depends on KEXEC_HANDOVER
+	help
+	  This option enables test for Kexec HandOver (KHO).
+	  The test consists of two parts: saving kernel data before kexec and
+	  restoring the data after kexec and verifying that it was properly
+	  handed over. This test module creates and saves data on the boot of
+	  the first kernel and restores and verifies the data on the boot of
+	  kexec'ed kernel.
+
+	  For detailed documentation about KHO, see Documentation/core-api/kho.
+
+	  To run the test run:
+
+	  tools/testing/selftests/kho/vmtest.sh -h
+
+	  If unsure, say N.
+
+
 config INT_POW_KUNIT_TEST
 	tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS
 	depends on KUNIT
diff --git a/lib/Makefile b/lib/Makefile
index 88d6228089a8..dadf0028b319 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_TEST_HMM) += test_hmm.o
 obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o
 obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o
 obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o
+obj-$(CONFIG_TEST_KEXEC_HANDOVER) += test_kho.o
 
 obj-$(CONFIG_TEST_FPU) += test_fpu.o
 test_fpu-y := test_fpu_glue.o test_fpu_impl.o
diff --git a/lib/test_kho.c b/lib/test_kho.c
new file mode 100644
index 000000000000..c2eb899c3b45
--- /dev/null
+++ b/lib/test_kho.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test module for KHO
+ * Copyright (c) 2025 Microsoft Corporation.
+ *
+ * Authors:
+ *   Saurabh Sengar <ssengar@microsoft.com>
+ *   Mike Rapoport <rppt@kernel.org>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/libfdt.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/vmalloc.h>
+#include <linux/kexec_handover.h>
+
+#include <net/checksum.h>
+
+#define KHO_TEST_MAGIC	0x4b484f21	/* KHO! */
+#define KHO_TEST_FDT	"kho_test"
+#define KHO_TEST_COMPAT "kho-test-v1"
+
+static long max_mem = (PAGE_SIZE << MAX_PAGE_ORDER) * 2;
+module_param(max_mem, long, 0644);
+
+struct kho_test_state {
+	unsigned int nr_folios;
+	struct folio **folios;
+	struct folio *fdt;
+	__wsum csum;
+};
+
+static struct kho_test_state kho_test_state;
+
+static int kho_test_notifier(struct notifier_block *self, unsigned long cmd,
+			     void *v)
+{
+	struct kho_test_state *state = &kho_test_state;
+	struct kho_serialization *ser = v;
+	int err = 0;
+
+	switch (cmd) {
+	case KEXEC_KHO_ABORT:
+		return NOTIFY_DONE;
+	case KEXEC_KHO_FINALIZE:
+		/* Handled below */
+		break;
+	default:
+		return NOTIFY_BAD;
+	}
+
+	err |= kho_preserve_folio(state->fdt);
+	err |= kho_add_subtree(ser, KHO_TEST_FDT, folio_address(state->fdt));
+
+	return err ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+static struct notifier_block kho_test_nb = {
+	.notifier_call = kho_test_notifier,
+};
+
+static int kho_test_save_data(struct kho_test_state *state, void *fdt)
+{
+	phys_addr_t *folios_info __free(kvfree) = NULL;
+	int err = 0;
+
+	folios_info = kvmalloc_array(state->nr_folios, sizeof(*folios_info),
+				     GFP_KERNEL);
+	if (!folios_info)
+		return -ENOMEM;
+
+	for (int i = 0; i < state->nr_folios; i++) {
+		struct folio *folio = state->folios[i];
+		unsigned int order = folio_order(folio);
+
+		folios_info[i] = virt_to_phys(folio_address(folio)) | order;
+
+		err = kho_preserve_folio(folio);
+		if (err)
+			return err;
+	}
+
+	err |= fdt_begin_node(fdt, "data");
+	err |= fdt_property(fdt, "nr_folios", &state->nr_folios,
+			    sizeof(state->nr_folios));
+	err |= fdt_property(fdt, "folios_info", folios_info,
+			    state->nr_folios * sizeof(*folios_info));
+	err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum));
+	err |= fdt_end_node(fdt);
+
+	return err;
+}
+
+static int kho_test_prepare_fdt(struct kho_test_state *state)
+{
+	const char compatible[] = KHO_TEST_COMPAT;
+	unsigned int magic = KHO_TEST_MAGIC;
+	ssize_t fdt_size;
+	int err = 0;
+	void *fdt;
+
+	fdt_size = state->nr_folios * sizeof(phys_addr_t) + PAGE_SIZE;
+	state->fdt = folio_alloc(GFP_KERNEL, get_order(fdt_size));
+	if (!state->fdt)
+		return -ENOMEM;
+
+	fdt = folio_address(state->fdt);
+
+	err |= fdt_create(fdt, fdt_size);
+	err |= fdt_finish_reservemap(fdt);
+
+	err |= fdt_begin_node(fdt, "");
+	err |= fdt_property(fdt, "compatible", compatible, sizeof(compatible));
+	err |= fdt_property(fdt, "magic", &magic, sizeof(magic));
+	err |= kho_test_save_data(state, fdt);
+	err |= fdt_end_node(fdt);
+
+	err |= fdt_finish(fdt);
+
+	if (err)
+		folio_put(state->fdt);
+
+	return err;
+}
+
+static int kho_test_generate_data(struct kho_test_state *state)
+{
+	size_t alloc_size = 0;
+	__wsum csum = 0;
+
+	while (alloc_size < max_mem) {
+		int order = get_random_u32() % NR_PAGE_ORDERS;
+		struct folio *folio;
+		unsigned int size;
+		void *addr;
+
+		/* cap allocation so that we won't exceed max_mem */
+		if (alloc_size + (PAGE_SIZE << order) > max_mem) {
+			order = get_order(max_mem - alloc_size);
+			if (order)
+				order--;
+		}
+		size = PAGE_SIZE << order;
+
+		folio = folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
+		if (!folio)
+			goto err_free_folios;
+
+		state->folios[state->nr_folios++] = folio;
+		addr = folio_address(folio);
+		get_random_bytes(addr, size);
+		csum = csum_partial(addr, size, csum);
+		alloc_size += size;
+	}
+
+	state->csum = csum;
+	return 0;
+
+err_free_folios:
+	for (int i = 0; i < state->nr_folios; i++)
+		folio_put(state->folios[i]);
+	return -ENOMEM;
+}
+
+static int kho_test_save(void)
+{
+	struct kho_test_state *state = &kho_test_state;
+	struct folio **folios __free(kvfree) = NULL;
+	unsigned long max_nr;
+	int err;
+
+	max_mem = PAGE_ALIGN(max_mem);
+	max_nr = max_mem >> PAGE_SHIFT;
+
+	folios = kvmalloc_array(max_nr, sizeof(*state->folios), GFP_KERNEL);
+	if (!folios)
+		return -ENOMEM;
+	state->folios = folios;
+
+	err = kho_test_generate_data(state);
+	if (err)
+		return err;
+
+	err = kho_test_prepare_fdt(state);
+	if (err)
+		return err;
+
+	return register_kho_notifier(&kho_test_nb);
+}
+
+static int kho_test_restore_data(const void *fdt, int node)
+{
+	const unsigned int *nr_folios;
+	const phys_addr_t *folios_info;
+	const __wsum *old_csum;
+	__wsum csum = 0;
+	int len;
+
+	node = fdt_path_offset(fdt, "/data");
+
+	nr_folios = fdt_getprop(fdt, node, "nr_folios", &len);
+	if (!nr_folios || len != sizeof(*nr_folios))
+		return -EINVAL;
+
+	old_csum = fdt_getprop(fdt, node, "csum", &len);
+	if (!old_csum || len != sizeof(*old_csum))
+		return -EINVAL;
+
+	folios_info = fdt_getprop(fdt, node, "folios_info", &len);
+	if (!folios_info || len != sizeof(*folios_info) * *nr_folios)
+		return -EINVAL;
+
+	for (int i = 0; i < *nr_folios; i++) {
+		unsigned int order = folios_info[i] & ~PAGE_MASK;
+		phys_addr_t phys = folios_info[i] & PAGE_MASK;
+		unsigned int size = PAGE_SIZE << order;
+		struct folio *folio;
+
+		folio = kho_restore_folio(phys);
+		if (!folio)
+			break;
+
+		if (folio_order(folio) != order)
+			break;
+
+		csum = csum_partial(folio_address(folio), size, csum);
+		folio_put(folio);
+	}
+
+	if (csum != *old_csum)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int kho_test_restore(phys_addr_t fdt_phys)
+{
+	void *fdt = phys_to_virt(fdt_phys);
+	const unsigned int *magic;
+	int node, len, err;
+
+	node = fdt_path_offset(fdt, "/");
+	if (node < 0)
+		return -EINVAL;
+
+	if (fdt_node_check_compatible(fdt, node, KHO_TEST_COMPAT))
+		return -EINVAL;
+
+	magic = fdt_getprop(fdt, node, "magic", &len);
+	if (!magic || len != sizeof(*magic))
+		return -EINVAL;
+
+	if (*magic != KHO_TEST_MAGIC)
+		return -EINVAL;
+
+	err = kho_test_restore_data(fdt, node);
+	if (err)
+		return err;
+
+	pr_info("KHO restore succeeded\n");
+	return 0;
+}
+
+static int __init kho_test_init(void)
+{
+	phys_addr_t fdt_phys;
+	int err;
+
+	err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys);
+	if (!err)
+		return kho_test_restore(fdt_phys);
+
+	if (err != -ENOENT) {
+		pr_warn("failed to retrieve %s FDT: %d\n", KHO_TEST_FDT, err);
+		return err;
+	}
+
+	return kho_test_save();
+}
+module_init(kho_test_init);
+
+static void kho_test_cleanup(void)
+{
+	for (int i = 0; i < kho_test_state.nr_folios; i++)
+		folio_put(kho_test_state.folios[i]);
+
+	kvfree(kho_test_state.folios);
+}
+
+static void __exit kho_test_exit(void)
+{
+	unregister_kho_notifier(&kho_test_nb);
+	kho_test_cleanup();
+}
+module_exit(kho_test_exit);
+
+MODULE_AUTHOR("Mike Rapoport <rppt@kernel.org>");
+MODULE_DESCRIPTION("KHO test module");
+MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/kho/arm64.conf b/tools/testing/selftests/kho/arm64.conf
new file mode 100644
index 000000000000..ee696807cd35
--- /dev/null
+++ b/tools/testing/selftests/kho/arm64.conf
@@ -0,0 +1,9 @@
+QEMU_CMD="qemu-system-aarch64 -M virt -cpu max"
+QEMU_KCONFIG="
+CONFIG_SERIAL_AMBA_PL010=y
+CONFIG_SERIAL_AMBA_PL010_CONSOLE=y
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+"
+KERNEL_IMAGE="Image"
+KERNEL_CMDLINE="console=ttyAMA0"
diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c
new file mode 100644
index 000000000000..8034e24c6bf6
--- /dev/null
+++ b/tools/testing/selftests/kho/init.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef NOLIBC
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/reboot.h>
+#endif
+
+/* from arch/x86/include/asm/setup.h */
+#define COMMAND_LINE_SIZE	2048
+
+/* from include/linux/kexex.h */
+#define KEXEC_FILE_NO_INITRAMFS	0x00000004
+
+#define KHO_FINILIZE "/debugfs/kho/out/finalize"
+#define KERNEL_IMAGE "/kernel"
+
+static int mount_filesystems(void)
+{
+	if (mount("debugfs", "/debugfs", "debugfs", 0, NULL) < 0)
+		return -1;
+
+	return mount("proc", "/proc", "proc", 0, NULL);
+}
+
+static int kho_enable(void)
+{
+	const char enable[] = "1";
+	int fd;
+
+	fd = open(KHO_FINILIZE, O_RDWR);
+	if (fd < 0)
+		return -1;
+
+	if (write(fd, enable, sizeof(enable)) != sizeof(enable))
+		return 1;
+
+	close(fd);
+	return 0;
+}
+
+static long kexec_file_load(int kernel_fd, int initrd_fd,
+			    unsigned long cmdline_len, const char *cmdline,
+			    unsigned long flags)
+{
+	return syscall(__NR_kexec_file_load, kernel_fd, initrd_fd, cmdline_len,
+		       cmdline, flags);
+}
+
+static int kexec_load(void)
+{
+	char cmdline[COMMAND_LINE_SIZE];
+	ssize_t len;
+	int fd, err;
+
+	fd = open("/proc/cmdline", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	len = read(fd, cmdline, sizeof(cmdline));
+	close(fd);
+	if (len < 0)
+		return -1;
+
+	/* replace \n with \0 */
+	cmdline[len - 1] = 0;
+	fd = open(KERNEL_IMAGE, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	err = kexec_file_load(fd, -1, len, cmdline, KEXEC_FILE_NO_INITRAMFS);
+	close(fd);
+
+	return err ? : 0;
+}
+
+int main(int argc, char *argv[])
+{
+	if (mount_filesystems())
+		goto err_reboot;
+
+	if (kho_enable())
+		goto err_reboot;
+
+	if (kexec_load())
+		goto err_reboot;
+
+	if (reboot(RB_KEXEC))
+		goto err_reboot;
+
+	return 0;
+
+err_reboot:
+	reboot(RB_AUTOBOOT);
+	return -1;
+}
diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh
new file mode 100755
index 000000000000..ec70a17bd476
--- /dev/null
+++ b/tools/testing/selftests/kho/vmtest.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -ue
+
+CROSS_COMPILE="${CROSS_COMPILE:-""}"
+
+test_dir=$(realpath "$(dirname "$0")")
+kernel_dir=$(realpath "$test_dir/../../../..")
+
+tmp_dir=$(mktemp -d /tmp/kho-test.XXXXXXXX)
+headers_dir="$tmp_dir/usr"
+initrd_dir="$tmp_dir/initrd"
+initrd="$tmp_dir/initrd.cpio"
+
+source "$test_dir/../kselftest/ktap_helpers.sh"
+
+function usage() {
+	cat <<EOF
+$0 [-d build_dir] [-j jobs] [-t target_arch] [-h]
+Options:
+	-d)	path to the kernel build directory
+	-j)	number of jobs for compilation, similar to -j in make
+	-t)	run test for target_arch, requires CROSS_COMPILE set
+		supported targets: aarch64, x86_64
+	-h)	display this help
+EOF
+}
+
+function cleanup() {
+	rm -fr "$tmp_dir"
+	ktap_finished
+}
+trap cleanup EXIT
+
+function skip() {
+	local msg=${1:-""}
+
+	ktap_test_skip "$msg"
+	exit "$KSFT_SKIP"
+}
+
+function fail() {
+	local msg=${1:-""}
+
+	ktap_test_fail "$msg"
+	exit "$KSFT_FAIL"
+}
+
+function build_kernel() {
+	local build_dir=$1
+	local make_cmd=$2
+	local arch_kconfig=$3
+	local kimage=$4
+
+	local kho_config="$tmp_dir/kho.config"
+	local kconfig="$build_dir/.config"
+
+	# enable initrd, KHO and KHO test in kernel configuration
+	tee "$kconfig" > "$kho_config" <<EOF
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_KEXEC_HANDOVER=y
+CONFIG_TEST_KEXEC_HANDOVER=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_VM=y
+$arch_kconfig
+EOF
+
+	make_cmd="$make_cmd -C $kernel_dir O=$build_dir"
+	$make_cmd olddefconfig
+
+	# verify that kernel confiration has all necessary options
+	while read -r opt ; do
+		grep "$opt" "$kconfig" &>/dev/null || skip "$opt is missing"
+	done < "$kho_config"
+
+	$make_cmd "$kimage"
+	$make_cmd headers_install INSTALL_HDR_PATH="$headers_dir"
+}
+
+function mkinitrd() {
+	local kernel=$1
+
+	mkdir -p "$initrd_dir"/{dev,debugfs,proc}
+	sudo mknod "$initrd_dir/dev/console" c 5 1
+
+	"$CROSS_COMPILE"gcc -s -static -Os -nostdinc -I"$headers_dir/include" \
+			-fno-asynchronous-unwind-tables -fno-ident -nostdlib \
+			-include "$test_dir/../../../include/nolibc/nolibc.h" \
+			-o "$initrd_dir/init" "$test_dir/init.c" \
+
+	cp "$kernel" "$initrd_dir/kernel"
+
+	pushd "$initrd_dir" &>/dev/null
+	find . | cpio -H newc --create > "$initrd" 2>/dev/null
+	popd &>/dev/null
+}
+
+function run_qemu() {
+	local qemu_cmd=$1
+	local cmdline=$2
+	local kernel=$3
+	local serial="$tmp_dir/qemu.serial"
+
+	cmdline="$cmdline kho=on panic=-1"
+
+	$qemu_cmd -m 1G -smp 2 -no-reboot -nographic -nodefaults \
+		  -accel kvm -accel hvf -accel tcg  \
+		  -serial file:"$serial" \
+		  -append "$cmdline" \
+		  -kernel "$kernel" \
+		  -initrd "$initrd"
+
+	grep "KHO restore succeeded" "$serial" &> /dev/null || fail "KHO failed"
+}
+
+function target_to_arch() {
+	local target=$1
+
+	case $target in
+	     aarch64) echo "arm64" ;;
+	     x86_64) echo "x86" ;;
+	     *) skip "architecture $target is not supported"
+	esac
+}
+
+function main() {
+	local build_dir="$kernel_dir/.kho"
+	local jobs=$(($(nproc) * 2))
+	local target="$(uname -m)"
+
+	# skip the test if any of the preparation steps fails
+	set -o errtrace
+	trap skip ERR
+
+	while getopts 'hd:j:t:' opt; do
+		case $opt in
+		d)
+			build_dir="$OPTARG"
+			;;
+		j)
+		        jobs="$OPTARG"
+			;;
+		t)
+			target="$OPTARG"
+			;;
+		h)
+			usage
+			exit 0
+			;;
+		*)
+			echo Unknown argument "$opt"
+			usage
+			exit 1
+			;;
+		esac
+	done
+
+	ktap_print_header
+	ktap_set_plan 1
+
+	if [[ "$target" != "$(uname -m)" ]] && [[ -z "$CROSS_COMPILE" ]]; then
+		skip "Cross-platform testing needs to specify CROSS_COMPILE"
+	fi
+
+	mkdir -p "$build_dir"
+	local arch=$(target_to_arch "$target")
+	source "$test_dir/$arch.conf"
+
+	# build the kernel and create initrd
+	# initrd includes the kernel image that will be kexec'ed
+	local make_cmd="make ARCH=$arch CROSS_COMPILE=$CROSS_COMPILE -j$jobs"
+	build_kernel "$build_dir" "$make_cmd" "$QEMU_KCONFIG" "$KERNEL_IMAGE"
+
+	local kernel="$build_dir/arch/$arch/boot/$KERNEL_IMAGE"
+	mkinitrd "$kernel"
+
+	run_qemu "$QEMU_CMD" "$KERNEL_CMDLINE" "$kernel"
+
+	ktap_test_pass "KHO succeeded"
+}
+
+main "$@"
diff --git a/tools/testing/selftests/kho/x86.conf b/tools/testing/selftests/kho/x86.conf
new file mode 100644
index 000000000000..b419e610ca22
--- /dev/null
+++ b/tools/testing/selftests/kho/x86.conf
@@ -0,0 +1,7 @@
+QEMU_CMD=qemu-system-x86_64
+QEMU_KCONFIG="
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+"
+KERNEL_IMAGE="bzImage"
+KERNEL_CMDLINE="console=ttyS0"

From 085dece6cc88b5c6fc6f2eca0403bfd2c5fbc7cb Mon Sep 17 00:00:00 2001
From: Fan Yu <fan.yu9@zte.com.cn>
Date: Thu, 31 Jul 2025 22:53:26 +0800
Subject: [PATCH 80/80] tools/getdelays: add backward compatibility for
 taskstats version

Add version checks to print_delayacct() to handle differences in struct
taskstats across kernel versions.  Field availability depends on taskstats
version (t->version), corresponding to TASKSTATS_VERSION in kernel headers
(see include/uapi/linux/taskstats.h).

Version feature mapping:
- version >= 11  - supports COMPACT statistics
- version >= 13  - supports WPCOPY statistics
- version >= 14  - supports IRQ statistics
- version >= 16  - supports *_max and *_min delay statistics

This ensures the tool works correctly with both older and newer kernel
versions by conditionally printing fields based on the reported version.

eg.1
bash# grep -r "#define TASKSTATS_VERSION" /usr/include/linux/taskstats.h
"#define TASKSTATS_VERSION       10"
bash# ./getdelays -d -p 1
CPU                 count     real total  virtual total    delay total  delay average
                     7481     3786181709     3807098291       36393725          0.005ms
IO                  count    delay total  delay average
                      369     1116046035          3.025ms
SWAP                count    delay total  delay average
                        0              0          0.000ms
RECLAIM             count    delay total  delay average
                        0              0          0.000ms
THRASHING           count    delay total  delay average
                        0              0          0.000ms

eg.2
bash# grep -r "#define TASKSTATS_VERSION" /usr/include/linux/taskstats.h
"#define TASKSTATS_VERSION       14"
bash# ./getdelays -d -p 1
CPU                 count     real total  virtual total    delay total  delay average
                    68862   163474790046   174584722267    19962496806          0.290ms
IO                  count    delay total  delay average
                        0              0          0.000ms
SWAP                count    delay total  delay average
                        0              0          0.000ms
RECLAIM             count    delay total  delay average
                        0              0          0.000ms
THRASHING           count    delay total  delay average
                        0              0          0.000ms
COMPACT             count    delay total  delay average
                        0              0          0.000ms
WPCOPY              count    delay total  delay average
                        0              0          0.000ms
IRQ                 count    delay total  delay average
                        0              0          0.000ms

Link: https://lkml.kernel.org/r/20250731225326549CttJ7g9NfjTlaqBwl015T@zte.com.cn
Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Cc: Fan Yu <fan.yu9@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/accounting/getdelays.c | 167 +++++++++++++++++++++--------------
 1 file changed, 100 insertions(+), 67 deletions(-)

diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 3feac0482fe9..21cb3c3d1331 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -194,75 +194,108 @@ static int get_family_id(int sd)
 #define average_ms(t, c) (t / 1000000ULL / (c ? c : 1))
 #define delay_ms(t) (t / 1000000ULL)
 
+/*
+ * Version compatibility note:
+ * Field availability depends on taskstats version (t->version),
+ * corresponding to TASKSTATS_VERSION in kernel headers
+ * see include/uapi/linux/taskstats.h
+ *
+ * Version feature mapping:
+ * version >= 11  - supports COMPACT statistics
+ * version >= 13  - supports WPCOPY statistics
+ * version >= 14  - supports IRQ statistics
+ * version >= 16  - supports *_max and *_min delay statistics
+ *
+ * Always verify version before accessing version-dependent fields
+ * to maintain backward compatibility.
+ */
+#define PRINT_CPU_DELAY(version, t) \
+	do { \
+		if (version >= 16) { \
+			printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \
+				"CPU", "count", "real total", "virtual total", \
+				"delay total", "delay average", "delay max", "delay min"); \
+			printf("          %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n", \
+				(unsigned long long)(t)->cpu_count, \
+				(unsigned long long)(t)->cpu_run_real_total, \
+				(unsigned long long)(t)->cpu_run_virtual_total, \
+				(unsigned long long)(t)->cpu_delay_total, \
+				average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \
+				delay_ms((double)(t)->cpu_delay_max), \
+				delay_ms((double)(t)->cpu_delay_min)); \
+		} else { \
+			printf("%-10s%15s%15s%15s%15s%15s\n", \
+				"CPU", "count", "real total", "virtual total", \
+				"delay total", "delay average"); \
+			printf("          %15llu%15llu%15llu%15llu%15.3fms\n", \
+				(unsigned long long)(t)->cpu_count, \
+				(unsigned long long)(t)->cpu_run_real_total, \
+				(unsigned long long)(t)->cpu_run_virtual_total, \
+				(unsigned long long)(t)->cpu_delay_total, \
+				average_ms((double)(t)->cpu_delay_total, (t)->cpu_count)); \
+		} \
+	} while (0)
+#define PRINT_FILED_DELAY(name, version, t, count, total, max, min) \
+	do { \
+		if (version >= 16) { \
+			printf("%-10s%15s%15s%15s%15s%15s\n", \
+				name, "count", "delay total", "delay average", \
+				"delay max", "delay min"); \
+			printf("          %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \
+				(unsigned long long)(t)->count, \
+				(unsigned long long)(t)->total, \
+				average_ms((double)(t)->total, (t)->count), \
+				delay_ms((double)(t)->max), \
+				delay_ms((double)(t)->min)); \
+		} else { \
+			printf("%-10s%15s%15s%15s\n", \
+				name, "count", "delay total", "delay average"); \
+			printf("          %15llu%15llu%15.3fms\n", \
+				(unsigned long long)(t)->count, \
+				(unsigned long long)(t)->total, \
+				average_ms((double)(t)->total, (t)->count)); \
+		} \
+	} while (0)
+
 static void print_delayacct(struct taskstats *t)
 {
-	printf("\n\nCPU   %15s%15s%15s%15s%15s%15s%15s\n"
-	       "      %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n"
-	       "IO    %15s%15s%15s%15s%15s\n"
-	       "      %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
-	       "SWAP  %15s%15s%15s%15s%15s\n"
-	       "      %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
-	       "RECLAIM  %12s%15s%15s%15s%15s\n"
-	       "      %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
-	       "THRASHING%12s%15s%15s%15s%15s\n"
-	       "      %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
-	       "COMPACT  %12s%15s%15s%15s%15s\n"
-	       "      %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
-	       "WPCOPY   %12s%15s%15s%15s%15s\n"
-	       "      %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
-	       "IRQ   %15s%15s%15s%15s%15s\n"
-	       "      %15llu%15llu%15.3fms%13.6fms%13.6fms\n",
-	       "count", "real total", "virtual total",
-	       "delay total", "delay average", "delay max", "delay min",
-	       (unsigned long long)t->cpu_count,
-	       (unsigned long long)t->cpu_run_real_total,
-	       (unsigned long long)t->cpu_run_virtual_total,
-	       (unsigned long long)t->cpu_delay_total,
-	       average_ms((double)t->cpu_delay_total, t->cpu_count),
-	       delay_ms((double)t->cpu_delay_max),
-	       delay_ms((double)t->cpu_delay_min),
-	       "count", "delay total", "delay average", "delay max", "delay min",
-	       (unsigned long long)t->blkio_count,
-	       (unsigned long long)t->blkio_delay_total,
-	       average_ms((double)t->blkio_delay_total, t->blkio_count),
-	       delay_ms((double)t->blkio_delay_max),
-	       delay_ms((double)t->blkio_delay_min),
-	       "count", "delay total", "delay average", "delay max", "delay min",
-	       (unsigned long long)t->swapin_count,
-	       (unsigned long long)t->swapin_delay_total,
-	       average_ms((double)t->swapin_delay_total, t->swapin_count),
-	       delay_ms((double)t->swapin_delay_max),
-	       delay_ms((double)t->swapin_delay_min),
-	       "count", "delay total", "delay average", "delay max", "delay min",
-	       (unsigned long long)t->freepages_count,
-	       (unsigned long long)t->freepages_delay_total,
-	       average_ms((double)t->freepages_delay_total, t->freepages_count),
-	       delay_ms((double)t->freepages_delay_max),
-	       delay_ms((double)t->freepages_delay_min),
-	       "count", "delay total", "delay average", "delay max", "delay min",
-	       (unsigned long long)t->thrashing_count,
-	       (unsigned long long)t->thrashing_delay_total,
-	       average_ms((double)t->thrashing_delay_total, t->thrashing_count),
-	       delay_ms((double)t->thrashing_delay_max),
-	       delay_ms((double)t->thrashing_delay_min),
-	       "count", "delay total", "delay average", "delay max", "delay min",
-	       (unsigned long long)t->compact_count,
-	       (unsigned long long)t->compact_delay_total,
-	       average_ms((double)t->compact_delay_total, t->compact_count),
-	       delay_ms((double)t->compact_delay_max),
-	       delay_ms((double)t->compact_delay_min),
-	       "count", "delay total", "delay average", "delay max", "delay min",
-	       (unsigned long long)t->wpcopy_count,
-	       (unsigned long long)t->wpcopy_delay_total,
-	       average_ms((double)t->wpcopy_delay_total, t->wpcopy_count),
-	       delay_ms((double)t->wpcopy_delay_max),
-	       delay_ms((double)t->wpcopy_delay_min),
-	       "count", "delay total", "delay average", "delay max", "delay min",
-	       (unsigned long long)t->irq_count,
-	       (unsigned long long)t->irq_delay_total,
-	       average_ms((double)t->irq_delay_total, t->irq_count),
-	       delay_ms((double)t->irq_delay_max),
-	       delay_ms((double)t->irq_delay_min));
+	printf("\n\n");
+
+	PRINT_CPU_DELAY(t->version, t);
+
+	PRINT_FILED_DELAY("IO", t->version, t,
+		blkio_count, blkio_delay_total,
+		blkio_delay_max, blkio_delay_min);
+
+	PRINT_FILED_DELAY("SWAP", t->version, t,
+		swapin_count, swapin_delay_total,
+		swapin_delay_max, swapin_delay_min);
+
+	PRINT_FILED_DELAY("RECLAIM", t->version, t,
+		freepages_count, freepages_delay_total,
+		freepages_delay_max, freepages_delay_min);
+
+	PRINT_FILED_DELAY("THRASHING", t->version, t,
+		thrashing_count, thrashing_delay_total,
+		thrashing_delay_max, thrashing_delay_min);
+
+	if (t->version >= 11) {
+		PRINT_FILED_DELAY("COMPACT", t->version, t,
+			compact_count, compact_delay_total,
+			compact_delay_max, compact_delay_min);
+	}
+
+	if (t->version >= 13) {
+		PRINT_FILED_DELAY("WPCOPY", t->version, t,
+			wpcopy_count, wpcopy_delay_total,
+			wpcopy_delay_max, wpcopy_delay_min);
+	}
+
+	if (t->version >= 14) {
+		PRINT_FILED_DELAY("IRQ", t->version, t,
+			irq_count, irq_delay_total,
+			irq_delay_max, irq_delay_min);
+	}
 }
 
 static void task_context_switch_counts(struct taskstats *t)