Mirror of https://github.com/torvalds/linux.git (synced 2025-08-15 14:11:42 +02:00)

In do_exit(), when a task is exiting, if an unwind is requested and the deferred user stacktrace is handled via task_work, the task_work callback runs after exit_mm() has been called in do_exit(). This means the user stack trace can no longer be retrieved and an empty stack is produced. Instead, add a function unwind_deferred_task_exit() and call it just before exit_mm() so that the unwinder can call the requested callbacks with the user space stack.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Jens Remus <jremus@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182406.504259474@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
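For context, a minimal sketch of the placement described above; this is not the actual diff, and the unrelated exit work in do_exit() (kernel/exit.c) is elided:

	void __noreturn do_exit(long code)
	{
		struct task_struct *tsk = current;

		/* ... earlier exit work ... */

		/*
		 * Run any pending deferred unwind callbacks while the user
		 * stack can still be faulted in, i.e. before the mm is
		 * released by exit_mm().
		 */
		unwind_deferred_task_exit(tsk);
		exit_mm();

		/* ... remaining exit work ... */
	}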
// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * To request a deferred user space stack trace from NMI context, the
 * architecture must support a safe cmpxchg in NMI context.
 * Architectures without that support cannot request a deferred user
 * space stack trace from NMI context; if they try, they get -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI		1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI		0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif
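/*
 * Illustrative note (not part of this file): an architecture advertises
 * an NMI-safe cmpxchg by selecting the Kconfig symbol tested above in its
 * arch Kconfig, e.g.:
 *
 *	config X86
 *		...
 *		select ARCH_HAVE_NMI_SAFE_CMPXCHG
 */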

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))

/* Guards adding to or removing from the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

#define RESERVED_BITS	(UNWIND_PENDING | UNWIND_USED)

/* Zero'd bits are available for assigning callback users */
static unsigned long unwind_mask = RESERVED_BITS;
DEFINE_STATIC_SRCU(unwind_srcu);
static inline bool unwind_pending(struct unwind_task_info *info)
{
	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
}

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID. In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie is an identifier for the stacktrace.
 * The stacktrace can, however, get another cookie if a new request is made
 * after the cookie was cleared and before reentering user space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per cpu counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non zero */

	return info->id.id;
}
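/*
 * Illustrative sketch (the real definition lives in
 * <linux/unwind_deferred_types.h>): the id member accessed above is
 * assumed to be a union so that the 32-bit CPU id and the 32-bit counter
 * together form the single 64-bit cookie handed to callbacks.  Field order
 * here is only for illustration.
 *
 *	union unwind_task_id {
 *		struct {
 *			u32	cpu;	// smp_processor_id() + 1, 0 == unset
 *			u32	cnt;	// per-CPU counter, LSB always set
 *		};
 *		u64	id;		// the context cookie
 *	};
 */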
/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementations,
 * @trace will be loaded with the addresses of the user space stacktrace
 * if it can be found.
 *
 * Return: 0 on success and negative on error
 *         On success @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (!current->mm)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;

	if (cache->nr_entries) {
		/*
		 * The user stack has already been previously unwound in this
		 * entry context. Skip the unwind and use the cache.
		 */
		trace->nr = cache->nr_entries;
		return 0;
	}

	trace->nr = 0;
	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	/* Clear nr_entries on way back to user space */
	set_bit(UNWIND_USED_BIT, &info->unwind_mask);

	return 0;
}
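/*
 * Illustrative only: a caller provides a stack-allocated descriptor and
 * must be in a context that may fault (e.g. task_work on the way back to
 * user space), as process_unwind_deferred() below does.  The consumer
 * function here is made up for the sketch:
 *
 *	struct unwind_stacktrace trace = { .nr = 0, .entries = NULL };
 *
 *	if (!unwind_user_faultable(&trace)) {
 *		for (unsigned int i = 0; i < trace.nr; i++)
 *			record_user_frame(trace.entries[i]);	// hypothetical
 *	}
 */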
static void process_unwind_deferred(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	unsigned long bits;
	u64 cookie;

	if (WARN_ON_ONCE(!unwind_pending(info)))
		return;

	/* Clear pending bit but make sure to have the current bits */
	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
					(atomic_long_t *)&info->unwind_mask);
	/*
	 * From here on out, the callback must always be called, even if it's
	 * just an empty trace.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	if (info->cache)
		bits &= ~(info->cache->unwind_completed);

	cookie = info->id.id;

	guard(srcu)(&unwind_srcu);
	list_for_each_entry_srcu(work, &callbacks, list,
				 srcu_read_lock_held(&unwind_srcu)) {
		if (test_bit(work->bit, &bits)) {
			work->func(work, &trace, cookie);
			if (info->cache)
				info->cache->unwind_completed |= BIT(work->bit);
		}
	}
}
static void unwind_deferred_task_work(struct callback_head *head)
{
	process_unwind_deferred(current);
}
void unwind_deferred_task_exit(struct task_struct *task)
{
	struct unwind_task_info *info = &current->unwind_info;

	if (!unwind_pending(info))
		return;

	process_unwind_deferred(task);

	task_work_cancel(task, &info->work);
}
/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function. It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context. Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will be called again with the same stack trace and cookie.
 *
 * Return: 0 if the callback was successfully queued.
 *         1 if the callback is pending or was already executed.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	unsigned long old, bits;
	unsigned long bit;
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
		return -EINVAL;

	/* Do not allow cancelled works to request again */
	bit = READ_ONCE(work->bit);
	if (WARN_ON_ONCE(bit < 0))
		return -EINVAL;

	/* Only need the mask now */
	bit = BIT(bit);

	guard(irqsave)();

	*cookie = get_cookie(info);

	old = READ_ONCE(info->unwind_mask);

	/* Is this already queued or executed */
	if (old & bit)
		return 1;

	/*
	 * This work's bit hasn't been set yet. Now set it with the PENDING
	 * bit and fetch the current value of unwind_mask. If either the
	 * work's bit or PENDING was already set, then this is already queued
	 * to have a callback.
	 */
	bits = UNWIND_PENDING | bit;
	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
	if (old & bits) {
		/*
		 * If the work's bit was set, whatever set it had better
		 * have also set pending and queued a callback.
		 */
		WARN_ON_ONCE(!(old & UNWIND_PENDING));
		return old & bit;
	}

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, TWA_RESUME);

	if (WARN_ON_ONCE(ret))
		WRITE_ONCE(info->unwind_mask, 0);

	return ret;
}
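/*
 * Illustrative usage (not part of this file): a tracer registers an
 * unwind_work once and then requests deferred traces from its event
 * context.  Everything except the unwind_* API below is made up for the
 * sketch.
 *
 *	static void my_unwind_cb(struct unwind_work *work,
 *				 struct unwind_stacktrace *trace, u64 cookie)
 *	{
 *		// Runs from task_work before returning to user space, or
 *		// from unwind_deferred_task_exit() if the task is exiting.
 *		// trace->entries[0..trace->nr) is keyed by @cookie.
 *	}
 *
 *	static struct unwind_work my_work;
 *
 *	// At tracer init:	unwind_deferred_init(&my_work, my_unwind_cb);
 *	// From event context:	u64 cookie;
 *	//			unwind_deferred_request(&my_work, &cookie);
 *	// At tracer teardown:	unwind_deferred_cancel(&my_work);
 */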
void unwind_deferred_cancel(struct unwind_work *work)
{
	struct task_struct *g, *t;
	int bit;

	if (!work)
		return;

	bit = work->bit;

	/* No work should be using a reserved bit */
	if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
		return;

	guard(mutex)(&callback_mutex);
	list_del_rcu(&work->list);

	/* Do not allow any more requests and prevent callbacks */
	work->bit = -1;

	__clear_bit(bit, &unwind_mask);

	synchronize_srcu(&unwind_srcu);

	guard(rcu)();
	/* Clear this bit from all threads */
	for_each_process_thread(g, t) {
		clear_bit(bit, &t->unwind_info.unwind_mask);
		if (t->unwind_info.cache)
			clear_bit(bit, &t->unwind_info.cache->unwind_completed);
	}
}
int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);

	/* See if there's a bit in the mask available */
	if (unwind_mask == ~0UL)
		return -EBUSY;

	work->bit = ffz(unwind_mask);
	__set_bit(work->bit, &unwind_mask);

	list_add_rcu(&work->list, &callbacks);
	work->func = func;
	return 0;
}
void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
	info->unwind_mask = 0;
}

void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}