mirror of
https://github.com/torvalds/linux.git
synced 2025-08-15 14:11:42 +02:00

Inspired by mutex blocker tracking[1], this patch makes a trade-off to balance the overhead and utility of the hung task detector. Unlike mutexes, semaphores lack explicit ownership tracking, making it challenging to identify the root cause of hangs. To address this, we introduce a last_holder field to the semaphore structure, which is updated when a task successfully calls down() and cleared during up().

The assumption is that if a task is blocked on a semaphore, the holders must not have released it. While this does not guarantee that the last holder is one of the current blockers, it likely provides a practical hint for diagnosing semaphore-related stalls.

With this change, the hung task detector can now show the blocker task's info like below:

[Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked for more than 120 seconds.
[Tue Apr 8 12:19:07 2025] Tainted: G E 6.14.0-rc6+ #1
[Tue Apr 8 12:19:07 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Tue Apr 8 12:19:07 2025] task:cat state:D stack:0 pid:945 tgid:945 ppid:828 task_flags:0x400000 flags:0x00000000
[Tue Apr 8 12:19:07 2025] Call Trace:
[Tue Apr 8 12:19:07 2025] <TASK>
[Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0
[Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0
[Tue Apr 8 12:19:07 2025] schedule_timeout+0xe3/0xf0
[Tue Apr 8 12:19:07 2025] ? __folio_mod_stat+0x2a/0x80
[Tue Apr 8 12:19:07 2025] ? set_ptes.constprop.0+0x27/0x90
[Tue Apr 8 12:19:07 2025] __down_common+0x155/0x280
[Tue Apr 8 12:19:07 2025] down+0x53/0x70
[Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x23/0x60
[Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0
[Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350
[Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140
[Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30
[Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260
[Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0
[Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120
[Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f419478f46e
[Tue Apr 8 12:19:07 2025] RSP: 002b:00007fff1c4d2668 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f419478f46e
[Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f4194683000 RDI: 0000000000000003
[Tue Apr 8 12:19:07 2025] RBP: 00007f4194683000 R08: 00007f4194682010 R09: 0000000000000000
[Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000
[Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
[Tue Apr 8 12:19:07 2025] </TASK>
[Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked on a semaphore likely last held by task cat:938
[Tue Apr 8 12:19:07 2025] task:cat state:S stack:0 pid:938 tgid:938 ppid:584 task_flags:0x400000 flags:0x00000000
[Tue Apr 8 12:19:07 2025] Call Trace:
[Tue Apr 8 12:19:07 2025] <TASK>
[Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0
[Tue Apr 8 12:19:07 2025] ? _raw_spin_unlock_irqrestore+0xe/0x40
[Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0
[Tue Apr 8 12:19:07 2025] schedule_timeout+0x77/0xf0
[Tue Apr 8 12:19:07 2025] ? __pfx_process_timeout+0x10/0x10
[Tue Apr 8 12:19:07 2025] msleep_interruptible+0x49/0x60
[Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x2d/0x60
[Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0
[Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350
[Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140
[Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30
[Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260
[Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0
[Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120
[Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f7c584a646e
[Tue Apr 8 12:19:07 2025] RSP: 002b:00007ffdba8ce158 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f7c584a646e
[Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f7c5839a000 RDI: 0000000000000003
[Tue Apr 8 12:19:07 2025] RBP: 00007f7c5839a000 R08: 00007f7c58399010 R09: 0000000000000000
[Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000
[Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
[Tue Apr 8 12:19:07 2025] </TASK>

[1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com

Link: https://lkml.kernel.org/r/20250414145945.84916-3-ioworker0@gmail.com
Signed-off-by: Mingzhe Yang <mingzhe.yang@ly.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Suggested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: John Stultz <jstultz@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yongliang Gao <leonylgao@tencent.com>
Cc: Zi Li <amaindex@outlook.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
329 lines
8.7 KiB
C
329 lines
8.7 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright (c) 2008 Intel Corporation
|
|
* Author: Matthew Wilcox <willy@linux.intel.com>
|
|
*
|
|
* This file implements counting semaphores.
|
|
* A counting semaphore may be acquired 'n' times before sleeping.
|
|
* See mutex.c for single-acquisition sleeping locks which enforce
|
|
* rules which allow code to be debugged more easily.
|
|
*/
|
|
|
|
/*
|
|
* Some notes on the implementation:
|
|
*
|
|
* The spinlock controls access to the other members of the semaphore.
|
|
* down_trylock() and up() can be called from interrupt context, so we
|
|
* have to disable interrupts when taking the lock. It turns out various
|
|
* parts of the kernel expect to be able to use down() on a semaphore in
|
|
* interrupt context when they know it will succeed, so we have to use
|
|
* irqsave variants for down(), down_interruptible() and down_killable()
|
|
* too.
|
|
*
|
|
* The ->count variable represents how many more tasks can acquire this
|
|
* semaphore. If it's zero, there may be tasks waiting on the wait_list.
|
|
*/
|
|
|
|
#include <linux/compiler.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/export.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/debug.h>
|
|
#include <linux/sched/wake_q.h>
|
|
#include <linux/semaphore.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/ftrace.h>
|
|
#include <trace/events/lock.h>
|
|
#include <linux/hung_task.h>
|
|
|
|
static noinline void __down(struct semaphore *sem);
|
|
static noinline int __down_interruptible(struct semaphore *sem);
|
|
static noinline int __down_killable(struct semaphore *sem);
|
|
static noinline int __down_timeout(struct semaphore *sem, long timeout);
|
|
static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
|
|
|
|
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
|
|
static inline void hung_task_sem_set_holder(struct semaphore *sem)
|
|
{
|
|
WRITE_ONCE((sem)->last_holder, (unsigned long)current);
|
|
}
|
|
|
|
static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
|
|
{
|
|
if (READ_ONCE((sem)->last_holder) == (unsigned long)current)
|
|
WRITE_ONCE((sem)->last_holder, 0UL);
|
|
}
|
|
|
|
unsigned long sem_last_holder(struct semaphore *sem)
|
|
{
|
|
return READ_ONCE(sem->last_holder);
|
|
}
|
|
#else
|
|
static inline void hung_task_sem_set_holder(struct semaphore *sem)
{
	/* No-op without CONFIG_DETECT_HUNG_TASK_BLOCKER: nothing is recorded. */
}
|
|
static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
{
	/* No-op without CONFIG_DETECT_HUNG_TASK_BLOCKER: nothing to clear. */
}
|
|
/* Without CONFIG_DETECT_HUNG_TASK_BLOCKER no holder is tracked: report 0. */
unsigned long sem_last_holder(struct semaphore *sem)
{
	return 0;
}
|
|
#endif
|
|
|
|
static inline void __sem_acquire(struct semaphore *sem)
|
|
{
|
|
sem->count--;
|
|
hung_task_sem_set_holder(sem);
|
|
}
|
|
|
|
/**
|
|
* down - acquire the semaphore
|
|
* @sem: the semaphore to be acquired
|
|
*
|
|
* Acquires the semaphore. If no more tasks are allowed to acquire the
|
|
* semaphore, calling this function will put the task to sleep until the
|
|
* semaphore is released.
|
|
*
|
|
* Use of this function is deprecated, please use down_interruptible() or
|
|
* down_killable() instead.
|
|
*/
|
|
void __sched down(struct semaphore *sem)
|
|
{
|
|
unsigned long flags;
|
|
|
|
might_sleep();
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
|
if (likely(sem->count > 0))
|
|
__sem_acquire(sem);
|
|
else
|
|
__down(sem);
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
|
}
|
|
EXPORT_SYMBOL(down);
|
|
|
|
/**
|
|
* down_interruptible - acquire the semaphore unless interrupted
|
|
* @sem: the semaphore to be acquired
|
|
*
|
|
* Attempts to acquire the semaphore. If no more tasks are allowed to
|
|
* acquire the semaphore, calling this function will put the task to sleep.
|
|
* If the sleep is interrupted by a signal, this function will return -EINTR.
|
|
* If the semaphore is successfully acquired, this function returns 0.
|
|
*/
|
|
int __sched down_interruptible(struct semaphore *sem)
|
|
{
|
|
unsigned long flags;
|
|
int result = 0;
|
|
|
|
might_sleep();
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
|
if (likely(sem->count > 0))
|
|
__sem_acquire(sem);
|
|
else
|
|
result = __down_interruptible(sem);
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
|
|
|
return result;
|
|
}
|
|
EXPORT_SYMBOL(down_interruptible);
|
|
|
|
/**
|
|
* down_killable - acquire the semaphore unless killed
|
|
* @sem: the semaphore to be acquired
|
|
*
|
|
* Attempts to acquire the semaphore. If no more tasks are allowed to
|
|
* acquire the semaphore, calling this function will put the task to sleep.
|
|
* If the sleep is interrupted by a fatal signal, this function will return
|
|
* -EINTR. If the semaphore is successfully acquired, this function returns
|
|
* 0.
|
|
*/
|
|
int __sched down_killable(struct semaphore *sem)
|
|
{
|
|
unsigned long flags;
|
|
int result = 0;
|
|
|
|
might_sleep();
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
|
if (likely(sem->count > 0))
|
|
__sem_acquire(sem);
|
|
else
|
|
result = __down_killable(sem);
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
|
|
|
return result;
|
|
}
|
|
EXPORT_SYMBOL(down_killable);
|
|
|
|
/**
|
|
* down_trylock - try to acquire the semaphore, without waiting
|
|
* @sem: the semaphore to be acquired
|
|
*
|
|
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
|
|
* been acquired successfully or 1 if it cannot be acquired.
|
|
*
|
|
* NOTE: This return value is inverted from both spin_trylock and
|
|
* mutex_trylock! Be careful about this when converting code.
|
|
*
|
|
* Unlike mutex_trylock, this function can be used from interrupt context,
|
|
* and the semaphore can be released by any task or interrupt.
|
|
*/
|
|
int __sched down_trylock(struct semaphore *sem)
|
|
{
|
|
unsigned long flags;
|
|
int count;
|
|
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
|
count = sem->count - 1;
|
|
if (likely(count >= 0))
|
|
__sem_acquire(sem);
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
|
|
|
return (count < 0);
|
|
}
|
|
EXPORT_SYMBOL(down_trylock);
|
|
|
|
/**
|
|
* down_timeout - acquire the semaphore within a specified time
|
|
* @sem: the semaphore to be acquired
|
|
* @timeout: how long to wait before failing
|
|
*
|
|
* Attempts to acquire the semaphore. If no more tasks are allowed to
|
|
* acquire the semaphore, calling this function will put the task to sleep.
|
|
* If the semaphore is not released within the specified number of jiffies,
|
|
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
|
|
*/
|
|
int __sched down_timeout(struct semaphore *sem, long timeout)
|
|
{
|
|
unsigned long flags;
|
|
int result = 0;
|
|
|
|
might_sleep();
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
|
if (likely(sem->count > 0))
|
|
__sem_acquire(sem);
|
|
else
|
|
result = __down_timeout(sem, timeout);
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
|
|
|
return result;
|
|
}
|
|
EXPORT_SYMBOL(down_timeout);
|
|
|
|
/**
|
|
* up - release the semaphore
|
|
* @sem: the semaphore to release
|
|
*
|
|
* Release the semaphore. Unlike mutexes, up() may be called from any
|
|
* context and even by tasks which have never called down().
|
|
*/
|
|
void __sched up(struct semaphore *sem)
|
|
{
|
|
unsigned long flags;
|
|
DEFINE_WAKE_Q(wake_q);
|
|
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
|
|
|
hung_task_sem_clear_if_holder(sem);
|
|
|
|
if (likely(list_empty(&sem->wait_list)))
|
|
sem->count++;
|
|
else
|
|
__up(sem, &wake_q);
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
|
if (!wake_q_empty(&wake_q))
|
|
wake_up_q(&wake_q);
|
|
}
|
|
EXPORT_SYMBOL(up);
|
|
|
|
/* Functions for the contended case */
|
|
|
|
struct semaphore_waiter {
|
|
struct list_head list;
|
|
struct task_struct *task;
|
|
bool up;
|
|
};
|
|
|
|
/*
|
|
* Because this function is inlined, the 'state' parameter will be
|
|
* constant, and thus optimised away by the compiler. Likewise the
|
|
* 'timeout' parameter for the cases without timeouts.
|
|
*/
|
|
static inline int __sched ___down_common(struct semaphore *sem, long state,
|
|
long timeout)
|
|
{
|
|
struct semaphore_waiter waiter;
|
|
|
|
list_add_tail(&waiter.list, &sem->wait_list);
|
|
waiter.task = current;
|
|
waiter.up = false;
|
|
|
|
for (;;) {
|
|
if (signal_pending_state(state, current))
|
|
goto interrupted;
|
|
if (unlikely(timeout <= 0))
|
|
goto timed_out;
|
|
__set_current_state(state);
|
|
raw_spin_unlock_irq(&sem->lock);
|
|
timeout = schedule_timeout(timeout);
|
|
raw_spin_lock_irq(&sem->lock);
|
|
if (waiter.up) {
|
|
hung_task_sem_set_holder(sem);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
timed_out:
|
|
list_del(&waiter.list);
|
|
return -ETIME;
|
|
|
|
interrupted:
|
|
list_del(&waiter.list);
|
|
return -EINTR;
|
|
}
|
|
|
|
/*
 * Wrap the actual wait in hung-task blocker bookkeeping and contention
 * tracepoints.  The result of ___down_common() is passed through unchanged.
 */
static inline int __sched __down_common(struct semaphore *sem, long state,
					long timeout)
{
	int err;

	/* Mark this task as blocked on @sem for the duration of the wait. */
	hung_task_set_blocker(sem, BLOCKER_TYPE_SEM);
	trace_contention_begin(sem, 0);

	err = ___down_common(sem, state, timeout);

	trace_contention_end(sem, err);
	hung_task_clear_blocker();

	return err;
}
|
|
|
|
/*
 * Contended path of down(): wait in TASK_UNINTERRUPTIBLE with
 * MAX_SCHEDULE_TIMEOUT.  The result is discarded because down() returns void.
 */
static noinline void __sched __down(struct semaphore *sem)
{
	(void)__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
|
|
|
|
/*
 * Contended path of down_interruptible(): wait in TASK_INTERRUPTIBLE with
 * MAX_SCHEDULE_TIMEOUT and hand back whatever __down_common() reports.
 */
static noinline int __sched __down_interruptible(struct semaphore *sem)
{
	const long state = TASK_INTERRUPTIBLE;

	return __down_common(sem, state, MAX_SCHEDULE_TIMEOUT);
}
|
|
|
|
/*
 * Contended path of down_killable(): wait in TASK_KILLABLE with
 * MAX_SCHEDULE_TIMEOUT and hand back whatever __down_common() reports.
 */
static noinline int __sched __down_killable(struct semaphore *sem)
{
	const long state = TASK_KILLABLE;

	return __down_common(sem, state, MAX_SCHEDULE_TIMEOUT);
}
|
|
|
|
/*
 * Contended path of down_timeout(): wait in TASK_UNINTERRUPTIBLE for at most
 * @timeout and hand back whatever __down_common() reports.
 */
static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
{
	const long state = TASK_UNINTERRUPTIBLE;

	return __down_common(sem, state, timeout);
}
|
|
|
|
/*
 * Contended path of up(): hand the semaphore to the first waiter on
 * sem->wait_list.  The waiter is unlinked, flagged as served through ->up
 * and its task is added to @wake_q; sem->count is left untouched here.
 */
static noinline void __sched __up(struct semaphore *sem,
				  struct wake_q_head *wake_q)
{
	struct semaphore_waiter *head;

	head = list_first_entry(&sem->wait_list, struct semaphore_waiter, list);

	list_del(&head->list);
	head->up = true;
	wake_q_add(wake_q, head->task);
}
|