mirror of
https://github.com/torvalds/linux.git
synced 2025-08-15 14:11:42 +02:00
kernel/watchdog: add /sys/kernel/{hard,soft}lockup_count
Patch series "sysfs: add counters for lockups and stalls", v2. Commits 9db89b4111
("exit: Expose "oops_count" to sysfs") and 8b05aa2633
("panic: Expose "warn_count" to sysfs") added counters for oopses and warnings to sysfs, and these two patches do the same for hard/soft lockups and RCU stalls. All of these counters are useful for monitoring tools to detect whether the machine is healthy. If the kernel has experienced a lockup or a stall, it's probably due to a kernel bug, and I'd like to detect that quickly and easily. There is currently no way to detect that, other than parsing dmesg. Or observing indirect effects: such as certain tasks not responding, but then I need to observe all tasks, and it may take a while until these effects become visible/measurable. I'd rather be able to detect the primary cause more quickly, possibly before everything falls apart. This patch (of 2): There is /proc/sys/kernel/hung_task_detect_count, /sys/kernel/warn_count and /sys/kernel/oops_count but there is no userspace-accessible counter for hard/soft lockups. Having this is useful for monitoring tools. Link: https://lkml.kernel.org/r/20250504180831.4190860-1-max.kellermann@ionos.com Link: https://lkml.kernel.org/r/20250504180831.4190860-2-max.kellermann@ionos.com Signed-off-by: Max Kellermann <max.kellermann@ionos.com> Cc: Corey Minyard <cminyard@mvista.com> Cc: Doug Anderson <dianders@chromium.org> Cc: Joel Granados <joel.granados@kernel.org> Cc: Song Liu <song@kernel.org> Cc: Kees Cook <kees@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
cc66e4863a
commit
aaf05e96e9
3 changed files with 67 additions and 0 deletions
7
Documentation/ABI/testing/sysfs-kernel-hardlockup_count
Normal file
7
Documentation/ABI/testing/sysfs-kernel-hardlockup_count
Normal file
|
@ -0,0 +1,7 @@
|
|||
What: /sys/kernel/hardlockup_count
|
||||
Date: May 2025
|
||||
KernelVersion: 6.16
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description:
|
||||
Shows how many times the system has detected a hard lockup since last boot.
|
||||
Available only if CONFIG_HARDLOCKUP_DETECTOR is enabled.
|
7
Documentation/ABI/testing/sysfs-kernel-softlockup_count
Normal file
7
Documentation/ABI/testing/sysfs-kernel-softlockup_count
Normal file
|
@ -0,0 +1,7 @@
|
|||
What: /sys/kernel/softlockup_count
|
||||
Date: May 2025
|
||||
KernelVersion: 6.16
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description:
|
||||
Shows how many times the system has detected a soft lockup since last boot.
|
||||
Available only if CONFIG_SOFTLOCKUP_DETECTOR is enabled.
|
|
@ -64,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
|
|||
*/
|
||||
unsigned int __read_mostly hardlockup_panic =
|
||||
IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
|
||||
/*
 * Hard lockup counter: incremented in watchdog_hardlockup_check() and
 * reported to userspace by hardlockup_count_show().
 */
static unsigned int hardlockup_count;
|
||||
|
||||
static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *page)
|
||||
{
|
||||
return sysfs_emit(page, "%u\n", hardlockup_count);
|
||||
}
|
||||
|
||||
/* Read-only kobject attribute "hardlockup_count", backed by hardlockup_count_show(). */
static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count);
|
||||
|
||||
static __init int kernel_hardlockup_sysfs_init(void)
|
||||
{
|
||||
sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
late_initcall(kernel_hardlockup_sysfs_init);
|
||||
|
||||
#endif /* CONFIG_SYSFS */
|
||||
|
||||
/*
|
||||
* We may not want to enable hard lockup detection by default in all cases,
|
||||
* for example when running the kernel as a guest on a hypervisor. In these
|
||||
|
@ -170,6 +193,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
|
|||
unsigned int this_cpu = smp_processor_id();
|
||||
unsigned long flags;
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
++hardlockup_count;
|
||||
#endif
|
||||
|
||||
/* Only print hardlockups once. */
|
||||
if (per_cpu(watchdog_hardlockup_warned, cpu))
|
||||
return;
|
||||
|
@ -312,6 +339,28 @@ unsigned int __read_mostly softlockup_panic =
|
|||
static bool softlockup_initialized __read_mostly;
|
||||
static u64 __read_mostly sample_period;
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
|
||||
/*
 * Soft lockup counter: incremented in watchdog_timer_fn() when a soft lockup
 * duration is detected, and reported to userspace by softlockup_count_show().
 */
static unsigned int softlockup_count;
|
||||
|
||||
static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *page)
|
||||
{
|
||||
return sysfs_emit(page, "%u\n", softlockup_count);
|
||||
}
|
||||
|
||||
/* Read-only kobject attribute "softlockup_count", backed by softlockup_count_show(). */
static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count);
|
||||
|
||||
static __init int kernel_softlockup_sysfs_init(void)
|
||||
{
|
||||
sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
late_initcall(kernel_softlockup_sysfs_init);
|
||||
|
||||
#endif /* CONFIG_SYSFS */
|
||||
|
||||
/* Timestamp taken after the last successful reschedule. */
|
||||
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
|
||||
/* Timestamp of the last softlockup report. */
|
||||
|
@ -743,6 +792,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
|
|||
touch_ts = __this_cpu_read(watchdog_touch_ts);
|
||||
duration = is_softlockup(touch_ts, period_ts, now);
|
||||
if (unlikely(duration)) {
|
||||
#ifdef CONFIG_SYSFS
|
||||
++softlockup_count;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Prevent multiple soft-lockup reports if one cpu is already
|
||||
* engaged in dumping all cpu back traces.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue