作为稳定性工程师,我们在分析死机问题时,尤其是watchdog触发的死机问题时,不可避免地会查看timer_list或者hrtimer,去确认喂狗线程的状态。
timer list dump
Timer List Dump
--------------------------------------------------------------------------------------------------------------------------------------
CPU 0
--------------------------------------------------------------------------------------------------------------------------------------
BASE_STD (tvec_base: ffffff82f2713240 timer_jiffies: 4297628107(10943.248s) next_timer: 4297628108(10943.252s) active_timers: NA)
+ vectors Timers (14)
INDEX TIMER_LIST_ADDR EXPIRES EXPIRES(s) FUNCTION WORK REMARKS
12 ffffffc01441bb78 4297628108 10943.252s process_timeout
23 ffffffc014413b78 4297628119 10943.296s process_timeout
66 ffffffc00a20cf20 4297628173 10943.512s delayed_work_timer_fn toggle_allocation_gate
117 ffffffc01f9aaf40 4297628580 10945.14s delayed_work_timer_fn msm_vidc_stats_handler[msm_video]
144 ffffff8003d52120 4297630694 10953.596s qcom_wdt_pet_task_wakeup[qcom_wdt_core]
170 ffffff805ff92238 4297628247 10943.808s delayed_work_timer_fn fg_monitor_workfunc[fg_bq28z610]
173 ffffff82f27130b8 4297628435 10944.56s delayed_work_timer_fn kfree_rcu_monitor
177 ffffff800262d750 4297628722 10945.708s blk_rq_timed_out_timer
182 ffffffc011b2bd38 4297629019 10946.896s process_timeout
185 ffffff80039e2a18 4297629218 10947.692s delayed_work_timer_fn wb_workfn
191 ffffff80cca00690 4297629610 10949.26s idletimer_tg_expired
213 ffffffc00f273d48 4297631896 10958.404s process_timeout
283 ffffffc011b0bd18 4297698154 11223.436s process_timeout
283 ffffffc00b3f3d18 4297698153 11223.432s process_timeout
BASE_DEF (tvec_base: ffffff82f27144c0 timer_jiffies: 4297628001(10942.824s) next_timer: 4297628256(10943.844s) active_timers: NA)
+ vectors Timers (6)
INDEX TIMER_LIST_ADDR EXPIRES EXPIRES(s) FUNCTION WORK REMARKS
76 ffffff82f2720fd8 4297628250 10943.82s delayed_work_timer_fn vmstat_update
76 ffffffc00a088258 4297628250 10943.82s delayed_work_timer_fn vmstat_shepherd
172 ffffffc00a1f27b0 4297628391 10944.384s writeout_period
218 ffffffc00a1c3388 4297634636 10969.364s wq_watchdog_timer_fn
266 ffffff8002728050 4297626482 10936.748s idle_worker_timeout
271 ffffff82f2722fd0 4297647479 11020.736s idle_worker_timeout
...
hrtimer info
hrtimer info:
CPU 0 hrtimer_bases v.v (struct hrtimer_cpu_base)0xffffff82f2715740
hrtimer_cpu_base 0xffffff82f2715780
hrtimer function _softexpires _softexpires
v.v (struct hrtimer *)0xffffff82f2715d28 0xffffffc00819b23c ('tick_sched_timer', 0) 10943240000000 10943240000000
v.v (struct hrtimer *)0xffffffc02f4a3d98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943250362854 10943250412854
v.v (struct hrtimer *)0xffffffc029a6bc98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943256967130 10943257017130
v.v (struct hrtimer *)0xffffffc04353bbf8 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943302770459 10943305270457
v.v (struct hrtimer *)0xffffff80470e8168 0xffffffc0088deb50 ('pm_suspend_timer_fn', 0) 10943287661708 10943350161708
v.v (struct hrtimer *)0xffffffc020bfbc98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943377049095 10943377099095
v.v (struct hrtimer *)0xffffffc01a07bbf8 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943320656661 10943384106653
v.v (struct hrtimer *)0xffffffc0202c3bf8 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943406283583 10943406475581
v.v (struct hrtimer *)0xffffffc02baabc98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943416557128 10943416607128
v.v (struct hrtimer *)0xffffffc0147b3d98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943436796292 10943436846292
v.v (struct hrtimer *)0xffffffc02be53c98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943497224311 10943497274311
v.v (struct hrtimer *)0xffffffc01b363bf8 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943514320666 10943514688663
v.v (struct hrtimer *)0xffffffc02bdc3c98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943523970146 10943524020146
v.v (struct hrtimer *)0xffffffc01a043bf8 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943692678375 10943693608372
v.v (struct hrtimer *)0xffffffc01a6e3bf8 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943752190302 10943754190301
v.v (struct hrtimer *)0xffffffc02c5ebc98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943756620199 10943756670199
v.v (struct hrtimer *)0xffffffc02b07bc98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943786910167 10943786960167
v.v (struct hrtimer *)0xffffffc01eab3c98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10943950481292 10943950531292
v.v (struct hrtimer *)0xffffffc00a1c5818 0xffffffc008108688 ('sched_rt_period_timer', 0) 10944008000000 10944008000000
v.v (struct hrtimer *)0xffffffc019013d98 0xffffffc0081876b8 ('hrtimer_wakeup', 0) 10944058200719 10944058250719
所以我们需要了解这两个是什么?干什么用的?在分析喂狗线程中的作用?以及如何从这些定时器信息中获取有效信息?
定时器的种类
在 Linux 内核中,定时器是实现延时执行和周期性任务的基础设施。随着系统对实时性要求的不断提高,内核提供了两种定时器机制:低精度定时器(基于 timer_list
)和高精度定时器(基于 hrtimer
)。本文将从机制原理、使用方法、性能对比等方面对两者进行深入剖析。
低精度定时器timer_list
实现机制
时钟节拍(tick):内核通过固定频率的时钟中断(由
CONFIG_HZ
决定,典型值有 100、250、300、1000)维护全局jiffies
计数。数据结构:
struct timer_list
与 定时轮(timer wheel)结合,管理注册的定时器。触发方式:到期时刻到达下一个时钟节拍,执行钩子函数,存在最多一个节拍周期的抖动。
核心数据结构
/*
 * Low-resolution timer object as quoted in this article. The
 * ANDROID_KABI_RESERVE() slots suggest the excerpt comes from an Android
 * common kernel tree -- confirm against the target kernel version.
 */
struct timer_list {
/*
 * All fields that change during normal runtime grouped to the
 * same cacheline
 */
/* List linkage; presumably the timer-wheel bucket list -- confirm. */
struct hlist_node entry;
/* Expiry time; code later in this file sets it as "jiffies + delay". */
unsigned long expires;
/* Expiry callback; it receives a pointer to the timer itself. */
void (*function)(struct timer_list *);
/* Flag bits (the article mentions TIMER_DEFERRABLE as one such flag). */
u32 flags;
#ifdef CONFIG_LOCKDEP
/* Only present when CONFIG_LOCKDEP is enabled. */
struct lockdep_map lockdep_map;
#endif
/* Reserved padding slots (Android KABI macro; semantics not shown here). */
ANDROID_KABI_RESERVE(1);
ANDROID_KABI_RESERVE(2);
};
核心api
下面是 timer_list
的核心 API 及其功能说明,汇总为表格形式:
简单的timer_list驱动
// timer_hello.c
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/jiffies.h>
/* Period of the demo timer; converted to jiffies where the timer is armed. */
#define INTERVAL_SEC 1 /* interval in seconds */
/* The single timer instance shared by the init, exit and callback functions. */
static struct timer_list hello_timer;
/*
 * Timer expiry callback.
 *
 * Logs a greeting and re-arms hello_timer so that it expires again
 * INTERVAL_SEC seconds from now; the re-arm on every expiry is what makes
 * the timer periodic.
 */
static void hello_timer_callback(struct timer_list *t)
{
	unsigned long next_expiry;

	pr_info("hello world\n");

	next_expiry = jiffies + msecs_to_jiffies(INTERVAL_SEC * 1000);
	mod_timer(&hello_timer, next_expiry);
}
/*
 * Module load entry point.
 *
 * Binds hello_timer_callback to hello_timer and arms the first expiry
 * INTERVAL_SEC seconds from now. Always returns 0.
 */
static int __init hello_timer_init(void)
{
	const unsigned int period_ms = INTERVAL_SEC * 1000;

	pr_info("hello_timer: initializing\n");

	/* No timer flags are requested (third argument is 0). */
	timer_setup(&hello_timer, hello_timer_callback, 0);
	mod_timer(&hello_timer, jiffies + msecs_to_jiffies(period_ms));

	return 0;
}
/*
 * Module unload entry point.
 *
 * Stops the periodic timer before the module text goes away.
 */
static void __exit hello_timer_exit(void)
{
	/*
	 * del_timer_sync() deactivates the timer and waits for a callback that
	 * may be executing on another CPU to finish. Its return value only
	 * tells whether the timer was still pending (queued) when it was
	 * removed; it says nothing about a callback having been running.
	 * Because the callback always re-arms the timer, "pending" is the
	 * normal case here, so it is reported at info level rather than as a
	 * warning.
	 */
	if (del_timer_sync(&hello_timer))
		pr_info("hello_timer: pending timer cancelled\n");

	pr_info("hello_timer: exiting\n");
}
/* Register the load/unload entry points of this module. */
module_init(hello_timer_init);
module_exit(hello_timer_exit);
/* Module metadata (license, author placeholder, one-line description). */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Your Name");
MODULE_DESCRIPTION("A simple timer_list example that prints 'hello world' every second");
每隔 1 秒,内核日志中就会输出一次 “hello world”。
高精度定时器hrtimer
实现机制
硬件时钟源
hrtimer
可绑定到多种高分辨率硬件时钟源,如 HPET、APIC timer 或 CPU TSC。软件结构
内核维护一棵红黑树(rbtree),按到期时间排序所有活跃的hrtimer
。触发流程
用户调用
hrtimer_start()
安排一个到期时间点(ktime_t
)。内核将其插入红黑树,并编程硬件定时器产生一次中断。
当硬件中断到来,内核默认在硬中断上下文(hrtimer_interrupt)中遍历红黑树中所有已到期的 hrtimer,依次调用它们的回调;只有以 HRTIMER_MODE_SOFT 方式启动的定时器才会被推迟到软中断(HRTIMER_SOFTIRQ)中执行。
精度与开销
精度可达几十—几百纳秒,抖动远小于 1 tick(如配置 HZ=1000 的 1ms)。
每个到期点会产生专属中断,频繁小定时器会带来较多中断和软中断开销。
核心数据结构
/*
 * High-resolution timer object as quoted in this article. Field notes below
 * follow the upstream kernel-doc for struct hrtimer -- verify against the
 * exact kernel tree in use.
 */
struct hrtimer {
/* Queue node; presumably carries the (hard) expiry used for ordering -- confirm. */
struct timerqueue_node node;
/* Soft expiry time; this is the "_softexpires" column printed in the hrtimer dump. */
ktime_t _softexpires;
/* Expiry callback; its return value selects restart vs. no restart. */
enum hrtimer_restart (*function)(struct hrtimer *);
/* Clock base this timer belongs to. */
struct hrtimer_clock_base *base;
/* State / mode bookkeeping bytes (exact semantics not shown in this excerpt). */
u8 state;
u8 is_rel;
u8 is_soft;
u8 is_hard;
/* Reserved padding slot (Android KABI macro; semantics not shown here). */
ANDROID_KABI_RESERVE(1);
};
核心api
简单的hrtimer定时器驱动
// hrtimer_hello.c
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
/*
 * Timer period in nanoseconds (1 s = 10^9 ns). Kept as an integer constant:
 * kernel code must not rely on floating-point literals, and the value is
 * passed to an unsigned long nanoseconds parameter.
 */
#define INTERVAL_NS 1000000000UL
/* The single hrtimer instance used by this example module. */
static struct hrtimer hello_hrtimer;
/* Period of the timer; filled in by the init function before the timer starts. */
static ktime_t hello_period;
/*
 * hrtimer expiry callback.
 *
 * Logs a message, advances the expiry by hello_period and returns
 * HRTIMER_RESTART so the timer stays queued (periodic behaviour).
 */
static enum hrtimer_restart hello_fn(struct hrtimer *timer)
{
	pr_info("hello hrtimer\n");

	/* The value returned by the forward helper is intentionally unused. */
	(void)hrtimer_forward_now(timer, hello_period);

	return HRTIMER_RESTART;
}
/*
 * Module load entry point.
 *
 * Computes the period, initialises hello_hrtimer on CLOCK_MONOTONIC in
 * relative mode, attaches hello_fn and starts the timer one period from
 * now. Always returns 0.
 */
static int __init hrtimer_hello_init(void)
{
	pr_info("hrtimer_hello: init\n");

	/* Period expressed as 0 seconds plus INTERVAL_NS nanoseconds. */
	hello_period = ktime_set(0, INTERVAL_NS);

	hrtimer_init(&hello_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hello_hrtimer.function = hello_fn;

	/* Relative mode: first expiry is "now + hello_period". */
	hrtimer_start(&hello_hrtimer, hello_period, HRTIMER_MODE_REL);

	return 0;
}
/*
 * Module unload entry point.
 *
 * Cancels the periodic hrtimer before the module text goes away.
 */
static void __exit hrtimer_hello_exit(void)
{
	/*
	 * hrtimer_cancel() waits for a running callback to finish and then
	 * removes the timer. A non-zero return only means the timer was still
	 * active (queued) when it was cancelled, not that the callback was
	 * executing. Since the callback always restarts the timer this is the
	 * normal case, so it is logged at info level with an accurate message.
	 */
	if (hrtimer_cancel(&hello_hrtimer))
		pr_info("hrtimer_hello: active timer cancelled\n");

	pr_info("hrtimer_hello: exit\n");
}
/* Register the load/unload entry points of this module. */
module_init(hrtimer_hello_init);
module_exit(hrtimer_hello_exit);
/* Module metadata (license, author placeholder, one-line description). */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Your Name");
MODULE_DESCRIPTION("An hrtimer example that prints every second");
基于硬件的watchdog timer
我们的目的其实就是学习 timer 在 watchdog 中的应用,所以在上面介绍完 timer_list 和 hrtimer 之后,本章就着重了解一下 watchdog timer。
详细代码流程见:https://www.iliuqi.com/archives/android-stability-016
watchdog timer的使用
kernel_platform/msm-kernel/drivers/soc/qcom/qcom_wdt_core.c
/*
 * Excerpt of the watchdog init path; the parts marked "//..." are elided in
 * this article, so only the timer setup is visible here.
 */
static int qcom_wdt_init(struct msm_watchdog_data *wdog_dd,
struct platform_device *pdev)
{
//...
/*
 * Bind the pet timer to its wake-up callback and arm it to expire
 * delay_time jiffies from now (delay_time is computed in the elided code).
 */
timer_setup(&wdog_dd->pet_timer, qcom_wdt_pet_task_wakeup, 0);
wdog_dd->pet_timer.expires = jiffies + delay_time;
add_timer(&wdog_dd->pet_timer);
/* The user-pet timer is only initialised here; the visible excerpt does not start it. */
timer_setup(&wdog_dd->user_pet_timer, qcom_wdt_user_pet_bite, 0);
//...
}
这里注册了两个定时器:pet_timer 和 user_pet_timer。其中 user_pet_timer 是提供给用户空间的一个接口,可以通过这个接口手动触发 watchdog bite:
void qcom_wdt_trigger_bite(void)
{
if (!wdog_data)
return;
compute_irq_count();
dev_err(wdog_data->dev, "Causing a QCOM Apps Watchdog bite!\n");
wdog_data->ops->show_wdt_status(wdog_data);
wdog_data->ops->set_bite_time(1, wdog_data);
wdog_data->ops->reset_wdt(wdog_data);
/* Delay to make sure bite occurs */
mdelay(10000);
/*
* This function induces the non-secure bite and control
* should not return to the calling function. Non-secure
* bite interrupt is affined to all the cores and it may
* not be handled by the same cores which configured
* non-secure bite. So add forever loop here.
*/
while (1)
udelay(1);
}
本章节着重理清另一个 timer 的流程,也就是 wdog_dd->pet_timer。
wdog_dd->pet_timer 在函数 qcom_wdt_init 中被初始化并设置了定时周期 delay_time,这个周期通过 CONFIG_QCOM_WATCHDOG_PET_TIME 配置!小米项目一般设置为15s。
pet_timer 的到期回调函数为 qcom_wdt_pet_task_wakeup(它只负责唤醒喂狗线程):
/*
 * pet_timer expiry callback.
 *
 * Recovers the owning msm_watchdog_data from the embedded pet_timer, marks
 * the timer as expired, timestamps the event with sched_clock() and wakes
 * whoever is waiting on pet_complete.
 */
static void qcom_wdt_pet_task_wakeup(struct timer_list *t)
{
	struct msm_watchdog_data *wdog = from_timer(wdog, t, pet_timer);

	wdog->timer_expired = true;
	wdog->timer_fired = sched_clock();

	wake_up(&wdog->pet_complete);
}
这个函数唤醒在等待队列 wdog_dd->pet_complete 上睡眠的任务,而在这个等待队列上等待的就是 watchdog 线程:
/*
 * Watchdog pet thread.
 *
 * @arg: the struct msm_watchdog_data instance this thread serves.
 *
 * Runs as a SCHED_FIFO task and, until asked to stop, repeats:
 *   1. sleep until timer_expired is set (retrying if the wait is interrupted);
 *   2. record thread_start and clear ping_start/ping_end of every present CPU;
 *   3. ping the other CPUs when do_ipi_ping is set;
 *   4. sleep until user_pet_complete is set;
 *   5. if the watchdog is enabled, reset it and record last_pet;
 *   6. re-arm pet_timer unless a freeze is in progress.
 *
 * Returns 0 once kthread_should_stop() is true.
 */
static __ref int qcom_wdt_kthread(void *arg)
{
	struct msm_watchdog_data *wdog_dd = arg;
	unsigned long delay_time = 0;
	struct sched_param param = {.sched_priority = MAX_RT_PRIO-1};
	int ret, cpu;

	/*
	 * The third argument must be the address of the local sched_param
	 * (&param); the article text had this expression garbled by a
	 * character-encoding error, which does not compile.
	 */
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (!kthread_should_stop()) {
		/* Wait for the pet timer callback; retry on a non-zero (interrupted) return. */
		do {
			ret = wait_event_interruptible(wdog_dd->pet_complete,
						wdog_dd->timer_expired);
		} while (ret != 0);

		wdog_dd->thread_start = sched_clock();
		for_each_cpu(cpu, cpu_present_mask)
			wdog_dd->ping_start[cpu] = wdog_dd->ping_end[cpu] = 0;

		if (wdog_dd->do_ipi_ping)
			qcom_wdt_ping_other_cpus(wdog_dd);

		/* Second gate: do not pet until the user-pet condition is satisfied. */
		do {
			ret = wait_event_interruptible(wdog_dd->pet_complete,
						wdog_dd->user_pet_complete);
		} while (ret != 0);

		wdog_dd->timer_expired = false;
		/* Stays true when user petting is disabled, so the gate above never blocks. */
		wdog_dd->user_pet_complete = !wdog_dd->user_pet_enabled;

		if (wdog_dd->enabled) {
			delay_time = msecs_to_jiffies(wdog_dd->pet_time);
			wdog_dd->ops->reset_wdt(wdog_dd);
			wdog_dd->last_pet = sched_clock();
		}

		/* Check again before scheduling
		 * Could have been changed on other cpu
		 */
		if (!kthread_should_stop()) {
			/* freeze_lock orders this re-arm against the suspend path setting freeze_in_progress. */
			spin_lock(&wdog_dd->freeze_lock);
			if (!wdog_dd->freeze_in_progress)
				mod_timer(&wdog_dd->pet_timer,
					  jiffies + delay_time);
			spin_unlock(&wdog_dd->freeze_lock);
		}

		record_irq_count();
	}

	return 0;
}
watchdog suspend/wakeup
作为一个timer,有一点无法避开,也就是suspend/wakeup时如何处理!
/*
 * Power-management callbacks of the watchdog device. The same suspend and
 * resume handlers are reused for the late-suspend/early-resume pair and for
 * the freeze/restore pair.
 */
static const struct dev_pm_ops qcom_soc_dev_pm_ops = {
/* The suspend_late/resume_early pair is only wired up when CONFIG_PM_SLEEP is set. */
#ifdef CONFIG_PM_SLEEP
.suspend_late = qcom_wdt_pet_suspend,
.resume_early = qcom_wdt_pet_resume,
#endif
/* The freeze_late/restore_early pair is registered unconditionally. */
.freeze_late = qcom_wdt_pet_suspend,
.restore_early = qcom_wdt_pet_resume,
};
在 Qualcomm 看门狗驱动中,上述 dev_pm_ops
定义了在系统进入 Suspend/Freeze 以及退出 Resume/Restore 时,各个阶段要调用的回调函数。具体对应关系和执行时序如下:
qcom_wdt_pet_suspend
/*
 * Suspend/freeze handler of the watchdog device.
 *
 * @dev: watchdog device whose driver data is the msm_watchdog_data.
 *
 * Stops the user-pet timer (when user petting is enabled), flags a freeze
 * in progress under freeze_lock, resets the watchdog once and stops the pet
 * timer. The hardware watchdog is then disabled unless a wake-up interrupt
 * is enabled and the transition is neither hibernate nor PM_SUSPEND_MEM.
 * last_pet is refreshed in every case.
 *
 * Returns 0 always (including when no driver data is attached).
 */
int qcom_wdt_pet_suspend(struct device *dev)
{
	struct msm_watchdog_data *wdog = dev_get_drvdata(dev);

	if (!wdog)
		return 0;

	if (wdog->user_pet_enabled)
		del_timer_sync(&wdog->user_pet_timer);

	spin_lock(&wdog->freeze_lock);
	wdog->freeze_in_progress = true;
	spin_unlock(&wdog->freeze_lock);

	wdog->ops->reset_wdt(wdog);
	del_timer_sync(&wdog->pet_timer);

	/*
	 * Without a wake-up interrupt the watchdog is always disabled; with
	 * one it is disabled only for hibernate or suspend-to-RAM.
	 */
	if (!wdog->wakeup_irq_enable || wdog->hibernate ||
	    pm_suspend_target_state == PM_SUSPEND_MEM) {
		wdog->ops->disable_wdt(wdog);
		wdog->enabled = false;
	}

	wdog->last_pet = sched_clock();
	return 0;
}
挂起前先同步删除所有定时器并“喂狗”一次,防止系统在进入低功耗后因定时器误触发或超时重启。
用
freeze_in_progress
标志阻止 kthread 在挂起期间重新编程主定时器。根据是否支持唤醒中断,以及挂起类型(休眠 vs 挂起),有条件地禁用硬件看门狗,并标记
enabled=false
。
qcom_wdt_pet_resume
/*
 * Resume/restore handler of the watchdog device.
 *
 * @dev: watchdog device whose driver data is the msm_watchdog_data.
 *
 * Re-arms the user-pet timer (bark_time + 3000 ms) when user petting is
 * enabled, re-arms the pet timer (pet_time ms) and clears the
 * freeze-in-progress flag under freeze_lock. The hardware watchdog is then
 * re-enabled and reset; with a wake-up interrupt the re-enable (including
 * bark/bite reprogramming) only happens for hibernate or PM_SUSPEND_MEM.
 * last_pet is refreshed in every case.
 *
 * Returns 0 always (including when no driver data is attached).
 */
int qcom_wdt_pet_resume(struct device *dev)
{
	struct msm_watchdog_data *wdog = dev_get_drvdata(dev);
	unsigned long timeout;
	uint32_t val;

	if (!wdog)
		return 0;

	val = BIT(EN);

	if (wdog->user_pet_enabled) {
		timeout = msecs_to_jiffies(wdog->bark_time + 3 * 1000);
		wdog->user_pet_timer.expires = jiffies + timeout;
		add_timer(&wdog->user_pet_timer);
	}

	timeout = msecs_to_jiffies(wdog->pet_time);

	spin_lock(&wdog->freeze_lock);
	wdog->pet_timer.expires = jiffies + timeout;
	add_timer(&wdog->pet_timer);
	wdog->freeze_in_progress = false;
	spin_unlock(&wdog->freeze_lock);

	if (!wdog->wakeup_irq_enable) {
		/* No wake-up interrupt: the watchdog was disabled on suspend, turn it back on. */
		wdog->ops->enable_wdt(val, wdog);
		wdog->ops->reset_wdt(wdog);
		wdog->enabled = true;
	} else {
		if (wdog->hibernate ||
		    pm_suspend_target_state == PM_SUSPEND_MEM) {
			/* Reprogram bark/bite times before enabling with the unmasked interrupt bit. */
			wdog->ops->set_bark_time(wdog->bark_time, wdog);
			wdog->ops->set_bite_time(wdog->bark_time + 10 * 1000, wdog);
			val |= BIT(UNMASKED_INT_EN);
			wdog->ops->enable_wdt(val, wdog);
			wdog->enabled = true;
		}
		wdog->ops->reset_wdt(wdog);
	}

	wdog->last_pet = sched_clock();
	return 0;
}
总的来说,qcom_wdt_pet_resume()
在系统从 Suspend/Freeze 恢复后,会依次重启(或新建)所有相关定时器,解除冻结,并根据硬件和配置重新启用并复位看门狗,使“等待→喂狗→重编程”循环无缝继续。
Timer list dump
作为稳定性工程师,timer_list dump我们其实并不陌生,大概的显示如下:
Timer List Dump
--------------------------------------------------------------------------------------------------------------------------------------
CPU 0
--------------------------------------------------------------------------------------------------------------------------------------
BASE_STD (tvec_base: ffffff887724c080 timer_jiffies: 4307926319(52136.096s) next_timer: 4307926528(52136.932s) active_timers: NA)
+ vectors Timers (5)
INDEX TIMER_LIST_ADDR EXPIRES EXPIRES(s) FUNCTION WORK REMARKS
137 ffffff87a20ec7f8 4307927569 52141.096s delayed_work_timer_fn battery_chg_dfx_monitor_work[qti_battery_charger]
137 ffffff805ba62460 4307927566 52141.084s delayed_work_timer_fn wb_workfn
184 ffffff87cdde38d8 4307926504 52136.836s neigh_timer_handler
237 ffffff8813f24708 4307933421 52164.504s tw_timer_handler
248 ffffffc082631058 4307939003 52186.832s delayed_work_timer_fn crng_reseed
BASE_DEF (tvec_base: ffffff887724d340 timer_jiffies: 4307926316(52136.084s) next_timer: 4307926592(52137.188s) active_timers: NA)
+ vectors Timers (10)
INDEX TIMER_LIST_ADDR EXPIRES EXPIRES(s) FUNCTION WORK REMARKS
92 ffffffc0825b2850 4307926750 52137.82s delayed_work_timer_fn vmstat_shepherd
131 ffffffc0826abbe8 4307927191 52139.584s delayed_work_timer_fn neigh_periodic_work
131 ffffffc0826a5320 4307927191 52139.584s delayed_work_timer_fn neigh_periodic_work
185 ffffffc0826a5408 4307926566 52137.084s delayed_work_timer_fn neigh_managed_work
185 ffffffc0826abcd0 4307926566 52137.084s delayed_work_timer_fn neigh_managed_work
230 ffffff8877a21a10 4307929726 52149.724s idle_worker_timeout
238 ffffffc08275b380 4307933816 52166.084s wq_watchdog_timer_fn
251 ffffff887725da10 4307940623 52193.312s idle_worker_timeout
287 ffffff8877e03a10 4307936006 52174.844s idle_worker_timeout
293 ffffff887725de88 4307960640 52273.38s idle_worker_timeout
--------------------------------------------------------------------------------------------------------------------------------------
CPU 1
--------------------------------------------------------------------------------------------------------------------------------------
....
下面对这段转储按字段含义和阅读流程做说明:
核心区别
唤醒行为
BASE_STD 下的任何定时器到期都会唤醒 CPU,确保定时器回调按时执行。
BASE_DEF 下的定时器不强制唤醒,只有在 CPU 本身已被其他事件唤醒时才会运行;如果长期空闲,可无限期延后。
节能特性
使用 BASE_DEF(通过
timer_setup_deferrable()
或在定时器初始化时设置TIMER_DEFERRABLE
标志)可以合并同一时间窗口内的非关键任务,减少平台从深度睡眠状态唤醒的次数,从而降低功耗。
配置方式
默认
timer_setup()
创建的定时器属于 BASE_STD。调用
timer_setup_deferrable()
或手动给struct timer_list
加上TIMER_DEFERRABLE
标志,则会被插入到 BASE_DEF 链表中。
关于vectors Timers的各列解释如下:
INDEX:该定时器所在的 timer wheel 桶下标(即 timer_base 中 vectors[] 数组的下标),因此同一个桶里的多个定时器会显示相同的 INDEX,用于调试定位。
TIMER_LIST_ADDR:这条定时器对象 struct timer_list 的地址。
EXPIRES / EXPIRES(s):下次触发的 jiffies 值及转换的秒数(相对于系统启动后)。
FUNCTION:回调函数名,触发时会执行此函数。
WORK:若该定时器用于驱动 Delayed Work,则显示对应的 work_struct 名称。
REMARKS:其他备注字段,若无特别信息则留空。
从上述我们已经了解到了watchdog timer的使用了,但是有一点当系统处于suspend时,由于
qcom_wdt_pet_suspend
会删除定时器,那在suspend中,难道系统就不会再喂狗了?答案是肯定的:suspend期间当然不会喂狗!
为什么呢?
因为在suspend的函数中,会停止计数器计数,既然计数器都停止了,对于系统来说时间就停止了!
基于软件的软看门狗机制
Linux 的软看门狗(Software Watchdog)是一种基于内核定时器的软件实现,用来在用户态或内核态失去响应时触发系统重启或其他恢复动作。它尤其适用于没有硬件看门狗或需要额外保护层的场景。下面从关键组件和工作流程来介绍:
这个怎么理解呢?举个例子来说明
假设:某个 CPU 核在内核态连续运行超过阈值(默认 20秒),无法切换到其他进程/线程。
/* Illustrative one-liners (not compilable code) of a CPU that never leaves kernel context. */
while (1) { /* endless loop in kernel context */ } // buggy driver code
spin_lock(&lock); for(;;); // spins forever while still holding the lock
/*
 * NOTE(review): with local interrupts disabled the per-CPU watchdog hrtimer
 * presumably cannot fire on this CPU, so this case may be reported by the
 * hard-lockup detector rather than as a soft lockup -- confirm.
 */
local_irq_disable(); while(1); // endless loop after disabling local interrupts
该 CPU 上的所有线程(包括关键守护进程)被"饿死",系统部分功能瘫痪。
而软看门狗就是来检测这样的场景的
软看门狗的基本原理
观看者—被观看者模式
“观看者”定期检查系统或应用状态;若在预定时间内没有收到“心跳”(keepalive),则视为系统失去响应,触发重启或回调。
基于高精度定时器
使用
hrtimer
(high-resolution timer)实现超时检测。
1. 核心组件
2. 检测流程
定时器到期
hrtimer 到期 → 触发watchdog_timer_fn
(在 中断上下文 执行)检查时间戳
比较当前时间now
和看门狗线程更新时间戳watchdog_touch_ts
:if (now - watchdog_touch_ts > softlockup_threshold) { /* 触发警报! */ }
结果处理
时间戳已更新:重置 hrtimer → 继续监控
时间戳未更新:
打印内核警告:
BUG: soft lockup - CPU#X stuck for 23s!
触发堆栈回溯(帮助定位阻塞点)
可配置为触发
panic()
使系统崩溃(需手动启用)
内核中的主要实现
软看门狗和soft lockup其实关系非常密切,它们在 Linux 内核中用于检测和处理系统中的软死锁问题(即某些内核线程或进程长时间没有响应,导致系统无法正常运行)。
下面介绍一些常见的debug的内核参数
常用的用于debug的内核参数
通过 /proc/sys/kernel
调整:
hrtimer info
在timer_list.txt中的hrtimer info解释
hrtimer info:
CPU 0 hrtimer_bases v.v (struct hrtimer_cpu_base)0xffffff887724e600
hrtimer_cpu_base 0xffffff887724e6c0
hrtimer function _softexpires _softexpires
v.v (struct hrtimer *)0xffffff887724f0b0 0xffffffc080206308 ('tick_sched_timer', 0) 52136120000000 52136120000000
v.v (struct hrtimer *)0xffffff887724f350 0xffffffc080246b80 ('watchdog_timer_fn', 0) 52140056000000 52140056000000
v.v (struct hrtimer *)0xffffffc0b0dfbc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52140466907210 52140466957210
v.v (struct hrtimer *)0xffffffc0b288bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52140849712578 52140849762578
v.v (struct hrtimer *)0xffffffc0b5263c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52143244442057 52143244492057
v.v (struct hrtimer *)0xffffffc0b5743c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52147396466222 52147396516222
v.v (struct hrtimer *)0xffffffc0b4e83c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52148067472002 52148067522002
v.v (struct hrtimer *)0xffffffc0b5903c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52152023883195 52152023933195
v.v (struct hrtimer *)0xffffffc0b6f1bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52159000004843 52159000054843
v.v (struct hrtimer *)0xffffffc0b7c6bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52159496489915 52159496539915
v.v (struct hrtimer *)0xffffffc09adf3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52164952366731 52164952416731
v.v (struct hrtimer *)0xffffffc0b9743c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52166107545069 52166107595069
v.v (struct hrtimer *)0xffffffc0b56bbc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52167617888766 52167617938766
v.v (struct hrtimer *)0xffffffc0b7c9bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52182007604461 52182007654461
v.v (struct hrtimer *)0xffffffc0b55bbc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52182165841596 52182165891596
v.v (struct hrtimer *)0xffffffc0b28e3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52204781708479 52204781758479
v.v (struct hrtimer *)0xffffffc0b308bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52221587812571 52221587862571
v.v (struct hrtimer *)0xffffffc0b21f3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52221833683099 52221833733099
v.v (struct hrtimer *)0xffffffc0b5a53c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52242753492308 52242753542308
v.v (struct hrtimer *)0xffffffc099b03d38 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52279756334238 52279756384238
v.v (struct hrtimer *)0xffffffc0b4913c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52347051698425 52347051748425
v.v (struct hrtimer *)0xffffffc0b13f3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52692070982210 52692071032210
v.v (struct hrtimer *)0xffffffc09f94bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52698064978614 52698065028614
v.v (struct hrtimer *)0xffffffc09f1ebc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52701970378820 52701970428820
v.v (struct hrtimer *)0xffffffc09dac3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52710075325485 52710075375485
v.v (struct hrtimer *)0xffffff87ba6f3a00 0xffffffc0804b1a04 ('timerfd_tmrproc', 0) 54053001235604 54053001235604
v.v (struct hrtimer *)0xffffffc09c4d3d30 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 4300382257786162 4300382257836162
v.v (struct hrtimer *)0xffffffc096f8bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 398883794239173412 398883794239223412
v.v (struct hrtimer *)0xffffffc09171bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 9223372036854775807 9223372036854775807
v.v (struct hrtimer *)0xffffffc0918e3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 9223372036854775807 9223372036854775807
v.v (struct hrtimer *)0xffffffc08d3b3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 9223372036854775807 9223372036854775807
hrtimer_cpu_base 0xffffff887724e740
hrtimer function _softexpires _softexpires
v.v (struct hrtimer *)0xffffff887724f0b0 0xffffffc080206308 ('tick_sched_timer', 0) 52136120000000 52136120000000
v.v (struct hrtimer *)0xffffff887724f350 0xffffffc080246b80 ('watchdog_timer_fn', 0) 52140056000000 52140056000000
v.v (struct hrtimer *)0xffffffc0b0dfbc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52140466907210 52140466957210
v.v (struct hrtimer *)0xffffffc0b288bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52140849712578 52140849762578
v.v (struct hrtimer *)0xffffffc0b5263c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52143244442057 52143244492057
v.v (struct hrtimer *)0xffffffc0b5743c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52147396466222 52147396516222
v.v (struct hrtimer *)0xffffffc0b4e83c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52148067472002 52148067522002
v.v (struct hrtimer *)0xffffffc0b5903c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52152023883195 52152023933195
v.v (struct hrtimer *)0xffffffc0b6f1bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52159000004843 52159000054843
v.v (struct hrtimer *)0xffffffc0b7c6bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52159496489915 52159496539915
v.v (struct hrtimer *)0xffffffc09adf3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52164952366731 52164952416731
v.v (struct hrtimer *)0xffffffc0b9743c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52166107545069 52166107595069
v.v (struct hrtimer *)0xffffffc0b56bbc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52167617888766 52167617938766
v.v (struct hrtimer *)0xffffffc0b7c9bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52182007604461 52182007654461
v.v (struct hrtimer *)0xffffffc0b55bbc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52182165841596 52182165891596
v.v (struct hrtimer *)0xffffffc0b28e3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52204781708479 52204781758479
v.v (struct hrtimer *)0xffffffc0b308bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52221587812571 52221587862571
v.v (struct hrtimer *)0xffffffc0b21f3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52221833683099 52221833733099
v.v (struct hrtimer *)0xffffffc0b5a53c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52242753492308 52242753542308
v.v (struct hrtimer *)0xffffffc099b03d38 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52279756334238 52279756384238
v.v (struct hrtimer *)0xffffffc0b4913c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52347051698425 52347051748425
v.v (struct hrtimer *)0xffffffc0b13f3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52692070982210 52692071032210
v.v (struct hrtimer *)0xffffffc09f94bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52698064978614 52698065028614
v.v (struct hrtimer *)0xffffffc09f1ebc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52701970378820 52701970428820
v.v (struct hrtimer *)0xffffffc09dac3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52710075325485 52710075375485
v.v (struct hrtimer *)0xffffff87ba6f3a00 0xffffffc0804b1a04 ('timerfd_tmrproc', 0) 54053001235604 54053001235604
v.v (struct hrtimer *)0xffffffc09c4d3d30 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 4300382257786162 4300382257836162
v.v (struct hrtimer *)0xffffffc096f8bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 398883794239173412 398883794239223412
v.v (struct hrtimer *)0xffffffc09171bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 9223372036854775807 9223372036854775807
v.v (struct hrtimer *)0xffffffc0918e3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 9223372036854775807 9223372036854775807
v.v (struct hrtimer *)0xffffffc08d3b3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 9223372036854775807 9223372036854775807
hrtimer_cpu_base 0xffffff887724e7c0
hrtimer function _softexpires _softexpires
v.v (struct hrtimer *)0xffffff887724f0b0 0xffffffc080206308 ('tick_sched_timer', 0) 52136120000000 52136120000000
v.v (struct hrtimer *)0xffffff887724f350 0xffffffc080246b80 ('watchdog_timer_fn', 0) 52140056000000 52140056000000
v.v (struct hrtimer *)0xffffffc0b0dfbc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52140466907210 52140466957210
v.v (struct hrtimer *)0xffffffc0b288bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52140849712578 52140849762578
v.v (struct hrtimer *)0xffffffc0b5263c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52143244442057 52143244492057
v.v (struct hrtimer *)0xffffffc0b5743c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52147396466222 52147396516222
v.v (struct hrtimer *)0xffffffc0b4e83c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52148067472002 52148067522002
v.v (struct hrtimer *)0xffffffc0b5903c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52152023883195 52152023933195
v.v (struct hrtimer *)0xffffffc0b6f1bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52159000004843 52159000054843
v.v (struct hrtimer *)0xffffffc0b7c6bc80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52159496489915 52159496539915
v.v (struct hrtimer *)0xffffffc09adf3c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52164952366731 52164952416731
v.v (struct hrtimer *)0xffffffc0b9743c80 0xffffffc0801f1adc ('hrtimer_wakeup', 0) 52166107545069 52166107595069
以下是各列和内容的解析
高通平台的watchdog debug
关于在debug watchdog触发的死机问题时,我们有通用的一般的方法,步骤见:
我这里介绍一下关于watchdog触发的判断依据以及小技巧。
查看dmesg_TZ.txt中解析出的watchdog部分,检查各CPU的ping_start/ping_end,与当前的系统时间做对比,判断ping cpu动作的执行结果以及可能出现问题的cpu。
查看timer_list.txt中的timer list dump,查看 qcom_wdt_pet_task_wakeup 在哪个CPU上,并将其回调的到期时间点与当前时间点做比较:如果到期时间早于当前的系统时间,说明定时器已经到期却没有被触发。再查看各个CPU上的timer_jiffies和next_timer,判断哪些CPU的时间戳没有被更新,这些CPU就是可怀疑对象!
如果watchdog定时器到了时间没有触发,则说明该CPU被阻塞了,去查当前CPU上的栈trace。
还可以检测hrtimer,看时间戳来判断是不是由于系统调度或者soft lockup等问题造成的。
在有必要的时候,也可以参考第3.3章节中的内核参数来抓取日志debug。
总结
pet_timer是个低精度定时器,低精度定时器会在每个tick中断到来时被检查,超时就触发软中断,而linux的软中断又会在中断返回时检查并执行。所以只要有CPU还能响应中断,pet_task_wakeup就能被执行;但是pet_task_wakeup也只是唤醒了喂狗线程而已,并没有直接去运行它,具体什么时候运行要看下次抢占点到来时pet-thread能不能成功抢占cpu执行。
watchdog_kthread作为一个实时进程,而且优先级最高,加上唤醒抢占的机制,一旦喂狗线程被加到某个CPU的就绪队列中,一般很快就会被执行,并且基本没有进程可以抢占它。
所以没法喂狗会发生在以下情况:
喂狗线程所在cpu长时间关抢占,导致即使pet_timer唤醒喂狗线程但得不到执行。
喂狗线程所在的cpu频繁被中断打断,导致没法按时喂狗。
其他cpu长时间处于关中断导致无法响应ipi中断。
所有cpu长时间处于关中断导致直接触发狗咬(pet_timer无法执行,狗叫中断无法响应)。