内核hung检测机制

Hung检测为内核提供的debug机制，用来检测系统是否存在长期处于TASK_UNINTERRUPTIBLE的进程。

Hung检测原理

内核线程khungtaskd定期扫描状态为TASK_UNINTERRUPTIBLE的进程：如果某个进程在sysctl_hung_task_timeout_secs秒内没有发生过调度（即上下文切换次数没有变化），则判定该进程处于hung状态，并打印该进程的堆栈等相关信息。

具体实现

/*
 * Boot-time setup for the hung task detector.
 *
 * Registers panic_block on panic_notifier_list, installs
 * hungtask_pm_notify as a power-management notifier, and starts the
 * "khungtaskd" kernel thread running watchdog(). Always returns 0.
 */
static int __init hung_task_init(void)
{
	/* Have panic_block called through the panic notifier chain. */
	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);

	/* Disable hung task detector on suspend */
	pm_notifier(hungtask_pm_notify, 0);

	/*
	 * Start the scanning thread.
	 * NOTE(review): the kthread_run() result is stored in watchdog_task
	 * without being checked for an error here.
	 */
	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");

	return 0;
}
/* Run hung_task_init() as a subsys-level initcall. */
subsys_initcall(hung_task_init);

通过subsys_initcall(hung_task_init)将hung_task_init注册到启动阶段的回调函数段中，
内核初始化阶段触发hung_task_init()回调，完成khungtaskd的初始化流程。

注册原子通知链回调，panic时回调被执行。注册panic回调的目的是在panic发生时置位did_panic；khungtaskd根据该值判断是否还需要执行hung检查，见check_hung_uninterruptible_tasks()：if (test_taint(TAINT_DIE) || did_panic) return;
创建内核进程khungtaskd执行watchdog(),循环遍历所有进程，根据一定周期内进程是否被调度来识别进程是否hung住。

/*
 * kthread which checks for tasks stuck in D state
 *
 * Thread function of "khungtaskd". Each loop iteration re-reads the
 * timeout and check-interval sysctls, so a changed value is picked up on
 * the next pass. @dummy is unused. The loop has no exit; the trailing
 * return only satisfies the thread-function signature.
 */
static int watchdog(void *dummy)
{
	/* jiffies value taken when the last check slot was handled */
	unsigned long last_scan = jiffies;

	/* Run this thread at nice level 0. */
	set_user_nice(current, 0);

	for (;;) {
		/* Seconds without a context switch before a task counts as hung. */
		unsigned long timeout_secs = sysctl_hung_task_timeout_secs;
		/* Seconds between two scans by this thread. */
		unsigned long period_secs = sysctl_hung_task_check_interval_secs;
		long remaining;

		/*
		 * An interval of 0 means "use the timeout". The interval is
		 * also capped at the timeout: scanning less often than the
		 * timeout would delay detection past the configured limit.
		 */
		if (period_secs == 0)
			period_secs = timeout_secs;
		period_secs = min_t(unsigned long, period_secs, timeout_secs);

		/*
		 * Derived from the last scan time and the interval (not the
		 * timeout). A positive result is used as the sleep length in
		 * jiffies; zero or negative means a scan is due now.
		 */
		remaining = hung_timeout_jiffies(last_scan, period_secs);
		if (remaining > 0) {
			schedule_timeout_interruptible(remaining);
			continue;
		}

		/*
		 * atomic_xchg() stores 0 into reset_hung_task and hands back
		 * the previous value, so the flag is always cleared here. The
		 * scan runs only if that previous value was 0 and the detector
		 * is not suspended.
		 */
		if (!atomic_xchg(&reset_hung_task, 0) &&
		    !hung_detector_suspended)
			check_hung_uninterruptible_tasks(timeout_secs);

		/* Restart the interval whether or not a scan actually ran. */
		last_scan = jiffies;
	}

	return 0;
}

/*
 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
 * a really long time (120 seconds). If that happens, print out
 * a warning.
 */
/*
 * Called from the khungtaskd thread (see watchdog()). @timeout is the
 * hang threshold in seconds and is passed through to check_hung_task().
 */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
	/* Upper bound on how many tasks one scan may visit. */
	int max_count = sysctl_hung_task_check_count;
	unsigned long last_break = jiffies;

	/*
	 * Iteration cursors for for_each_process_thread(): going by the
	 * macro's name, g walks processes and t walks each one's threads.
	 */
	struct task_struct *g, *t;

	/*
	 * If the system crashed already then all bets are off,
	 * do not report extra hung tasks:
	 */
	/*
	 * NOTE(review): did_panic is presumably set by the panic notifier
	 * (panic_block) registered in hung_task_init(); the callback itself
	 * is not shown here.
	 */
	if (test_taint(TAINT_DIE) || did_panic)
		return;

	hung_task_show_lock = false;
	rcu_read_lock();
	for_each_process_thread(g, t) {

		/* Stop once the per-scan task budget is used up. */
		if (!max_count--)
			goto unlock;

		/*
		 * Once more than HUNG_TASK_LOCK_BREAK jiffies have passed
		 * since the last break, call rcu_lock_break() and abandon
		 * the scan if it returns false.
		 * NOTE(review): rcu_lock_break() is not visible here;
		 * presumably it briefly leaves the RCU read-side section so a
		 * long scan does not hold it continuously, and reports whether
		 * g and t are still usable. Confirm against its definition.
		 */
		if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
			if (!rcu_lock_break(g, t))
				goto unlock;
			last_break = jiffies;
		}

		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
		/*
		 * The state must equal TASK_UNINTERRUPTIBLE exactly; merely
		 * having that bit set is not enough. timeout is the
		 * user-configurable hang threshold handed in by the caller.
		 */
		if (t->state == TASK_UNINTERRUPTIBLE)
			check_hung_task(t, timeout);
	}
 unlock:
	rcu_read_unlock();
	/*
	 * Both flags below are set by check_hung_task() during the scan and
	 * are acted on only here, after the RCU read-side section has ended.
	 */
	if (hung_task_show_lock)
		debug_show_all_locks();
	if (hung_task_call_panic) {
		trigger_all_cpu_backtrace();
		panic("hung_task: blocked tasks");
	}
}


/*
 * Core per-task check: decide whether @t has gone @timeout seconds
 * without a context switch and, if so, report it.
 *
 * Hang detection compares the task's total context-switch count with the
 * value saved in t->last_switch_count on an earlier scan. When the count
 * has changed, the saved count and t->last_switch_time are refreshed and
 * the task is considered alive.
 */
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
	/* Sum of the task's nvcsw and nivcsw context-switch counters. */
	unsigned long nr_switches = t->nvcsw + t->nivcsw;
	unsigned long deadline;

	/*
	 * Ensure the task is not frozen.
	 * Also, skip vfork and any other user process that freezer should skip.
	 */
	if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
		return;

	/*
	 * When a freshly created task is scheduled once, changes its state to
	 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
	 * mustn't be checked.
	 */
	if (unlikely(nr_switches == 0))
		return;

	/*
	 * The count moved since the last scan, so the task has been
	 * scheduled in between: record the new baseline and stop here.
	 */
	if (t->last_switch_count != nr_switches) {
		t->last_switch_count = nr_switches;
		t->last_switch_time = jiffies;
		return;
	}

	/*
	 * Same count as last time. That is a hang only once timeout seconds
	 * have passed since the baseline was recorded; while the deadline is
	 * still in the future, do nothing.
	 */
	deadline = t->last_switch_time + timeout * HZ;
	if (time_is_after_jiffies(deadline))
		return;

	/* From here on the task is treated as hung. */

	/* Tracepoint for the hang event. */
	trace_sched_process_hang(t);

	/*
	 * With the panic sysctl set, only request the lock dump and the
	 * panic here. check_hung_uninterruptible_tasks() acts on these two
	 * flags after it has finished walking the task list.
	 */
	if (sysctl_hung_task_panic) {
		console_verbose();
		hung_task_show_lock = true;
		hung_task_call_panic = true;
	}

	/*
	 * sysctl_hung_task_warnings is the remaining report budget:
	 * 0 suppresses the report, a positive value is decremented per
	 * report, and a negative value is left as is and so never runs out.
	 */
	if (!sysctl_hung_task_warnings)
		goto out;

	if (sysctl_hung_task_warnings > 0)
		sysctl_hung_task_warnings--;

	/*
	 * Ok, the task did not get scheduled for more than timeout seconds,
	 * complain:
	 */
	pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
		t->comm, t->pid, timeout);
	pr_err("      %s %s %.*s%s\n",
		print_tainted(), init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version,
		LINUX_PACKAGE_ID);
	pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\" disables this message.\n");
	sched_show_task(t);
	hung_task_show_lock = true;

out:
	/*
	 * NOTE(review): presumably here because the reporting above can take
	 * long enough to trip the NMI watchdog; confirm against the
	 * touch_nmi_watchdog() documentation.
	 */
	touch_nmi_watchdog();
}

posted on 2023-01-06 23:38 老僧非是爱花红阅读(319) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· 内核hung检测机制（2）

· 内核hung检测机制（3）

· Linux Hung Task分析【转】

· Hung task

· Softlockup&Hardlockup检测机制

老僧非是爱花红

导航

公告

统计

搜索

常用链接

我的标签

随笔分类

随笔档案

阅读排行榜

推荐排行榜

内核hung检测机制

Hung检测原理

具体实现