linux 3.10 一个扇区异常可能引发的hung
最近遇到一例3.10内核的crash:
[ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 4109.683440] fas_readwriter D ffff881fffd147c0 0 46745 43298 0x00000080 [ 4109.683444] ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8 [ 4109.683469] ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20 [ 4109.683487] ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28 [ 4109.683491] Call Trace: [ 4109.683503] [<ffffffff8163f959>] schedule_preempt_disabled+0x29/0x70 [ 4109.683507] [<ffffffff8163d415>] __mutex_lock_slowpath+0xc5/0x1c0 [ 4109.683512] [<ffffffff811eaff7>] ? unlazy_walk+0x87/0x140 [ 4109.683515] [<ffffffff8163c87f>] mutex_lock+0x1f/0x2f [ 4109.683522] [<ffffffff81636235>] lookup_slow+0x33/0xa7 [ 4109.683525] [<ffffffff811f0483>] path_lookupat+0x773/0x7a0 [ 4109.683531] [<ffffffff81274454>] ? SYSC_semtimedop+0x264/0xd10 [ 4109.683537] [<ffffffff810c0cbb>] ? should_numa_migrate_memory+0x5b/0x150 [ 4109.683542] [<ffffffff811c3345>] ? kmem_cache_alloc+0x35/0x1d0 [ 4109.683545] [<ffffffff811f117f>] ? getname_flags+0x4f/0x1a0 [ 4109.683548] [<ffffffff811f04db>] filename_lookup+0x2b/0xc0 [ 4109.683550] [<ffffffff811f22a7>] user_path_at_empty+0x67/0xc0 [ 4109.683556] [<ffffffff811993c7>] ? handle_mm_fault+0x607/0xf40 [ 4109.683559] [<ffffffff811f2311>] user_path_at+0x11/0x20 [ 4109.683564] [<ffffffff811e5b53>] vfs_fstatat+0x63/0xc0 [ 4109.683567] [<ffffffff811e60be>] SYSC_newstat+0x2e/0x60 [ 4109.683570] [<ffffffff81273ced>] ? do_smart_update+0x11d/0x130 [ 4109.683575] [<ffffffff8101c899>] ? read_tsc+0x9/0x10 [ 4109.683580] [<ffffffff8110d336>] ? __audit_syscall_exit+0x1e6/0x280 [ 4109.683583] [<ffffffff811e639e>] SyS_newstat+0xe/0x10 [ 4109.683586] [<ffffffff81649909>] system_call_fastpath+0x16/0x1b
看来在一个mutex上处于uninterrupt同步等待超过了时间。
看堆栈是stat调用访问文件的元数据,
crash> struct nameidata ffff881f320bfdc0 struct nameidata { path = { mnt = 0xffff883fcb7e2820, dentry = 0xffff881f502552c0 }, last = { { { hash = 3495247374, len = 27 }, hash_len = 119459364366 }, name = 0xffff881040c56031 "00fe_0002_011712b3_00000001"-----------访问的文件名 }, root = { mnt = 0xffff883fccff1020, dentry = 0xffff881fcc7ff980 }, inode = 0xffff881571308e78, flags = 1, seq = 2, last_type = 0, depth = 0, saved_names = {0x7f30b11923e8 <Address 0x7f30b11923e8 out of bounds>, 0xffff881f8a6a2c40 "g", 0xffff881f46cc0438 "", 0xffff881fce793e80 "\260\334\341W\037\210\377\377\310\354\242o\037\210\377\377\020E-D\037\210\377\377\200\231\001\201\377\377\377\377\020\363\031\201\377\377\377\377", 0x80 <Address 0x80 out of bounds>, 0xffff881f320bfec0 "\260\004|\a(\177", 0xffffffff811993c7 <handle_mm_fault+1543> "A\211\302\351\327\375\377\377\220I\211\370L\211\341H\211\332L\211\356L\211\367D\211M\300\350\310]\003", 0x5cd06a0 <Address 0x5cd06a0 out of bounds>, 0x0} }
files 看不到这个文件,说明该进程没有open 这个file。
要找到对应的mutex:
crash> struct nameidata.path ffff881f320bfdc0 path = { mnt = 0xffff883fcb7e2820, dentry = 0xffff881f502552c0 } crash> dentry.d_inode 0xffff881f502552c0 d_inode = 0xffff881571308e78 crash> struct -x dentry.d_inode struct dentry { [0x30] struct inode *d_inode; } crash> inode.i_mutex 0xffff881571308e78 i_mutex = { count = { counter = -1 }, wait_lock = { { rlock = { raw_lock = { { head_tail = 100009462, tickets = { head = 1526, tail = 1526 } } } } } }, wait_list = { next = 0xffff8819fc5b3c90, prev = 0xffff881f5097bc40 }, owner = 0xffff881f52764500,-------spinlock在没开启debug的情况下不会设置owner,而mutex一般看owner就知道谁持有 { osq = 0x0, __UNIQUE_ID_rh_kabi_hide1 = { spin_mlock = 0x0 }, {<No data fields>} } }
然后看一下owner为啥拿了互斥量不放:
task 0xffff881f52764500 PID: 47270 TASK: ffff881f52764500 CPU: 25 COMMAND: "nginx" struct task_struct { state = 2, stack = 0xffff881a5c19c000,
查看它的堆栈:
crash> bt -f 47270 PID: 47270 TASK: ffff881f52764500 CPU: 25 COMMAND: "nginx" #0 [ffff881a5c19f7f0] __schedule at ffffffff8163df9b ffff881a5c19f7f8: 0000000000000082 ffff881a5c19ffd8 ffff881a5c19f808: ffff881a5c19ffd8 ffff881a5c19ffd8 ffff881a5c19f818: 00000000000147c0 ffff881fd3267300 ffff881a5c19f828: ffff8813ee889a58 ffff8813ee889a60 ffff881a5c19f838: 7fffffffffffffff ffff881f52764500 ffff881a5c19f848: ffff881fd042b500 ffff881a5c19f860 ffff881a5c19f858: ffffffff8163e879 #1 [ffff881a5c19f858] schedule at ffffffff8163e879 ffff881a5c19f860: ffff881a5c19f908 ffffffff8163c329 #2 [ffff881a5c19f868] schedule_timeout at ffffffff8163c329 ffff881a5c19f870: 7fffffffffffffff ffff881a5c19f900 ffff881a5c19f880: 0000000000000000 0000000000000000 ffff881a5c19f890: 0000000000000000 ffff8813ee889980 ffff881a5c19f8a0: ffff881a5c19f8b8 ffffffff812cea44 ffff881a5c19f8b0: 0000000000000001 ffff881a5c19f968 ffff881a5c19f8c0: ffffffffa07a51c4 000010005c19f8d8 ffff881a5c19f8d0: 0000000000000000 00000000edb89915 ffff881a5c19f8e0: ffff8813ee889a58 ffff8813ee889a60 ffff881a5c19f8f0: 7fffffffffffffff ffff881f52764500 ffff881a5c19f900: ffff881fd042b500 ffff881a5c19f968 ffff881a5c19f910: ffffffff8163ec46 #3 [ffff881a5c19f910] wait_for_completion at ffffffff8163ec46 ffff881a5c19f918: 0000000000000001 ffff881f52764500 ffff881a5c19f928: ffffffff810b9930 ffff8813ee889a68 ffff881a5c19f938: ffff8813ee889a68 00000000edb89915 ffff881a5c19f948: ffff8813ee889980 0000000000000001 ffff881a5c19f958: ffffffffa07a6ce3 0000000000000000 ffff881a5c19f968: ffff881a5c19f990 ffffffffa07a6b74 #4 [ffff881a5c19f970] xfs_buf_submit_wait at ffffffffa07a6b74 [xfs]---------同步的buffer io提交函数 ffff881a5c19f978: ffff8813ee889980 0000000000000001-----------ffff8813ee889980 为xfs_buf ffff881a5c19f988: ffffffffa07d3829 ffff881a5c19f9a0 ffff881a5c19f998: ffffffffa07a6ce3 #5 [ffff881a5c19f998] _xfs_buf_read at ffffffffa07a6ce3 [xfs] ffff881a5c19f9a0: ffff881a5c19f9e0 ffffffffa07a6dda #6 [ffff881a5c19f9a8] xfs_buf_read_map at ffffffffa07a6dda [xfs]----------,走这个分支说明tp为NULL,根据map找xfs_buf,找不到则申请一个,扇区为1960485944,长度为8 ffff881a5c19f9b0: ffffffffa07e6cc0 0000000000000000 ffff881a5c19f9c0: 0000000000000000 ffff883f001f7800 ffff881a5c19f9d0: ffff881a5c19fa60 ffff881fd042b500 ffff881a5c19f9e0: ffff881a5c19fa20 ffffffffa07d3829 #7 [ffff881a5c19f9e8] xfs_trans_read_buf_map at ffffffffa07d3829 [xfs]----根据trans和xfs_buf_map调用xfs_buf_read_map 来获取xfs_buf ffff881a5c19f9f0: 00000000edb89915 ffff881a5c19fb10 ffff881a5c19fa00: 0000000000000000 ffff881571308cc0 ffff881a5c19fa10: ffffffffa07e6cc0 ffff881a5c19fa60 ffff881a5c19fa20: ffff881a5c19faa0 ffffffffa0786204 #8 [ffff881a5c19fa28] xfs_da_read_buf at ffffffffa0786204 [xfs] ffff881a5c19fa30: ffff881a5c19fa50 ffffffffa07e6cc0 ffff881a5c19fa40: 0000000000000000 00000001a07e6650 ffff881a5c19fa50: 0000000000000000 ffff881a5c19fa60 ffff881a5c19fa60: 0000000074daa438 ffff881a00000008 ffff881a5c19fa70: 00000000edb89915 0000000000000000 ffff881a5c19fa80: ffff881a5c19fb10 ffff8810cfbd1b00 ffff881a5c19fa90: 0000000000000016 000000000000006c ffff881a5c19faa0: ffff881a5c19fad0 ffffffffa078baa6 #9 [ffff881a5c19faa8] xfs_dir3_data_read at ffffffffa078baa6 [xfs] ffff881a5c19fab0: ffffffffa07e6cc0 ffff881f5a13ad00 ffff881a5c19fac0: ffff881f5a13ad00 ffff8814905453a0 ffff881a5c19fad0: ffff881a5c19fb58 ffffffffa078f99e #10 [ffff881a5c19fad8] xfs_dir2_leafn_lookup_for_entry at ffffffffa078f99e [xfs] ffff881a5c19fae0: 0000000172bc1800 ffff881f5a13ad58 ffff881a5c19faf0: ffff883f001f7800 ffff881571308cc0 ffff881a5c19fb00: 0000000000000000 ffff881400000016 ffff881a5c19fb10: 0000000000000000 0080001d00800010 ffff881a5c19fb20: ffff000501873dff 00000000edb89915 ffff881a5c19fb30: ffff881f5a13ad40 ffff881f5a13ad00 ffff881a5c19fb40: ffff881f5a13ad10 ffff8810cfbd1b00 ffff881a5c19fb50: ffff881f5a13ad58 ffff881a5c19fb68 ffff881a5c19fb60: ffffffffa0791177 #11 [ffff881a5c19fb60] xfs_dir2_leafn_lookup_int at ffffffffa0791177 [xfs] ffff881a5c19fb68: ffff881a5c19fbc8 ffffffffa0787726 #12 [ffff881a5c19fb70] xfs_da3_node_lookup_int at ffffffffa0787726 [xfs] ffff881a5c19fb78: ffff881a5c19fbdc ffff881a5c19fcb8 ffff881a5c19fb88: 0000000000000000 ffff000100643ebe ffff881a5c19fb98: 00000000edb89915 ffff8810cfbd1b00 ffff881a5c19fba8: 0000000000000000 ffff8810cfbd1b00 ffff881a5c19fbb8: ffff881f5a13ad00 ffff881a5c19fcb8 ffff881a5c19fbc8: ffff881a5c19fc08 ffffffffa0791ded #13 [ffff881a5c19fbd0] xfs_dir2_node_lookup at ffffffffa0791ded [xfs] ffff881a5c19fbd8: ffff881a5c19fc08 00000000edb89915 ffff881a5c19fbe8: ffff8810cfbd1b00 0000000000000000 ffff881a5c19fbf8: 0000000000000000 0000000000000000 ffff881a5c19fc08: ffff881a5c19fc58 ffffffffa07897b5 #14 [ffff881a5c19fc10] xfs_dir_lookup at ffffffffa07897b5 [xfs] ffff881a5c19fc18: ffff881a5c19fc68 0000000000000008 ffff881a5c19fc28: 00000000edb89915 0000000000000008 ffff881a5c19fc38: ffff881571308cc0 ffff881a5c19fcb8 ffff881a5c19fc48: 0000000000000000 ffff881a5c19fcb0 ffff881a5c19fc58: ffff881a5c19fca0 ffffffffa07b7bb6 #15 [ffff881a5c19fc60] xfs_lookup at ffffffffa07b7bb6 [xfs] ffff881a5c19fc68: ffff881a5c19fc90 00000000edb89915 ffff881a5c19fc78: ffff881fcd1038c0 0000000000008000 ffff881a5c19fc88: ffff881a5c19fdf0 ffff881f502552c0 ffff881a5c19fc98: ffff881a5c19fe50 ffff881a5c19fcd8 ffff881a5c19fca8: ffffffffa07b4a0b #16 [ffff881a5c19fca8] xfs_vn_lookup at ffffffffa07b4a0b [xfs] ffff881a5c19fcb0: ffff881a5c19fe60 ffff881fcd1038f8 ffff881a5c19fcc0: 000000000000001b 00000000edb89915 ffff881a5c19fcd0: ffff881fcd1038c0 ffff881a5c19fcf8 ffff881a5c19fce0: ffffffff811eac3d #17 [ffff881a5c19fce0] lookup_real at ffffffff811eac3d ffff881a5c19fce8: ffff881f502552c0 ffff881a5c19ff28 ffff881a5c19fcf8: ffff881a5c19fda8 ffffffff811ee813 #18 [ffff881a5c19fd00] do_last at ffffffff811ee813---------------do_last-->lookup_open-->lookup_real,对应2905行 ffff881a5c19fd08: ffff881a5c19ff58 ffffffff81274abd ffff881a5c19fd18: 0000000000000000 ffff881f52764500 ffff881a5c19fd28: 0000001f52764500 0000000000000000 ffff881a5c19fd38: ffff881571308e78 ffff88041076d000------------ ffff881571308e78对应的inode ffff881a5c19fd48: ffff881a5c19fde4 ffff881f502552c0-------------父目录的dentry ffff881a5c19fd58: ffff880aed4d5700 ffff880000000024 ffff881a5c19fd68: 01ff880aed4d5700 ffff881a5c19fd80 ffff881a5c19fd78: 00000000edb89915 ffff881a5c19fe50 ffff881a5c19fd88: ffff880aed4d5700 ffff88041076d000 ffff881a5c19fd98: ffff881a5c19ff28 ffff881f52764500 ffff881a5c19fda8: ffff881a5c19fe40 ffffffff811f0be2 #19 [ffff881a5c19fdb0] path_openat at ffffffff811f0be2 ffff881a5c19fdb8: ffffea00515463b0 0000000000000000 ffff881a5c19fdc8: 0000000000000200 ffff883fc04c7080 ffff881a5c19fdd8: 0000004100000000 00000000000000a9 ffff881a5c19fde8: 0000000000000000 00007f2c5676d000 ffff881a5c19fdf8: ffffea00d1d8c2c0 00000000edb89915 ffff881a5c19fe08: 00007f2c5676dff0 00000000edb89915 ffff881a5c19fe18: 00000000ffffff9c ffff88041076d000 ffff881a5c19fe28: ffff881a5c19ff28 0000000000000001 ffff881a5c19fe38: 00000000000036cc ffff881a5c19ff10 ffff881a5c19fe48: ffffffff811f23ab #20 [ffff881a5c19fe48] do_filp_open at ffffffff811f23ab ffff881a5c19fe50: ffff883fcb7e2820 ffff881f502552c0 ffff881a5c19fe60: 0000001bc5e44c0c ffff88041076d031 ffff881a5c19fe70: ffff883fccff1020 ffff881fcc7ff980 ffff881a5c19fe80: ffff881571308e78 0000000200000101 ffff881a5c19fe90: 0000000000000000 0000000000000001 ffff881a5c19fea0: 00007f2c64217510 0000000000000000 ffff881a5c19feb0: 0000000000000000 ffff881a5c19ff00 ffff881a5c19fec0: ffffffff811fef47 ffff883fc175c3c0 ffff881a5c19fed0: 0010000000008000 0000000000000001 ffff881a5c19fee0: 0000000000008000 00000000edb89915 ffff881a5c19fef0: 0000000000000001 00000000000000e3 ffff881a5c19ff00: 00000000ffffff9c ffff88041076d000 ffff881a5c19ff10: ffff881a5c19ff68 ffffffff811dfd53 #21 [ffff881a5c19ff18] do_sys_open at ffffffff811dfd53 ffff881a5c19ff20: ffff881f49292a48 ffff000000008000 ffff881a5c19ff30: 0000010000000024 00000000edb89915 ffff881a5c19ff40: 0000000000000000 00007f2c64217664 ffff881a5c19ff50: 0000000000000002 00007f2c64217653 ffff881a5c19ff60: 00000000000036cc ffff881a5c19ff78 ffff881a5c19ff70: ffffffff811dfe6e #22 [ffff881a5c19ff70] sys_open at ffffffff811dfe6e ffff881a5c19ff78: 00007f2c64217510 ffffffff81649909 #23 [ffff881a5c19ff80] system_call_fastpath at ffffffff81649909 RIP: 00007f2e31a3dc3d RSP: 00007f2c64217358 RFLAGS: 00010202 RAX: 0000000000000002 RBX: ffffffff81649909 RCX: 0000000000010000 RDX: 0000000000000005 RSI: 0000000000000000 RDI: 00007f2c64217510 RBP: 00007f2c64217510 R8: 000000000000ffff R9: 000000000000001f R10: 00007f2e30df7000 R11: 0000000000000293 R12: ffffffff811dfe6e R13: ffff881a5c19ff78 R14: 00000000000036cc R15: 00007f2c64217653 ORIG_RAX: 0000000000000002 CS: 0033 SS: 002b crash>
该进程在一个完成量上等待:
等待的完成量: crash>completion ffff8813ee889a58 struct completion { done = 0,----------------至今未完成,如果完成则为1,由complete 函数修改,并唤醒等待的task wait = { lock = { { rlock = { raw_lock = { { head_tail = 131074, tickets = { head = 2, tail = 2 } } } } } }, task_list = { next = 0xffff881a5c19f930, prev = 0xffff881a5c19f930 } } }
这个完成量其实就是在等待一个io的完成,
对应的调用链:
xfs_buf_read_map--》_xfs_buf_read--》xfs_buf_submit_wait-->_xfs_buf_ioapply-->xfs_buf_ioapply_map-->submit_bio
_xfs_buf_ioapply返回后,执行wait_for_completion,然后就一直等着了
查看对应的xfs_buf:
crash> xfs_buf 0xffff8813ee889980 struct xfs_buf { b_rbnode = { __rb_parent_color = 18446612139147634816, rb_right = 0x0, rb_left = 0x0 }, b_bn = 1960485944,----------扇区号 b_length = 8, b_hold = { counter = 2 }, b_lru_ref = { counter = 1 }, b_flags = 1048577,也就是0x100001,io未完成,没有置为异步标志:#define XBF_ASYNC (1 << 4),#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */,说明需要等待的同步io b_sema = { lock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } }, count = 0, wait_list = { next = 0xffff8813ee8899b8, prev = 0xffff8813ee8899b8 } }, b_lru = { next = 0xffff8813ee8899c8, prev = 0xffff8813ee8899c8 }, b_lru_flags = 0, b_lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } }, b_io_error = 0, b_waiters = { lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } }, task_list = { next = 0xffff8813ee8899f0, prev = 0xffff8813ee8899f0 } }, b_list = { next = 0xffff8813ee889a00, prev = 0xffff8813ee889a00 }, b_pag = 0xffff881f4b98db00,-------这个xfs_buf关联的ag b_target = 0xffff881fd042b500, b_addr = 0xffff880750046000, b_ioend_work = {----------------------这个是io结束之后,会用到的work_struct data = { counter = 0 }, entry = { next = 0x0, prev = 0x0 }, func = 0x0 }, b_ioend_wq = 0xffff881f5ab7c400,-----------ioend之后,初始化对应的b_ioend_work,然后挂到这个workqueue中去 b_iodone = 0x0,-----------------------为NULL, b_iowait = {---------------这个就是完成量的地址 done = 0, wait = { lock = { { rlock = { raw_lock = { { head_tail = 131074, tickets = { head = 2,------------这把锁曾经吃过一次亏,记忆深刻, tail = 2 } } } } } }, task_list = { next = 0xffff881a5c19f930, prev = 0xffff881a5c19f930 } } }, b_fspriv = 0x0, b_transp = 0x0,----------------关联的xfs_trans,传入的tans为NULL b_pages = 0xffff8813ee889a90, b_page_array = {0xffffea001d401180, 0x0}, b_maps = 0xffff8813ee889aa8, __b_map = { bm_bn = 1960485944,----------扇区,由于一个ag是管理个扇区,所以这个扇区对应的是ag1 bm_len = 8------长度 }, b_map_count = 1, b_io_length = 8, b_pin_count = { counter = 0 }, b_io_remaining = { counter = 1------------依然为1 }, b_page_count = 1, b_offset = 0, b_error = 0, b_ops = 0xffffffffa07e6cc0 <xfs_dir3_data_buf_ops>---xfs_buf负责的不同对象,有不同的操作指针
在io正常完成时,回调的 xfs_buf_ioend 会处理xfs_buf的io结果,
if (bp->b_iodone)------我们的buf不满足
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)----我们的buf也不满足
xfs_buf_relse(bp);
else
complete(&bp->b_iowait);----我们的buf该走的流程
xfs_buf是由ag来缓存管理的,xfs使用一颗radix树来管理ag,我们的扇区号除以8就是block编号,每个ag管理的block数是固定的,那么很容易获取到对应的index。
crash> xfs_perag 0xffff881f4b98db00 struct xfs_perag { pag_mount = 0xffff883f001f7800, pag_agno = 1,-------------------------和上面对得上 pag_ref = { counter = 1002 }, pagf_init = 1 '\001', pagi_init = 1 '\001', pagf_metadata = 0 '\000',----是否和元数据关联 pagi_inodeok = 1 '\001', pagf_levels = "\001\001", pagf_flcount = 4, pagf_freeblks = 112266782,------空闲block pagf_longest = 112265313,----最长的空闲块 pagf_btreeblks = 0, pagi_freecount = 39,--------空闲inode个数 pagi_count = 175488,--------已经分配的inode个数 pagl_pagino = 81, pagl_leftrec = 1855879528, pagl_rightrec = 76512, pagb_lock = { { rlock = { raw_lock = { { head_tail = 4587590, tickets = { head = 70, tail = 70 } } } } } }, pagb_tree = { rb_node = 0x0 }, pagf_fstrms = { counter = 0 }, pag_ici_lock = { { rlock = { raw_lock = { { head_tail = 65799148, tickets = { head = 1004, tail = 1004 } } } } } }, pag_ici_root = { height = 5, gfp_mask = 32, rnode = 0xffff88144b2c7909 }, pag_ici_reclaimable = 0, pag_ici_reclaim_lock = { count = { counter = 1 }, wait_lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } }, wait_list = { next = 0xffff881f4b98db70, prev = 0xffff881f4b98db70 }, owner = 0x0, { osq = 0x0, __UNIQUE_ID_rh_kabi_hide1 = { spin_mlock = 0x0 }, {<No data fields>} } }, pag_ici_reclaim_cursor = 0, pag_buf_lock = { { rlock = { raw_lock = { { head_tail = 593896294, tickets = { head = 9062, tail = 9062 } } } } } }, pag_buf_tree = { rb_node = 0xffff881576c18180-----------管理xfs_buf的红黑树 }, callback_head = { next = 0x0, func = 0x0 }, pagb_count = 0 }
ag使用红黑树来管理xfs_buf,我进一步确认下我找的xfs_buf在红黑树中:
tree -t rbtree -o xfs_buf.b_rbnode ffff881f4b98dba0 -s xfs_buf.b_bn,b_length ffff881576c18180 b_bn = 2021058616 b_length = 8 ffff881571630d80 b_bn = 1975870104 b_length = 8 ffff88144a547180 b_bn = 1959990760 b_length = 8 ffff881452e61800 b_bn = 1957446840 b_length = 8 ffff883ed4be4a80 b_bn = 1953586592 b_length = 8 ffff883e829a3000 b_bn = 1953509312 b_length = 8 ffff881577099080 b_bn = 1953509298 b_length = 1 ffff8817d2cd1200 b_bn = 1953509297 b_length = 1 ffff883e829a3300 b_bn = 1953509299 b_length = 1 ffff883e85d85680 b_bn = 1953509304 b_length = 8 ffff88144e985680 b_bn = 1953509352 b_length = 8 ffff883e829a2880 b_bn = 1953509320 b_length = 8 ffff883f3752f780 b_bn = 1953509344 b_length = 8 ffff883ed266a700 b_bn = 1953585224 b_length = 8 。。。。 ffff883ececa1b00 b_bn = 1960485840 b_length = 32 ffff8813ee889980---------------我们出问题的xfs_buf b_bn = 1960485944------------我们出问题的起始扇区和长度 b_length = 8 ffff883e71c6c000 b_bn = 1960486072 b_length = 8 。。。。
说明xfs_buf是正常的,那么xfs_buf下发的io请求去哪了呢?我在这个地方因为找错了block_device,发现该block_device中的request_queue的io数为0,陷入了迷茫,后来其他同事发现
找错了设备,我找的是sdae,其实出错的是sde。
根据文件路径,可以查找对应的挂载点对应的设备的request_queue:
block_device.bd_queue 0xffff883fc1048d00 bd_queue = 0xffff883fc1e28828 crash> request_queue 0xffff883fc1e28828 struct request_queue { queue_head = { next = 0xffff883fc1e28828,-----------prev和next都指向自己,说明为空,也就是目前该队列因为调度关系没有io下发的原因,目前没有待下发给驱动执行的io prev = 0xffff883fc1e28828 }, last_merge = 0xffff883e826fcd80, elevator = 0xffff883e86875000,------------------这个里面有cfqd nr_rqs = {0, 17},--------------积压了17个同步io nr_rqs_elvpriv = 17,-----------有17个io需要经过调度器,这个值最大就和nr_rqs的总数相同
request_fn = 0xffffffff81422f70 <scsi_request_fn>,---这个函数会将request从request queue中取出来执行
make_request_fn = 0xffffffff812ce690 <blk_queue_bio>,
prep_rq_fn = 0xffffffff814213c0 <scsi_prep_fn>,
merge_bvec_fn = 0x0,
softirq_done_fn = 0xffffffff81422e20 <scsi_softirq_done>,
rq_timed_out_fn = 0xffffffff8141ec00 <scsi_times_out>,-------当队列中的request超时之后被调用,
dma_drain_needed = 0x0,
lld_busy_fn = 0xffffffff81421ad0 <scsi_lld_busy>,
。。。
nr_sorted = 11,-----ELEVATOR_INSERT_SORT 方式加入到队列中的数量
in_flight = {0, 5},----有5个in_flight的同步io没有返回,其实有6个io request,但其中一个不是REQ_TYPE_FS,所以不会统计在 in_flight中
rq_timeout = 30000,--------1个request 30ms的超时时间,会在blk_add_timer中赋值给 req->timeout
timeout_list = {------------管理所有下发给驱动的request的超时
next = 0xffff883e9d3f7ed0,
prev = 0xffff883e9d24ecd0
},
我们目前知道下发io的xfs_buf,但是bio是动态申请的,怎么知道我们当前的这个io下发了多长时间呢?
在 xfs_buf_ioapply_map函数中,bio的申请如下:
static void xfs_buf_ioapply_map( struct xfs_buf *bp, int map, int *buf_offset, int *count, int rw) { 。。。 bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; 。。。}
可以看出,bio的bi_private成员就是我们的xfs_buf,而且bi_sector成员就是我们的扇区号,既然我们知道扇区号,可以通过search 扇区的方式找到对应的bio为 0xffff881490a76000
request为 ffff8813ee88b900:
crash> struct bio.bi_sector,bi_private,bi_bdev 0xffff881490a76000 bi_sector = 1960485944 bi_private = 0xffff8813ee889980 bi_bdev = 0xffff883fc1048d00 crash> request.start_time,__sector,bio ffff8813ee88b900 start_time = 4298780289----------io生成时的jiffies __sector = 1960485944 bio = 0xffff881490a76000
crash> p jiffies
jiffies = $2 = 4298781359--------------当前的jiffies
可以看出,我们io的生成时间和当前的jiffies相差很小,也就是1秒钟左右,该进程虽然拿到了锁,但是它并不是持有了1200s,而很大的可能是它经过唤醒,抢到了锁,然后提交了io,继续等待io完成,正常情况下。它算幸运的,因为不幸运的 46745 进程因为等锁已经1200s了,被khungtaskd 内核线程 选中而触发了hung_task_panic 。
不管怎么样,不幸的 46745进程等待mutex已经超过了阈值,有可能是某个进程一直拿锁而不放,也有可能它排在等待队列的后面,锁虽然释放但也没轮到它。mutex本不应该这么慢,
是什么导致了这么慢呢?
crash> cfq_data.active_queue 0xffff883e86877c00 active_queue = 0xffff883e9efc03a0 crash> cfq_queue.dispatch_start 0xffff883e9efc03a0 dispatch_start = 4298776243 crash> p jiffies jiffies = $2 = 4298781359 crash> p 4298781359-4298776243 $3 = 5116
可以看出,当前active的cfq_queue 距离调度发起的时候已经过去5秒钟。而正常来说,一个request设置的超时默认为30s。
查看一下request_queue中的hash成员管理的request ,在rq_mergeable 的情况下,request的hash成员是归request queue中的hash管理的。
elevator_queue.hash 0xffff883e86875000 hash = {{ 。。。。。 }, { first = 0xffff883e826fdb80 }, { first = 0xffff883e40708800 }, { first = 0x0 }, { first = 0x0 }, { first = 0xffff8812ae0c0980 }, { 。。。。。 }, { first = 0xffff8812c96efb00 }, { first = 0x0 }, { first = 0x0 }, { first = 0xffff8813ee88b980 }, { 。。。。。 }, { first = 0xffff88014a437800 }, { first = 0x0 }, { first = 0xffff880149d39280 }, { first = 0xffff883e4341e300 }, { 。。。。。 }, { first = 0xffff883e826fce00 }, { 。。。。。
通过一个个查看,确定只有9个io,而 q->nr_sorted为11个,说明有2个io是不能merge的,再加上inflight的 5个,也就是16个io,那为什么request queue中alloced了17个io呢。
那是因为,有一个io没有计入在 in_flight 中。
9个未执行的可以merge的io: crash> request.start_time 0xffff883e826fdb00 start_time = 4298776286 crash> request.start_time 0xffff883e40708780 start_time = 4298780265 crash> request.start_time 0xffff8812ae0c0900 start_time = 4298776286 crash> request.start_time 0xffff8812c96efa80 start_time = 4298780264 crash> request.start_time 0xffff8813ee88b900 start_time = 4298780289 crash> request.start_time 0xffff88014a437780 start_time = 4298780258 crash> request.start_time 0xffff880149d39200 start_time = 4298780285 crash> request.start_time 0xffff883e4341e280 start_time = 4298780285 crash> request.start_time 0xffff883e826fcd80 start_time = 4298776272 2个不能merge的io:
request.hash ffff883e40708c00
hash = {
next = 0x0,
pprev = 0x0
request.hash ffff8812ae0c0780
hash = {
next = 0x0,
pprev = 0x0
}
那么等待执行的6个io怎么获取呢?在request_queue的elevator中,我们看到它有10个cfq_queue等待调度,
crash> elevator_queue.elevator_data 0xffff883e86875000 elevator_data = 0xffff883e86877c00 crash> cfq_data 0xffff883e86877c00 struct cfq_data { queue = 0xffff883fc1e28828, grp_service_tree = { rb = { rb_node = 0xffff883f36a09820 }, left = 0xffff883f36a09820, count = 4294964171, min_vdisktime = 4707409920, ttime = { last_end_request = 4295195837, ttime_total = 0, ttime_samples = 0, ttime_mean = 0 } }, root_group = 0xffff883f36a09800, serving_wl_class = BE_WORKLOAD, serving_wl_type = SYNC_WORKLOAD, workload_expires = 4298776270,---------jiffies = $1 = 4298781359,cfqd->workload_expires = jiffies + slice; serving_group = 0xffff883f36a09800,----根据这个当前服务的group,可以遍历其管理的所有cfq_queue prio_trees = {{ rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0xffff881490a87250 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }}, busy_queues = 10,-------------有10个queue等待调度 busy_sync_queues = 10, rq_in_driver = 5,----还没完成的io数量 rq_in_flight = {0, 5}, rq_queued = 11,-------有11个io在cfq_data中,也就是各个cfqq中的rq_queued中的总和 hw_tag = 1, hw_tag_est_depth = 32, hw_tag_samples = 51, idle_slice_timer = { entry = { next = 0x0, prev = 0xdead000000200200 }, expires = 4298772352, base = 0xffff881fd2fc8000, function = 0xffffffff812ef710 <cfq_idle_slice_timer>, data = 18446612400859216896, slack = -1, start_pid = -1, start_site = 0x0, start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" }, unplug_work = { data = { counter = 32 }, entry = { next = 0xffff883e86877d28, prev = 0xffff883e86877d28 }, func = 0xffffffff812edef0 <cfq_kick_queue> }, active_queue = 0xffff883e9efc03a0,---------当前active的queue active_cic = 0xffff883e47904f00, async_cfqq = {{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, {0x0, 0x0, 0x0, 0x0, 0xffff883f1f7c9878, 0x0, 0x0, 0x0}},---根据io优先级的异步cfq_queue async_idle_cfqq = 0x0, last_position = 5943375904, cfq_quantum = 8,-----static const int cfq_quantum = 8; cfq_fifo_expire = {250, 125},---static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; cfq_back_penalty = 2,-----static const int cfq_back_penalty = 2; cfq_back_max = 16384,-----static const int cfq_back_max = 16 * 1024; cfq_slice = {40, 100},---static int cfq_slice_async = HZ / 25;static const int cfq_slice_sync = HZ / 10; cfq_slice_async_rq = 2,---static const int cfq_slice_async_rq = 2; cfq_slice_idle = 8,----static int cfq_slice_idle = HZ / 125; cfq_group_idle = 8,----static int cfq_group_idle = HZ / 125; cfq_latency = 1,------ cfqd->cfq_latency = 1; cfq_target_latency = 300,---static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ oom_cfqq = { ref = 1, flags = 64, cfqd = 0xffff883e86877c00, rb_node = { __rb_parent_color = 18446612400859217440, rb_right = 0x0, rb_left = 0x0 }, rb_key = 0, p_node = { __rb_parent_color = 18446612400859217472, rb_right = 0x0, rb_left = 0x0 }, p_root = 0x0, sort_list = { rb_node = 0x0 }, next_rq = 0x0, queued = {0, 0}, allocated = {0, 0}, fifo = { next = 0xffff883e86877e80, prev = 0xffff883e86877e80 }, dispatch_start = 0, allocated_slice = 0, slice_dispatch = 0, slice_start = 0, slice_end = 0, slice_resid = 0, prio_pending = 0, dispatched = 0, ioprio = 0, org_ioprio = 0, ioprio_class = 0, pid = 1, seek_history = 0, last_request_pos = 0, service_tree = 0x0, new_cfqq = 0x0, cfqg = 0xffff883f36a09800, nr_sectors = 0 }, last_delayed_sync = 4298780320 }
先找到当前的serving_group,
serving_group = 0xffff883f36a09800,
根据这个cfq_group,
crash> struct -xo cfq_group.service_trees 0xffff883f36a09800 struct cfq_group { [ffff883f36a09878] struct cfq_rb_root service_trees[2][3]; } crash> cfq_group.dispatched 0xffff883f36a09800 dispatched = 5 ----这个dispatched和 cfqd->rq_in_flight是相同的,对的上
根据当前的io类别:
serving_wl_class = BE_WORKLOAD,
serving_wl_type = SYNC_WORKLOAD,
可以确定它的 service_trees 的数组下标:
enum wl_type_t {---列下标 ASYNC_WORKLOAD = 0, SYNC_NOIDLE_WORKLOAD = 1, SYNC_WORKLOAD = 2 }; enum wl_class_t {----行下标 BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, CFQ_PRIO_NR, };
找到对应的下标为[0][1],[0][2]。
总共10个queue crash> tree -t rb_tree 0xffff883f36a098f8 ffff883f061a0920 ffff883e6fb8c3b0 ffff883e9efc03b0 ffff881490a87230 ffff883e71bcdee0 ffff8801c8b61df8 ffff881577ee1c28 ffff88018a4577a0 crash> tree -t rb_tree 0xffff883f36a098b8 ffff88144e961318 ffff883ea2021400
也就找到了当前group调度的cfq_queue,遍历各个cfq_queue,就知道对应的request了:
crash> cfq_queue.rb_node struct cfq_queue { [16] struct rb_node rb_node; } 由于cfq_queue通过rb_node成员嵌入到对应cfq_group的service_tree数组中对应class和类型的红黑树去,所以可以查看对应的cfq_queue为: 只有11个io,2个dispatch的 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883f061a0910 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} request=ffff88014a437780 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e6fb8c3a0------相当于有一个在request_queue中或者到driver层了,有一个还未被dispatch allocated = {2, 0} ref = 4 dispatched = 1-------------它的dispatch为1 prio_pending = 0 queued = {0, 1} request=ffff883e826fcd80 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e9efc03a0---当前active的queue,无request了 allocated = {1, 0} ref = 2 dispatched = 1-------------它的dispatch为1, prio_pending = 0 queued = {0, 0} crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff881490a87220 allocated = {2, 0} ref = 3 dispatched = 0 prio_pending = 0 queued = {0, 2} ffff8812ae0c0780----不在hash链中 request.hash ffff8812ae0c0780 hash = { next = 0x0, pprev = 0x0 } ffff8812ae0c0900 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e71bcded0 allocated = {2, 0} ref = 3 dispatched = 0 prio_pending = 0 queued = {0, 2} ffff883e40708780 ffff883e40708c00---这个io不在hash中 crash> request.hash ffff883e40708c00 hash = { next = 0x0, pprev = 0x0 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff8801c8b61de8 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff8812c96efa80 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff881577ee1c18 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff883e4341e280 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff88018a457790 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff8813ee88b900 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff88144e961308 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff883e826fdb00 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883ea20213f0 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff880149d39200
其中,所以dispatch的io,可以在request_queue中的timeout_list 中找到:
crash> struct -xo request_queue.timeout_list 0xffff883fc1e28828 struct request_queue { [ffff883fc1e28dc0] struct list_head timeout_list; } crash> list request.timeout_list -s request.__sector,queuelist,cmd_type,bio,start_time -H ffff883fc1e28dc0 ffff883e9d3f7d80 __sector = 4090189560 queuelist = { next = 0xffff883e9d3f7d80, prev = 0xffff883e9d3f7d80 } cmd_type = REQ_TYPE_FS bio = 0xffff883ebb2f2200 start_time = 4298772344 ffff8804a8440d80 __sector = 4151431904 queuelist = { next = 0xffff8804a8440d80, prev = 0xffff8804a8440d80 } cmd_type = REQ_TYPE_FS bio = 0xffff88029f7a9f00 start_time = 4298768456-----当前jiffies是 4298781359,相当于已经过去23s,不过这个req的scsi_cmnd的 jiffies_at_alloc = 4298774713, ffff8802924f8a80-------------------------------------这个io是dfs下发的ioctl __sector = 18446744073709551615 queuelist = { next = 0xffff8802924f8a80, prev = 0xffff8802924f8a80 } cmd_type = REQ_TYPE_BLOCK_PC bio = 0x0 start_time = 4298774713 ffff883e73374480 __sector = 5860805840 queuelist = { next = 0xffff883e73374480, prev = 0xffff883e73374480 } cmd_type = REQ_TYPE_FS bio = 0xffff883ea412bd00 start_time = 4298776230 ffff883ed1c4ea00 __sector = 5977690392 queuelist = { next = 0xffff883ed1c4ea00, prev = 0xffff883ed1c4ea00 } cmd_type = REQ_TYPE_FS bio = 0xffff883f1e7b6300 start_time = 4298776243 ffff883e9d24eb80 __sector = 5943375896 queuelist = { next = 0xffff883e9d24eb80, prev = 0xffff883e9d24eb80 } cmd_type = REQ_TYPE_FS bio = 0xffff883f1ae6ec00 start_time = 4298776241 crash>
6个io,加上11个未下发到request_queue中的io,为17个,全部找到了,通过上面的遍历,对io的管理也都熟悉了,此为后话。
其中最久的那个io,jiffies_at_alloc = 4298774713,相比于jiffies也是过去6s了。fs下来的io,默认30s的定时器。
我们在回过头来看我们的log:
[ 3272.940387] sd 5:0:4:0: [sde] FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 3272.940392] sd 5:0:4:0: [sde] Sense Key : Medium Error [current] [descriptor] [ 3272.940395] sd 5:0:4:0: [sde] Add. Sense: Unrecovered read error [ 3272.940397] sd 5:0:4:0: [sde] CDB: Read(16) 88 00 00 00 00 00 00 00 00 08 00 00 00 08 00 00 [ 3272.940399] blk_update_request: critical medium error, dev sde, sector 8 [ 3272.942369] XFS (sde): metadata I/O error: block 0x8 ("xfs_trans_read_buf_map") error 61 numblks 8 [ 3278.783813] sd 5:0:4:0: [sde] FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 3278.783823] sd 5:0:4:0: [sde] Sense Key : Medium Error [current] [descriptor] [ 3278.783826] sd 5:0:4:0: [sde] Add. Sense: Unrecovered read error [ 3278.783830] sd 5:0:4:0: [sde] CDB: Read(16) 88 00 00 00 00 00 00 00 00 08 00 00 00 08 00 00 [ 3278.783832] blk_update_request: critical medium error, dev sde, sector 8 [ 3278.785946] XFS (sde): metadata I/O error: block 0x8 ("xfs_trans_read_buf_map") error 61 numblks 8 [ 3820.434234] umount:start to umount target /run/user/989 . [ 3820.434239] umount:according to target /run/user/989 , fstype tmpfs,device tmpfs [ 3820.434287] umount:finish umount target :/run/user/989 ,retval:0 . [ 4000.258607] umount:start to umount target /run/user/989 . [ 4000.258613] umount:according to target /run/user/989 , fstype tmpfs,device tmpfs [ 4000.258661] umount:finish umount target :/run/user/989 ,retval:0 . [ 4109.680685] INFO: task fas_readwriter:46745 blocked for more than 1200 seconds. [ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 4109.683440] fas_readwriter D ffff881fffd147c0 0 46745 43298 0x00000080 [ 4109.683444] ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8 [ 4109.683469] ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20 [ 4109.683487] ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28 [ 4109.683491] Call Trace:
在触发crash之前,前面一直打印 访问8号扇区出错,对应的调度链是什么呢?
scsi_finish_command-->scsi_io_completion case ACTION_FAIL: -->scsi_print_result -->scsi_print_sense -->scsi_print_command -->scsi_end_request---->blk_update_request
从现象来看,综合日志中的delay打印:
read file dee46cc8be5ab43982c75a835768773e delay 3511 ms, readsize(581795) count(581795) offset(0)>
推测是由于访问硬盘慢导致了io的积压,可能是因为坏块的重试?因为查看该硬盘的host的eh线程,是空闲的,同时查看对应的scsi_device的reset记录,也为0,也就是从sense的角度说,
硬盘只是慢,不是没有响应,错误的io次数统计倒是有:
vendor = 0xffff883fc1e60b48 "SEAGATE ST4000NM0025 N003ZC18ASFP"-----------sde iorequest_cnt = { counter = 4641 }, iodone_cnt = { counter = 4635 }, ioerr_cnt = { counter = 283 },
现在从request的角度,我们知道io慢,比如前面看到的有一个23s的request,都快超时了,那到底是什么原因导致平均持锁的时间过长导致有进程排队太久呢?由于crash也只是一个快照,从错误数量的统计看,虽然从scsi命令看重试次数为0,但是可能硬盘底层对于error会进行重试?暂时无法确认。比较悲剧的是,也没看到abort的打印,毕竟超时都会伴随着abort才对。在另外一个crash中,我就看到了对应的abort记录,由于两个crash类似,所以推测也是由于超时导致了进程排队太久。同一个lun,另外一个crash的打印。
[10408827.934895] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881ebda7b480) [10408828.922082] sd 5:0:4:0: attempting task abort! scmd(ffff881e8f3e39c0) [10408828.923772] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881e8f3e39c0) [10408828.923785] sd 5:0:4:0: attempting task abort! scmd(ffff881e8f3e16c0) [10408828.925213] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881e8f3e16c0) [10409912.753509] sd 5:0:4:0: attempting task abort! scmd(ffff881e5c0fef40) [10409912.754995] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881e5c0fef40) [10409912.755017] sd 5:0:4:0: attempting task abort! scmd(ffff881e5c0ffb80) [10409912.756580] sd 5:0:4:0: task abort: SUCCESS scmd(ffff881e5c0ffb80)
对应的堆栈:
[10410980.277185] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [10410980.278502] nginx D ffff881fffc547c0 0 48133 45181 0x00000080 [10410980.278506] ffff881fac9c7c70 0000000000000086 ffff881fac9c7fd8 ffff881fac9c7fd8 [10410980.278513] ffff881fac9c7fd8 00000000000147c0 ffff881fd3245080 ffff881e95d1cee0 [10410980.278517] ffff881e95d1cee4 ffff881fc5c28b80 00000000ffffffff ffff881e95d1cee8 [10410980.278521] Call Trace: [10410980.278531] [<ffffffff8163f959>] schedule_preempt_disabled+0x29/0x70 [10410980.278534] [<ffffffff8163d415>] __mutex_lock_slowpath+0xc5/0x1c0 [10410980.278537] [<ffffffff8163c87f>] mutex_lock+0x1f/0x2f [10410980.278541] [<ffffffff811edf1f>] do_last+0x28f/0x1270 [10410980.278546] [<ffffffff81274abd>] ? SYSC_semtimedop+0x8cd/0xd10 [10410980.278549] [<ffffffff811f0be2>] path_openat+0xc2/0x490 [10410980.278554] [<ffffffff811f23ab>] do_filp_open+0x4b/0xb0 [10410980.278570] [<ffffffff811fef47>] ? __alloc_fd+0xa7/0x130 [10410980.278574] [<ffffffff811dfd53>] do_sys_open+0xf3/0x1f0 [10410980.278576] [<ffffffff811dfe6e>] SyS_open+0x1e/0x20 [10410980.278579] [<ffffffff81649909>] system_call_fastpath+0x16/0x1b
对应等的mutex:
crash> bt -f 48133 PID: 48133 TASK: ffff881fc5c28b80 CPU: 2 COMMAND: "nginx" #0 [ffff881fac9c7c10] __schedule at ffffffff8163df9b ffff881fac9c7c18: 0000000000000086 ffff881fac9c7fd8 ffff881fac9c7c28: ffff881fac9c7fd8 ffff881fac9c7fd8 ffff881fac9c7c38: 00000000000147c0 ffff881fd3245080 ffff881fac9c7c48: ffff881e95d1cee0 ffff881e95d1cee4 ffff881fac9c7c58: ffff881fc5c28b80 00000000ffffffff ffff881fac9c7c68: ffff881e95d1cee8 ffff881fac9c7c80 ffff881fac9c7c78: ffffffff8163f959 #1 [ffff881fac9c7c78] schedule_preempt_disabled at ffffffff8163f959 ffff881fac9c7c80: ffff881fac9c7ce0 ffffffff8163d415 #2 [ffff881fac9c7c88] __mutex_lock_slowpath at ffffffff8163d415 ffff881fac9c7c90: ffff881f78483c90 ffff883ef4567c90 ffff881fac9c7ca0: ffff881fc5c28b80 00000000e4ff06ea ffff881fac9c7cb0: 00000001ac9c7cc0 ffff881e95d1cee0---对应的mutex ffff881fac9c7cc0: 0000000000008000 ffff881fac9c7df0 ffff881fac9c7cd0: 0000000000000001 ffff881fac9c7e50 ffff881fac9c7ce0: ffff881fac9c7cf8 ffffffff8163c87f #3 [ffff881fac9c7ce8] mutex_lock at ffffffff8163c87f ffff881fac9c7cf0: ffff881fac9c7f28 ffff881fac9c7da8
mutex对应的owner:
struct mutex.owner ffff881e95d1cee0 owner = 0xffff881ea0d30000
对应的pid:
crash> task 0xffff881ea0d30000 PID: 25156 TASK: ffff881ea0d30000 CPU: 5 COMMAND: "fas_duper"
owner对应的堆栈:
crash> bt -f 25156 PID: 25156 TASK: ffff881ea0d30000 CPU: 5 COMMAND: "fas_duper" #0 [ffff881ea0d3b840] __schedule at ffffffff8163df9b ffff881ea0d3b848: 0000000000000082 ffff881ea0d3bfd8 ffff881ea0d3b858: ffff881ea0d3bfd8 ffff881ea0d3bfd8 ffff881ea0d3b868: 00000000000147c0 ffff881fd3247300 ffff881ea0d3b878: ffff881eed49f0d8 ffff881eed49f0e0 ffff881ea0d3b888: 7fffffffffffffff ffff881ea0d30000 ffff881ea0d3b898: ffff881fd29e1740 ffff881ea0d3b8b0 ffff881ea0d3b8a8: ffffffff8163e879 #1 [ffff881ea0d3b8a8] schedule at ffffffff8163e879 ffff881ea0d3b8b0: ffff881ea0d3b958 ffffffff8163c329 #2 [ffff881ea0d3b8b8] schedule_timeout at ffffffff8163c329 ffff881ea0d3b8c0: 7fffffffffffffff ffff881ea0d3b950 ffff881ea0d3b8d0: 0000000000000000 0000000000000000 ffff881ea0d3b8e0: 0000000000000000 ffff881eed49f000 ffff881ea0d3b8f0: ffff881ea0d3b908 ffffffff812cea44 ffff881ea0d3b900: 0000000000000004 ffff881ea0d3b9b8 ffff881ea0d3b910: ffffffffa07fe1c4 00004000a0d3b928 ffff881ea0d3b920: 0000000000000000 0000000033cf16ff ffff881ea0d3b930: ffff881eed49f0d8 ffff881eed49f0e0 ffff881ea0d3b940: 7fffffffffffffff ffff881ea0d30000 ffff881ea0d3b950: ffff881fd29e1740 ffff881ea0d3b9b8 ffff881ea0d3b960: ffffffff8163ec46 #3 [ffff881ea0d3b960] wait_for_completion at ffffffff8163ec46 ffff881ea0d3b968: 0000000000000001 ffff881ea0d30000 ffff881ea0d3b978: ffffffff810b9930 ffff881eed49f0e8 ffff881ea0d3b988: ffff881eed49f0e8 0000000033cf16ff ffff881ea0d3b998: ffff881eed49f000 0000000000020001 ffff881ea0d3b9a8: ffffffffa07ffce3 0000000000020000 ffff881ea0d3b9b8: ffff881ea0d3b9e0 ffffffffa07ffb74 #4 [ffff881ea0d3b9c0] xfs_buf_submit_wait at ffffffffa07ffb74 [xfs] ffff881ea0d3b9c8: ffff881eed49f000 0000000000020001 ffff881ea0d3b9d8: ffffffffa082c829 ffff881ea0d3b9f0 ffff881ea0d3b9e8: ffffffffa07ffce3 #5 [ffff881ea0d3b9e8] _xfs_buf_read at ffffffffa07ffce3 [xfs] ffff881ea0d3b9f0: ffff881ea0d3ba30 ffffffffa07ffdda #6 [ffff881ea0d3b9f8] xfs_buf_read_map at ffffffffa07ffdda [xfs] ffff881ea0d3ba00: ffffffffa083ff70 0000000000000000 ffff881ea0d3ba10: ffff881e962a5000 ffff881e962a5000 ffff881ea0d3ba20: ffff881ea0d3ba98 ffff881fd29e1740 ffff881ea0d3ba30: ffff881ea0d3ba70 ffffffffa082c829 #7 [ffff881ea0d3ba38] xfs_trans_read_buf_map at ffffffffa082c829 [xfs] ffff881ea0d3ba40: 00020000a0d3ba50 ffff883fafef1dc0 ffff881ea0d3ba50: ffff881e962a5000 ffff883fafef1de8 ffff881ea0d3ba60: ffff881ea0d3bb08 ffff881ea0d3bb00 ffff881ea0d3ba70: ffff881ea0d3bad8 ffffffffa07f3b3e #8 [ffff881ea0d3ba78] xfs_imap_to_bp at ffffffffa07f3b3e [xfs] ffff881ea0d3ba80: ffff881ea0d3ba90 ffffffffa083ff70 ffff881ea0d3ba90: 0000000000000000 00000000e91fb540 ffff881ea0d3baa0: 0000000000000020 0000000033cf16ff ffff881ea0d3bab0: ffff883fafef1dc0 ffff881e962a5000 ffff881ea0d3bac0: 0000000000000000 0000000000000000 ffff881ea0d3bad0: ffff883fafef1de8 ffff881ea0d3bb40 ffff881ea0d3bae0: ffffffffa07f403e #9 [ffff881ea0d3bae0] xfs_iread at ffffffffa07f403e [xfs] ffff881ea0d3bae8: ffff881e00000000 ffffffffa081d1b7 ffff881ea0d3baf8: ffff883fafef1dc0 ffff881e962a5000 ffff881ea0d3bb08: 00000001003f55e2 0000000033cf16ff ffff881ea0d3bb18: ffff883fafef1dc0 ffff881e962a5000 ffff881ea0d3bb28: 0000000000000000 00000001003f55e2 ffff881ea0d3bb38: 000000000000001f ffff881ea0d3bbd8 ffff881ea0d3bb48: ffffffffa0808158 #10 [ffff881ea0d3bb48] xfs_iget at ffffffffa0808158 [xfs] ffff881ea0d3bb50: 0000000000015500 003f55e2811c2a2d ffff881ea0d3bb60: ffff881ea0d3bc30 0000000000015500 ffff881ea0d3bb70: ffff881eb69b75c0 0000000000015500 ffff881ea0d3bb80: 0000000000000000 ffff881e962a5000 ffff881ea0d3bb90: 0000000000000000 ffff881ea0d3bbe8 ffff881ea0d3bba0: 00000000003f55e2 ffff881eb69b7610 ffff881ea0d3bbb0: 0000000000000008 ffff881e95d1cc80 ffff881ea0d3bbc0: 0000000000000000 0000000000000000 ffff881ea0d3bbd0: ffff881ea0d3bc30 ffff881ea0d3bc20 ffff881ea0d3bbe0: ffffffffa0810c37 #11 [ffff881ea0d3bbe0] xfs_lookup at ffffffffa0810c37 [xfs] ffff881ea0d3bbe8: 00000001003f55e2 0000000033cf16ff ffff881ea0d3bbf8: ffff881fc30c9140 ffff881e95d14c00 ffff881ea0d3bc08: ffff881e95d14c00 00000000ffffff9c ffff881ea0d3bc18: ffff881ea0d3bef0 ffff881ea0d3bc58 ffff881ea0d3bc28: ffffffffa080da0b #12 [ffff881ea0d3bc28] xfs_vn_lookup at ffffffffa080da0b [xfs] ffff881ea0d3bc30: ffff881ea0d3bdd0 ffff881fc30c9178 ffff881ea0d3bc40: 000000000000001b 0000000033cf16ff ffff881ea0d3bc50: ffff881fc30c9140 ffff881ea0d3bc78 ffff881ea0d3bc60: ffffffff811eac3d #13 [ffff881ea0d3bc60] lookup_real at ffffffff811eac3d ffff881ea0d3bc68: 00000000ffffff9c 0000000000000001 ffff881ea0d3bc78: ffff881ea0d3bca8 ffffffff811eb5b2 #14 [ffff881ea0d3bc80] __lookup_hash at ffffffff811eb5b2 ffff881ea0d3bc88: 01ff881ea0d3bef0 0000000033cf16ff ffff881ea0d3bc98: ffff881ea0d3bdc0 ffff881ea0d3bd28 ffff881ea0d3bca8: ffff881ea0d3bce0 ffffffff81636244 #15 [ffff881ea0d3bcb0] lookup_slow at ffffffff81636244 ffff881ea0d3bcb8: 0000000000000000 0000000000000001 ffff881ea0d3bcc8: 0000000000000041 ffff881ea0d3bdc0 ffff881ea0d3bcd8: 00000000ffffff9c ffff881ea0d3bd78 ffff881ea0d3bce8: ffffffff811f0483 #16 [ffff881ea0d3bce8] path_lookupat at ffffffff811f0483 ffff881ea0d3bcf0: ffff881ea0d3bda8 ffff881ea0d3bd60 ffff881ea0d3bd00: 0000000000000004 ffff881ea0d3bf58 ffff881ea0d3bd10: 0000000000000000 ffff881ea0d3bf68 ffff881ea0d3bd20: ffffffff812748eb ffff881e00000001 ffff881ea0d3bd30: 0000000000000000 ffffffff811c3345 ffff881ea0d3bd40: ffffffff811f117f 0000000033cf16ff ffff881ea0d3bd50: ffff881ea0d3bdc0 ffff881e7b38b000 ffff881ea0d3bd60: 0000000000000001 00000000ffffff9c ffff881ea0d3bd70: ffff881ea0d3bef0 ffff881ea0d3bdb0 ffff881ea0d3bd80: ffffffff811f04db #17 [ffff881ea0d3bd80] filename_lookup at ffffffff811f04db ffff881ea0d3bd88: 00007fa6c6f00980 ffff881e7b38b000 ffff881ea0d3bd98: 0000000000000001 ffff881ea0d3bea0 ffff881ea0d3bda8: 00000000ffffff9c ffff881ea0d3be80 ffff881ea0d3bdb8: ffffffff811f22a7 #18 [ffff881ea0d3bdb8] user_path_at_empty at ffffffff811f22a7 ffff881ea0d3bdc0: ffff881fccf81420 ffff881e95d14c00 ffff881ea0d3bdd0: 0000001b67eb798a ffff881e7b38b031 ffff881ea0d3bde0: ffff883fca04d920 ffff883fc1e99740 ffff881ea0d3bdf0: ffff881e95d1ce38 0000000200000001 ffff881ea0d3be00: 0000000000000000 ffff881fffcb47c0 ffff881ea0d3be10: ffff881fffcb47c0 ffff881eb683a280 ffff881ea0d3be20: ffff881ea0d3be38 ffffffff810b6a15 ffff881ea0d3be30: ffff881eb683a280 ffff881ea0d3be60 ffff881ea0d3be40: ffffffff810b6a59 ffff881eb683a280 ffff881ea0d3be50: ffff881fffcb47c0 0000000033cf16ff ffff881ea0d3be60: 00007fa6c6f00980 0000000000000001 ffff881ea0d3be70: 00000000ffffff9c 00007fa6c6f00b40 ffff881ea0d3be80: ffff881ea0d3be90 ffffffff811f2311 #19 [ffff881ea0d3be88] user_path_at at ffffffff811f2311 ffff881ea0d3be90: ffff881ea0d3bee0 ffffffff811e5b53 #20 [ffff881ea0d3be98] vfs_fstatat at ffffffff811e5b53 ffff881ea0d3bea0: ffff881eb683a280 ffff881e2fec5180 ffff881ea0d3beb0: 0000000033cf16ff 00007fa6c6f00980 ffff881ea0d3bec0: 00007fa6c6f00980 000000000073ad70 ffff881ea0d3bed0: 00007fafbe2f10c8 000000003db2f41a ffff881ea0d3bee0: ffff881ea0d3bf68 ffffffff811e60be #21 [ffff881ea0d3bee8] SYSC_newstat at ffffffff811e60be ffff881ea0d3bef0: 0000000000000000 0000000100000001 ffff881ea0d3bf00: ffff881fd165d080 ffffffff8101c899 ffff881ea0d3bf10: 00000000a0d3bf38 ffff881ea0ce4000 ffff881ea0d3bf20: ffff881ea0ce4248 ffff881ea0ce4000 ffff881ea0d3bf30: ffff881ea0ce4248 ffff881ea0d30000 ffff881ea0d3bf40: ffff881ea0d3bf78 ffffffff8110d336 ffff881ea0d3bf50: 00007fafbe2f1000 0000000033cf16ff ffff881ea0d3bf60: 00007fa6c6efff50 ffff881ea0d3bf78 ffff881ea0d3bf70: ffffffff811e639e #22 [ffff881ea0d3bf70] sys_newstat at ffffffff811e639e ffff881ea0d3bf78: 00007fa6c6f00b40 ffffffff81649909 #23 [ffff881ea0d3bf80] system_call_fastpath at ffffffff81649909 RIP: 00007fb0e1d905a5 RSP: 00007fa6c6f00638 RFLAGS: 00003202 RAX: 0000000000000004 RBX: ffffffff81649909 RCX: 0000000000020000 RDX: 00007fa6c6f00980 RSI: 00007fa6c6f00980 RDI: 00007fa6c6f00b40 RBP: 00007fa6c6f00b40 R8: 0000000000000000 R9: 00007fa6c6f00c40 R10: 0000000000000000 R11: 0000000000003246 R12: ffffffff811e639e R13: ffff881ea0d3bf78 R14: 00007fa6c6efff50 R15: 0000000033cf16ff ORIG_RAX: 0000000000000004 CS: 0033 SS: 002b
下发的io:
crash> request.start_time ffff881eed49f600 start_time = 14715925391 crash> jiffies jiffies = $3 = 14715950507
可以看出,30s的超时,这个已经有25s了,也就是这个io很有可能又需要abort。
积压的io:
request_queue.in_flight 0xffff883fc2be6a08 in_flight = {0, 15}
磁盘的ioerr 统计:
crash> scsi_device.iorequest_cnt,iodone_cnt,ioerr_cnt 0xffff881fcf17f800 iorequest_cnt = { counter = 56035857 } iodone_cnt = { counter = 56035683 } ioerr_cnt = { counter = 15749----------err次数 }
因为io_complete本来很快,那么拿到mutex的进程就会很快释放mutex,这样也就不会导致后来的进程排队时间这么长了,根据这个crash的结果,有理由估计第一个crash也是类似。
结论:
1.访问某些扇区,可能会导致多次重试而io缓慢?如果后面的io继续来到,则可能积压,而如果等待队列过长,可能导致某些进程持续获取不到锁而hung。
2.hungtask检测挑选的是第一个满足条件的task 来打印,但其实有可能另外一个进程等待已经超过2个hung检测的周期而没有被选中,因为它排在后面,也可能因为khungtask在上一次调度的时候,大家都没有到检测周期,等到了下一个检测周期之后,可能多个task 满足条件而挑选的是第一个task,甚至有可能有的task在下一个检测周期之间到达了阈值而因为等待的资源得到满足之后又重新开始调度了,也就是hung的检测是大概的,不是精确的。