linux thread_info 与thread_struct
有个同事在看3.10内核代码时,看到这两个结构,容易混淆,所以我简单答复了一下。
thread_info是和内核栈放一块的,网上到处都是thread_info的资料,但thread_struct的资料比较少,在此记录下,以备忘
struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; void __user *sysenter_return; #ifdef CONFIG_X86_32 unsigned long previous_esp; /* ESP of the previous stack in case of nested (IRQ) stacks */ __u8 supervisor_stack[0]; #endif unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ };
task_struct 中的 stack 成员所指向的内存,其起始处存放的就是 thread_info。它为啥能和内核栈组成 union 呢?按道理 union 里面的成员不会同时有效,也就是既然用作了A成员,就不能同时用作B成员,但是明显我们的
thread_info结构和内核栈是同时使用的,其实可以理解为thread_info 放在了内核栈的下面,因为栈的增长方向是地址大到地址小,所以两者不冲突。这也间接说明了,
其实内核栈没有union那么大,要被thread_info占据一部分。放在一起还有个好处就是根据esp能够
快速地查找到task_struct的指针,因为thread_info的第一个成员就是task_struct指针。通过将esp的末尾几位设置为0就ok。到底设置多少位,是与栈的大小相关的。
比如64位的x86,默认内核栈大小为:
#define THREAD_SIZE_ORDER 2 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define get_current() (current_thread_info()->task) #define current get_current() static inline struct thread_info *current_thread_info(void) { register unsigned long sp asm ("sp"); return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); }
所以经常可以看到代码中使用 current 宏,它就是通过 sp 指针来找到 task_struct 的。看下面的一个例子更能理解:
crash> bt PID: 178838 TASK: ffff88290f7ddee0 CPU: 9 COMMAND: "kthread_send/9"----------------当前task指针为ffff88290f7ddee0 #0 [ffff882fbe843a70] machine_kexec at ffffffff8105d77b #1 [ffff882fbe843ad0] __crash_kexec at ffffffff8110aca2 #2 [ffff882fbe843ba0] panic at ffffffff816ad52f #3 [ffff882fbe843c20] watchdog_timer_fn at ffffffff81135a51 #4 [ffff882fbe843c58] __hrtimer_run_queues at ffffffff810b93a6 #5 [ffff882fbe843cb0] hrtimer_interrupt at ffffffff810b993f #6 [ffff882fbe843cf8] local_apic_timer_interrupt at ffffffff8105467b #7 [ffff882fbe843d10] smp_apic_timer_interrupt at ffffffff816c9e83 #8 [ffff882fbe843d28] apic_timer_interrupt at ffffffff816c6732 #9 [ffff882fbe843dc8] queued_spin_lock_slowpath at ffffffff816adeee #10 [ffff882fbe843dd8] _raw_spin_lock at ffffffff816bb080 #11 [ffff882fbe843de8] dev_watchdog at ffffffff815bca52 #12 [ffff882fbe843e28] call_timer_fn at ffffffff8109a9c8 #13 [ffff882fbe843e60] run_timer_softirq at ffffffff8109ceed #14 [ffff882fbe843ed8] __do_softirq at ffffffff8109404d #15 [ffff882fbe843f48] call_softirq at ffffffff816c8afc #16 [ffff882fbe843f60] do_softirq at ffffffff8102d435 #17 [ffff882fbe843f80] irq_exit at ffffffff81094495 #18 [ffff882fbe843f98] smp_apic_timer_interrupt at ffffffff816c9e88 #19 [ffff882fbe843fb0] apic_timer_interrupt at ffffffff816c6732 --- <IRQ stack> --- #20 [ffff882b680d3c28] apic_timer_interrupt at ffffffff816c6732 [exception RIP: ixgbe_xmit_frame_ring+83] RIP: ffffffffc01299e3 RSP: ffff882b680d3cd0 RFLAGS: 00000212---------------------在中断之前的rsp RAX: 0000000000000562 RBX: 0000000000000001 RCX: 000000000000403d RDX: ffff882fb9331c00 RSI: ffff8828d7b8fac0 RDI: 0000000000000001 RBP: ffff882b680d3d48 R8: 0000000000000008 R9: 0000a0a5447b9d78 R10: ffff8828c6e84f00 R11: 000000002b3000b8 R12: ffff8828c0291b00 R13: 0000000022300000 R14: 0000000000000001 R15: ffff882b680d3cc0 ORIG_RAX: ffffffffffffff10 CS: 0010 SS: 0018 #21 [ffff882b680d3d50] ixgbe_xmit_frame at ffffffffc012a918 [ixgbe] #22 [ffff882b680d3d80] 
wit_send_tasklet at ffffffffc043b63c [witdriver] #23 [ffff882b680d3e78] wit_kthread_xmit_fn at ffffffffc043ba95 [witdriver] #24 [ffff882b680d3ec8] kthread at ffffffff810b5241 #25 [ffff882b680d3f50] ret_from_fork at ffffffff816c5577
根据task_struct 找stack:
crash> task_struct.stack ffff88290f7ddee0 stack = 0xffff882b680d0000 crash> rd 0xffff882b680d0000 ffff882b680d0000: ffff88290f7ddee0----------------------stack中的第一个成员就是指向task_struct的
再看看中断前 rsp 的值 ffff882b680d3cd0 与 stack 的值 0xffff882b680d0000,两者只有低14位不同,也就是只在 16K(THREAD_SIZE)范围内的低位不同;把 rsp 的低14位清零,得到的就是 stack 的地址。
有时候我们会遇到内核堆栈越界的情况,越界就是栈变量向下扩展的时候,踩到了thread_info结构的成员。
这时会遇到:Thread overran stack, or stack corrupted 这样的打印,判断的标准就是thread_info的上面留了一个magic特征字:
#define STACK_END_MAGIC 0x57AC6E9D
以下面例子来说明:
crash> struct thread_info struct thread_info { struct task_struct *task; struct exec_domain *exec_domain; __u32 flags; __u32 status; __u32 cpu; int preempt_count; mm_segment_t addr_limit; struct restart_block restart_block; void *sysenter_return; unsigned int sig_on_uaccess_error : 1; unsigned int uaccess_err : 1; } SIZE: 104
crash> px 0xffff882b680d0000 + 104
$8 = 0xffff882b680d0068
crash> rd 0xffff882b680d0068
ffff882b680d0068: 0000000057ac6e9d .n.W.... -----------------对应的magic特征字
在一些服务器中,经常会使用 echo 1 > /proc/sys/kernel/stack_tracer_enabled 的方式来监控内核栈的使用情况,开启之后就可以打印出记录到的最深的一次栈调用:
cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (41 entries) ----- ---- -------- 0) 4120 16 mempool_alloc_slab+0x15/0x20 1) 4104 128 mempool_alloc+0x6e/0x170 2) 3976 16 sg_pool_alloc+0x45/0x50 3) 3960 88 __sg_alloc_table+0xd6/0x140 4) 3872 40 sg_alloc_table_chained+0x3c/0x90 5) 3832 40 scsi_init_sgtable+0x26/0x70 6) 3792 72 scsi_init_io+0x4e/0x200 7) 3720 80 sd_setup_read_write_cmnd+0x3d/0x950 [sd_mod] 8) 3640 16 sd_init_command+0x2f/0xc0 [sd_mod] 9) 3624 32 scsi_setup_cmnd+0x111/0x1c0 10) 3592 56 scsi_prep_fn+0xdb/0x180 11) 3536 40 blk_peek_request+0x16a/0x290 12) 3496 104 scsi_request_fn+0x48/0x680 13) 3392 24 __blk_run_queue+0x39/0x50 14) 3368 192 cfq_insert_request+0x384/0x550 15) 3176 56 __elv_add_request+0x1a2/0x2e0 16) 3120 72 blk_queue_bio+0x35b/0x3a0 17) 3048 88 generic_make_request+0x10b/0x320 18) 2960 88 submit_bio+0x70/0x150 19) 2872 48 _submit_bh+0x127/0x160 20) 2824 16 submit_bh+0x10/0x20 21) 2808 88 ext4_read_block_bitmap_nowait+0x48c/0x5f0 [ext4] 22) 2720 152 ext4_mb_init_cache+0x181/0x6e0 [ext4] 23) 2568 72 ext4_mb_load_buddy+0x2b6/0x340 [ext4] 24) 2496 160 ext4_mb_regular_allocator+0x1d7/0x470 [ext4] 25) 2336 176 ext4_mb_new_blocks+0x658/0xa20 [ext4] 26) 2160 232 ext4_alloc_branch+0x3b9/0x430 [ext4] 27) 1928 248 ext4_ind_map_blocks+0x34f/0x7b0 [ext4] 28) 1680 136 ext4_map_blocks+0x2a5/0x6f0 [ext4] 29) 1544 104 _ext4_get_block+0x1df/0x220 [ext4] 30) 1440 16 ext4_get_block+0x16/0x20 [ext4] 31) 1424 184 __block_write_begin+0x17d/0x4b0 32) 1240 136 ext4_write_begin+0x18f/0x440 [ext4] 33) 1104 200 generic_file_buffered_write+0x124/0x2c0 34) 904 128 __generic_file_aio_write+0x1e2/0x400 35) 776 64 generic_file_aio_write+0x59/0xa0 36) 712 184 ext4_file_write+0xdb/0x470 [ext4] 37) 528 216 do_sync_write+0x93/0xe0 38) 312 64 vfs_write+0xc0/0x1f0 39) 248 72 SyS_write+0x7f/0xe0 40) 176 176 system_call_fastpath+0x1c/0x21
如果新增加了内核模块,测试时最好能够监控起来,保证不会栈越界。
如果说 thread_info 在进程运行时访问很多,比如取当前task_struct指针,设置是否能够抢占的 preempt_count ,是跟arch体系无关的一些参数,那么thread_struct 就是与体系强相关的
一个结构了,比如x86的架构如下,32位和64位用一些宏来控制。
struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; unsigned long sp0; unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else unsigned long usersp; /* Copy from PDA */ unsigned short es; unsigned short ds; unsigned short fsindex; unsigned short gsindex; #endif #ifdef CONFIG_X86_32 unsigned long ip; #endif #ifdef CONFIG_X86_64 unsigned long fs; #endif unsigned long gs; /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ unsigned long debugreg6; /* Keep track of the exact dr7 value set by the user */ unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; unsigned long trap_nr; unsigned long error_code; /* floating point and extended processor state */ struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ struct vm86_struct __user *vm86_info; unsigned long screen_bitmap; unsigned long v86flags; unsigned long v86mask; unsigned long saved_sp0; unsigned int saved_fs; unsigned int saved_gs; #endif /* IO permissions: */ unsigned long *io_bitmap_ptr; unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; };
arm32的长成这样:
struct thread_struct { /* fault info */ unsigned long address; unsigned long trap_no; unsigned long error_code; /* debugging */ struct debug_info debug; };
arm64的长成这样:
struct cpu_context { unsigned long x19; unsigned long x20; unsigned long x21; unsigned long x22; unsigned long x23; unsigned long x24; unsigned long x25; unsigned long x26; unsigned long x27; unsigned long x28; unsigned long fp; unsigned long sp; unsigned long pc; }; struct thread_struct { struct cpu_context cpu_context; /* cpu context */ unsigned long tp_value; struct fpsimd_state fpsimd_state; unsigned long fault_address; /* fault info */ struct debug_info debug; /* debugging */ };
因为不同的体系结构,寄存器明显不一样,所以cpu的上下文显然也不一样。thread_struct 这个结构就是用来在进程切换的时候,保存特定于arch的进程上下文的。