TSS Usage in the Linux Kernel

See: http://blog.chinaunix.net/uid-22695386-id-272098.html

Kernels before Linux 2.4 had a limit on the maximum number of processes. The reason is that every process had its own TSS and LDT, and the TSS (Task State Segment) descriptor and LDT (Local Descriptor Table) descriptor had to be placed in the GDT. The GDT can hold at most 8192 descriptors; excluding the 12 descriptors used by the system, the maximum number of processes was (8192 - 12) / 2 = 4090. Since Linux 2.4 all processes share a single TSS; more precisely, there is one TSS per CPU, and all processes running on the same CPU use that CPU's TSS. The TSS is defined in asm-i386/processor.h as follows:

extern struct tss_struct init_tss[NR_CPUS];
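
Just to spell out the arithmetic above, here is a small user-space sketch (not kernel code; the constants simply restate the figures quoted in the paragraph, and the 8192 ceiling comes from the GDT's 16-bit limit divided by 8-byte descriptors):

#include <stdio.h>

int main(void)
{
    const int gdt_entries = 65536 / 8;   /* 16-bit GDT limit, 8-byte descriptors -> 8192 */
    const int reserved    = 12;          /* descriptors used by the system               */
    const int per_task    = 2;           /* one TSS descriptor + one LDT descriptor      */

    printf("max tasks before 2.4: %d\n", (gdt_entries - reserved) / per_task);  /* 4090 */
    return 0;
}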

The TSS is initialized and loaded in start_kernel()->trap_init()->cpu_init():

void __init cpu_init (void)
{
    int nr = smp_processor_id();            /* the current CPU */
    struct tss_struct *t = &init_tss[nr];   /* the TSS used by this CPU */

    t->esp0 = current->thread.esp0;         /* update esp0 in the TSS to the current task's esp0 */
    set_tss_desc(nr, t);
    gdt_table[__TSS(nr)].b &= 0xfffffdff;   /* clear the busy bit in the TSS descriptor */
    load_TR(nr);                            /* load the task register (TR) with this TSS */
    load_LDT(&init_mm.context);             /* load the LDT */
}

As we know, a hardware task switch uses the TSS to save all of the registers (before 2.4 the switch was done with a jmp to the TSS descriptor), and when an interrupt occurs the CPU also reads the ring-0 esp0 from the TSS. So if all processes share the same TSS, how does task switching work?

In fact, since 2.4 the kernel no longer uses hardware task switching but software switching: the registers are no longer saved in the TSS but in task->thread, and only the TSS's esp0 and I/O permission bitmap are still used. Therefore, during a context switch, only esp0 and the io bitmap in the TSS need to be updated. The call chain starts from schedule() in sched.c:

schedule()->switch_to()->__switch_to():

void fastcall __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread,
                 *next = &next_p->thread;
    struct tss_struct *tss = init_tss + smp_processor_id();  /* the current CPU's TSS */

    /*
     * Reload esp0, LDT and the page table pointer:
     */
    tss->esp0 = next->esp0;    /* update tss->esp0 with the next task's esp0 */

    /* copy the next task's io_bitmap into tss->io_bitmap */
    if (prev->ioperm || next->ioperm) {
        if (next->ioperm) {
            /*
             * 4 cachelines copy ... not good, but not that
             * bad either. Anyone got something better?
             * This only affects processes which use ioperm().
             * [Putting the TSSs into 4k-tlb mapped regions
             * and playing VM tricks to switch the IO bitmap
             * is not really acceptable.]
             */
            memcpy(tss->io_bitmap, next->io_bitmap,
                   IO_BITMAP_BYTES);
            tss->bitmap = IO_BITMAP_OFFSET;
        } else
            /*
             * a bitmap offset pointing outside of the TSS limit
             * causes a nicely controllable SIGSEGV if a process
             * tries to use a port IO instruction. The first
             * sys_ioperm() call sets up the bitmap properly.
             */
            tss->bitmap = INVALID_IO_BITMAP_OFFSET;
    }
}

And here is the corresponding code from a more recent kernel:

/*
 *    switch_to(x,yn) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %ax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread,
                 *next = &next_p->thread;
    int cpu = smp_processor_id();
    struct tss_struct *tss = &per_cpu(init_tss, cpu);
    fpu_switch_t fpu;

    /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

    fpu = switch_fpu_prepare(prev_p, next_p);

    /*
     * Reload esp0.
     */
    load_sp0(tss, next);

    /*
     * Save away %gs. No need to save %fs, as it was saved on the
     * stack on entry.  No need to save %es and %ds, as those are
     * always kernel segments while inside the kernel.  Doing this
     * before setting the new TLS descriptors avoids the situation
     * where we temporarily have non-reloadable segments in %fs
     * and %gs.  This could be an issue if the NMI handler ever
     * used %fs or %gs (it does not today), or if the kernel is
     * running inside of a hypervisor layer.
     */
    lazy_save_gs(prev->gs);

    /*
     * Load the per-thread Thread-Local Storage descriptor.
     */
    load_TLS(next, cpu);

    /*
     * Restore IOPL if needed.  In normal use, the flags restore
     * in the switch assembly will handle this.  But if the kernel
     * is running virtualized at a non-zero CPL, the popf will
     * not restore flags, so it must be done in a separate step.
     */
    if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
        set_iopl_mask(next->iopl);

    /*
     * Now maybe handle debug registers and/or IO bitmaps
     */
    if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
             task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
        __switch_to_xtra(prev_p, next_p, tss);

    /*
     * Leave lazy mode, flushing any hypercalls made here.
     * This must be done before restoring TLS segments so
     * the GDT and LDT are properly updated, and must be
     * done before math_state_restore, so the TS bit is up
     * to date.
     */
    arch_end_context_switch(next_p);

    /*
     * Restore %gs if needed (which is common)
     */
    if (prev->gs | next->gs)
        lazy_load_gs(next->gs);

    switch_fpu_finish(next_p, fpu);

    percpu_write(current_task, next_p);

    return prev_p;
}

Let us first analyze the comment block:

/*
*    switch_to(x,yn) should switch tasks from x to y.
*
* We fsave/fwait so that an exception goes off at the right time
* (as a call from the fsave or fwait in effect) rather than to
* the wrong process. Lazy FP saving no longer makes any sense
* with modern CPU's, and this simplifies a lot of things (SMP
* and UP become the same).
*
* NOTE! We used to use the x86 hardware context switching. The
* reason for not using it any more becomes apparent when you
* try to recover gracefully from saved state that is no longer
* valid (stale [i.e., no longer valid] segment register values in particular). With the
* hardware task-switch, there is no way to fix up bad state in
* a reasonable manner.
*
* The fact that Intel documents the hardware task-switching to
* be slow is a fairly red herring [i.e., beside the point] - this code is not noticeably
* faster. However, there _is_ some room for improvement here,
* so the performance issues may eventually be a valid point.
* More important, however, is the fact that this allows us much
* more flexibility.
*
* The return value (in %ax) will be the "prev" task after
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/

Roughly, it says that for the sake of flexibility Intel's hardware task switching was replaced with software task switching.

According to the article cited at the beginning, all processes executing on a given CPU use the same TSS segment:

    int cpu = smp_processor_id();
    struct tss_struct *tss = &per_cpu(init_tss, cpu);

And the only fields in it that carry useful information are esp0 and the I/O bitmap.

    /*
     * Reload esp0.
     */
    load_sp0(tss, next);

The per-process context information that used to be kept in the TSS segment is now stored in the following structure (thread_struct):

    struct thread_struct *prev = &prev_p->thread,
                 *next = &next_p->thread;

struct thread_struct {
    /* Cached TLS descriptors: */
    struct desc_struct    tls_array[GDT_ENTRY_TLS_ENTRIES];
    unsigned long        sp0;
    unsigned long        sp;
#ifdef CONFIG_X86_32
    unsigned long        sysenter_cs;
#else
    unsigned long        usersp;    /* Copy from PDA */
    unsigned short        es;
    unsigned short        ds;
    unsigned short        fsindex;
    unsigned short        gsindex;
#endif
#ifdef CONFIG_X86_32
    unsigned long        ip;
#endif
#ifdef CONFIG_X86_64
    unsigned long        fs;
#endif
    unsigned long        gs;
    /* Save middle states of ptrace breakpoints */
    struct perf_event    *ptrace_bps[HBP_NUM];
    /* Debug status used for traps, single steps, etc... */
    unsigned long           debugreg6;
    /* Keep track of the exact dr7 value set by the user */
    unsigned long           ptrace_dr7;
    /* Fault info: */
    unsigned long        cr2;
    unsigned long        trap_no;
    unsigned long        error_code;
    /* floating point and extended processor state */
    unsigned long        has_fpu;
    struct fpu        fpu;
#ifdef CONFIG_X86_32
    /* Virtual 86 mode info */
    struct vm86_struct __user *vm86_info;
    unsigned long        screen_bitmap;
    unsigned long        v86flags;
    unsigned long        v86mask;
    unsigned long        saved_sp0;
    unsigned int        saved_fs;
    unsigned int        saved_gs;
#endif
    /* IO permissions: */
    unsigned long        *io_bitmap_ptr;
    unsigned long        iopl;
    /* Max allowed port in the bitmap, in bytes: */
    unsigned        io_bitmap_max;
};

On (32-bit) Linux, the gs register is used to hold the address of the TLS area. (On Windows, the fs register is used to hold the address of the TEB structure.)

See: http://www.linuxidc.com/Linux/2012-06/64079p2.htm

Linux's glibc uses the GS register to access TLS; in other words, the segment that GS points to is the thread's TEB (in Windows terminology), i.e., its TLS. The benefit is that the information stored in TLS can be accessed efficiently without making a system call each time (although going through a system call would also work). This is possible because Intel's rules about what each segment register is for are fairly loose, so GS, FS and the other segment registers can be used for almost anything, including direct TLS access. When a thread starts, glibc points the GS register at the 6th GDT entry (the first TLS entry) and relies entirely on the segmentation mechanism for TLS addressing, so subsequent accesses to TLS data are as efficient as ordinary user-space accesses.
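
This GS-based access is easy to observe from user space. Below is a minimal sketch (32-bit x86 only, compile with gcc -m32; it assumes glibc's i386 TLS ABI, where %gs:0 holds the thread control block's self-pointer): a __thread variable that the compiler turns into a %gs-relative access, plus an inline-asm read of the thread pointer.

#include <stdio.h>

static __thread int tls_counter;          /* the compiler emits %gs-relative accesses for this */

static void *read_thread_pointer(void)
{
    void *tp;
    /* glibc keeps a pointer to the thread control block at %gs:0 (i386 TLS ABI). */
    __asm__ volatile ("movl %%gs:0, %0" : "=r" (tp));
    return tp;
}

int main(void)
{
    tls_counter++;
    printf("tls_counter = %d, thread pointer = %p\n",
           tls_counter, read_thread_pointer());
    return 0;
}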

The code below writes the current contents of the gs register back into the prev thread structure.

    /*
     * Save away %gs. No need to save %fs, as it was saved on the
     * stack on entry.  No need to save %es and %ds, as those are
     * always kernel segments while inside the kernel.  Doing this
     * before setting the new TLS descriptors avoids the situation
     * where we temporarily have non-reloadable segments in %fs
     * and %gs.  This could be an issue if the NMI handler ever
     * used %fs or %gs (it does not today), or if the kernel is
     * running inside of a hypervisor layer.
     */
    lazy_save_gs(prev->gs);

Next:

    /*
     * Load the per-thread Thread-Local Storage descriptor.
     */
    load_TLS(next, cpu);

 

This updates the TLS entries in the GDT.

 

#define load_TLS(t, cpu)            native_load_tls(t, cpu)

static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
    struct desc_struct *gdt = get_cpu_gdt_table(cpu);
    unsigned int i;

    for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
        gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
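
For context (this is not covered by the original article): the descriptors in thread.tls_array that load_TLS() copies into the GDT are normally installed by the set_thread_area(2) system call, which glibc uses when setting up TLS for a thread. A rough user-space sketch on x86 Linux (struct user_desc comes from <asm/ldt.h>; the field values here are only illustrative):

#include <asm/ldt.h>        /* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

static char tls_area[4096];

int main(void)
{
    struct user_desc desc;

    memset(&desc, 0, sizeof(desc));
    desc.entry_number = -1;                       /* -1: let the kernel pick a free TLS slot */
    desc.base_addr    = (unsigned long)tls_area;  /* segment base = our TLS block            */
    desc.limit        = sizeof(tls_area) - 1;
    desc.seg_32bit    = 1;
    desc.useable      = 1;

    if (syscall(SYS_set_thread_area, &desc) != 0) {
        perror("set_thread_area");
        return 1;
    }
    /* The kernel stores the descriptor in current->thread.tls_array and writes back
     * the GDT slot it chose; load_TLS() re-installs it on every context switch. */
    printf("TLS descriptor installed in GDT entry %u\n", desc.entry_number);
    return 0;
}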

 

It first obtains the current CPU's GDT:

struct gdt_page {
    struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));

DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);

static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
    return per_cpu(gdt_page, cpu).gdt;
}

The per_cpu mechanism ensures that every CPU has its own copy of the key data structures.

See: http://www.unixresources.net/linux/clf/linuxK/archive/00/00/47/91/479165.html

In the function described there, a dedicated data area is allocated for each CPU and the contents of .data.percpu are copied into it, so each CPU gets its own copy. Since the data has been moved from __per_cpu_start into each CPU's private area, the variables can no longer be accessed through their original addresses; for example, per_cpu__runqueues can no longer be accessed simply as per_cpu__runqueues. An offset adjustment is needed, namely the offset of each CPU's private area relative to __per_cpu_start, which for CPU i is __per_cpu_offset[i]. With that, the new address of any variable in the private area is easy to compute; for per_cpu__runqueues it becomes per_cpu__runqueues + __per_cpu_offset[i].
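
The following self-contained user-space sketch mimics that offset adjustment; all names here (percpu_template, __per_cpu_offset, per_cpu) only imitate the kernel's and are not its real implementation:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 4

/* Stand-in for the .data.percpu template section. */
struct percpu_data {
    long runqueue_len;
    long nr_switches;
};

static struct percpu_data percpu_template;   /* the "original" copy                  */
static long __per_cpu_offset[NR_CPUS];       /* per-CPU area base minus template base */

/* Access CPU 'cpu's private copy of a field by adding that CPU's offset. */
#define per_cpu(field, cpu) \
    (*(typeof(&percpu_template.field)) \
        ((char *)&percpu_template.field + __per_cpu_offset[cpu]))

int main(void)
{
    /* "Boot": give every CPU its own copy of the template area. */
    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        char *area = malloc(sizeof(percpu_template));
        memcpy(area, &percpu_template, sizeof(percpu_template));
        __per_cpu_offset[cpu] = area - (char *)&percpu_template;
    }

    per_cpu(runqueue_len, 2) = 7;            /* touches only CPU 2's copy */
    printf("cpu2 = %ld, cpu0 = %ld\n",
           per_cpu(runqueue_len, 2), per_cpu(runqueue_len, 0));
    return 0;
}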

That concludes the digression into load_TLS; back to the main topic.


    /*
     * Now maybe handle debug registers and/or IO bitmaps
     */
    if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
             task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
        __switch_to_xtra(prev_p, next_p, tss);

 

Next, the debug registers and the I/O bitmap are handled.

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
              struct tss_struct *tss)
{
    struct thread_struct *prev, *next;

    prev = &prev_p->thread;
    next = &next_p->thread;

    if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
        test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
        unsigned long debugctl = get_debugctlmsr();

        debugctl &= ~DEBUGCTLMSR_BTF;
        if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
            debugctl |= DEBUGCTLMSR_BTF;

        update_debugctlmsr(debugctl);
    }

    if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
        test_tsk_thread_flag(next_p, TIF_NOTSC)) {
        /* prev and next are different */
        if (test_tsk_thread_flag(next_p, TIF_NOTSC))
            hard_disable_TSC();
        else
            hard_enable_TSC();
    }

    if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
        /*
         * Copy the relevant range of the IO bitmap.
         * Normally this is 128 bytes or less:
         */
        memcpy(tss->io_bitmap, next->io_bitmap_ptr,
               max(prev->io_bitmap_max, next->io_bitmap_max));
    } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
        /*
         * Clear any possible leftover bits:
         */
        memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
    }
    propagate_user_return_notify(prev_p, next_p);
}
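
To tie this back to user space: the io_bitmap copying above only matters for tasks that have called ioperm(). A minimal sketch (x86 Linux, needs root or CAP_SYS_RAWIO; the port numbers are just an example, 0x70/0x71 being the CMOS/RTC index and data ports):

#include <stdio.h>
#include <sys/io.h>          /* ioperm(), inb(), outb() */

int main(void)
{
    /* Grant this process direct access to I/O ports 0x70 and 0x71. In the kernel
     * this allocates thread->io_bitmap_ptr and flags the task, so every later
     * switch to it copies the bitmap into the per-CPU TSS, as shown above. */
    if (ioperm(0x70, 2, 1) != 0) {
        perror("ioperm (are you root?)");
        return 1;
    }

    outb(0x00, 0x70);                     /* select CMOS register 0 (RTC seconds) */
    printf("RTC seconds register: 0x%02x\n", inb(0x71));

    ioperm(0x70, 2, 0);                   /* drop the port permissions again */
    return 0;
}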