TSS Usage in the Linux Kernel

Reference: http://blog.chinaunix.net/uid-22695386-id-272098.html

Kernels before Linux 2.4 had a limit on the maximum number of processes. The reason was that every process had its own TSS (Task State Segment) and LDT (Local Descriptor Table), and their descriptors had to be placed in the GDT, which can hold at most 8192 descriptors. Subtracting the 12 descriptors used by the system, the maximum number of processes is (8192 - 12) / 2 = 4090 (a small worked example follows the declaration below). Since Linux 2.4, all processes share the same TSS; more precisely, there is one TSS per CPU, and all processes running on a given CPU use that CPU's TSS. The TSS is declared in asm-i386/processor.h as follows:

extern struct tss_struct init_tss[NR_CPUS];
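To make the process-count arithmetic above concrete, here is a small illustration (the macro names are made up for this example; they are not kernel symbols):

/* Each process needs two GDT slots: one for its TSS descriptor
 * and one for its LDT descriptor. */
#define GDT_SLOTS       8192    /* total GDT capacity */
#define SYSTEM_SLOTS      12    /* descriptors reserved by the system */

int max_processes = (GDT_SLOTS - SYSTEM_SLOTS) / 2;    /* = 4090 */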

The TSS is initialized and loaded along the path start_kernel()->trap_init()->cpu_init():

void __init cpu_init (void)
{
    int nr = smp_processor_id();            /* id of the current CPU */
    struct tss_struct *t = &init_tss[nr];   /* the TSS used by this CPU */

    t->esp0 = current->thread.esp0;         /* update esp0 in the TSS with the current task's esp0 */
    set_tss_desc(nr,t);                     /* install the TSS descriptor in the GDT */
    gdt_table[__TSS(nr)].b &= 0xfffffdff;   /* clear the busy bit of that descriptor */
    load_TR(nr);                            /* load the task register (TR) */
    load_LDT(&init_mm.context);             /* load the LDT */
}

As we know, a hardware task switch uses the TSS to save all the registers (before 2.4 the kernel switched tasks with a jmp to the new task's TSS selector), and when an interrupt occurs the CPU also reads the ring-0 esp0 from the TSS. So if all processes share the same TSS, how does task switching work? The answer is that since 2.4 the kernel no longer uses hardware switching but software switching: the registers are no longer saved in the TSS but in task->thread, and of the TSS only esp0 and the I/O permission bitmap are still used. During a context switch, therefore, only esp0 and the I/O bitmap in the TSS have to be updated. The code is in sched.c:

schedule()->switch_to()->__switch_to()

void fastcall __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
     *next = &next_p->thread;
struct tss_struct *tss = init_tss + smp_processor_id(); /* the TSS of the current CPU */

/*
* Reload esp0, LDT and the page table pointer:
*/
tss->esp0 = next->esp0; /* update tss->esp0 with the next task's esp0 */

/* copy the next task's io_bitmap into tss->io_bitmap */

if (prev->ioperm || next->ioperm) {
   if (next->ioperm) {
    /*
    * 4 cachelines copy ... not good, but not that
    * bad either. Anyone got something better?
    * This only affects processes which use ioperm().
    * [Putting the TSSs into 4k-tlb mapped regions
    * and playing VM tricks to switch the IO bitmap
    * is not really acceptable.]
    */
    memcpy(tss->io_bitmap, next->io_bitmap,
     IO_BITMAP_BYTES);
    tss->bitmap = IO_BITMAP_OFFSET;
   } else
    /*
    * a bitmap offset pointing outside of the TSS limit
    * causes a nicely controllable SIGSEGV if a process
    * tries to use a port IO instruction. The first
    * sys_ioperm() call sets up the bitmap properly.
    */
    tss->bitmap = INVALID_IO_BITMAP_OFFSET;
}
}

The same function in a more recent kernel looks like this:

/*
 *    switch_to(x,yn) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %ax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread,
                 *next = &next_p->thread;
    int cpu = smp_processor_id();
    struct tss_struct *tss = &per_cpu(init_tss, cpu);
    fpu_switch_t fpu;

    /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

    fpu = switch_fpu_prepare(prev_p, next_p);

    /*
     * Reload esp0.
     */
    load_sp0(tss, next);

    /*
     * Save away %gs. No need to save %fs, as it was saved on the
     * stack on entry.  No need to save %es and %ds, as those are
     * always kernel segments while inside the kernel.  Doing this
     * before setting the new TLS descriptors avoids the situation
     * where we temporarily have non-reloadable segments in %fs
     * and %gs.  This could be an issue if the NMI handler ever
     * used %fs or %gs (it does not today), or if the kernel is
     * running inside of a hypervisor layer.
     */
    lazy_save_gs(prev->gs);

    /*
     * Load the per-thread Thread-Local Storage descriptor.
     */
    load_TLS(next, cpu);

    /*
     * Restore IOPL if needed.  In normal use, the flags restore
     * in the switch assembly will handle this.  But if the kernel
     * is running virtualized at a non-zero CPL, the popf will
     * not restore flags, so it must be done in a separate step.
     */
    if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
        set_iopl_mask(next->iopl);

    /*
     * Now maybe handle debug registers and/or IO bitmaps
     */
    if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
             task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
        __switch_to_xtra(prev_p, next_p, tss);

    /*
     * Leave lazy mode, flushing any hypercalls made here.
     * This must be done before restoring TLS segments so
     * the GDT and LDT are properly updated, and must be
     * done before math_state_restore, so the TS bit is up
     * to date.
     */
    arch_end_context_switch(next_p);

    /*
     * Restore %gs if needed (which is common)
     */
    if (prev->gs | next->gs)
        lazy_load_gs(next->gs);

    switch_fpu_finish(next_p, fpu);

    percpu_write(current_task, next_p);

    return prev_p;
}

Let's look at the comment first:

/*
*    switch_to(x,yn) should switch tasks from x to y.
*
* We fsave/fwait so that an exception goes off at the right time
* (as a call from the fsave or fwait in effect) rather than to
* the wrong process. Lazy FP saving no longer makes any sense
* with modern CPU's, and this simplifies a lot of things (SMP
* and UP become the same).
*
* NOTE! We used to use the x86 hardware context switching. The
* reason for not using it any more becomes apparent when you
* try to recover gracefully from saved state that is no longer
* valid (stale segment register values in particular). With the
* hardware task-switch, there is no way to fix up bad state in
* a reasonable manner.
*
* The fact that Intel documents the hardware task-switching to
* be slow is a fairly red herring - this code is not noticeably
* faster. However, there _is_ some room for improvement here,
* so the performance issues may eventually be a valid point.
* More important, however, is the fact that this allows us much
* more flexibility.
*
* The return value (in %ax) will be the "prev" task after
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/

Roughly speaking: for the sake of flexibility, Intel's hardware task switching has been replaced with software task switching.

As the article cited at the beginning explains, all processes executing on a given CPU use the same TSS segment:

    int cpu = smp_processor_id();
    struct tss_struct *tss = &per_cpu(init_tss, cpu);

and the only fields in it that still carry meaningful information are esp0 and the I/O bitmap.
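For reference, here is a simplified sketch of that per-CPU TSS (field names follow the 3.x kernel sources, trimmed down to the parts the scheduler actually touches; treat it as an illustration rather than the exact definition):

struct x86_hw_tss {                    /* hardware-defined 32-bit TSS layout */
    unsigned short  back_link, __blh;
    unsigned long   sp0;               /* ring-0 stack pointer, refreshed on every switch */
    unsigned short  ss0, __ss0h;       /* ring-0 stack segment */
    /* ... the remaining register-save slots (eip, eflags, general and segment
     * registers, cr3, LDT selector) are unused with software switching ... */
    unsigned short  trace;
    unsigned short  io_bitmap_base;    /* offset of the I/O permission bitmap */
} __attribute__((packed));

struct tss_struct {
    struct x86_hw_tss  x86_tss;
    unsigned long      io_bitmap[IO_BITMAP_LONGS + 1];  /* per-CPU copy of the I/O bitmap */
};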

    /*
     * Reload esp0.
     */
    load_sp0(tss, next);
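On native hardware (no paravirtualization) load_sp0() essentially just writes the next thread's kernel stack pointer into the per-CPU TSS. A minimal sketch, omitting the SYSENTER bookkeeping the real function also performs:

static inline void
native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
{
    tss->x86_tss.sp0 = thread->sp0;   /* ring-0 stack the CPU will use on the next kernel entry */
}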

All the per-process context that used to live in the TSS is now kept in the following structure (thread_struct):

    struct thread_struct *prev = &prev_p->thread,
                 *next = &next_p->thread;

struct thread_struct {
    /* Cached TLS descriptors: */
    struct desc_struct    tls_array[GDT_ENTRY_TLS_ENTRIES];
    unsigned long        sp0;
    unsigned long        sp;
#ifdef CONFIG_X86_32
    unsigned long        sysenter_cs;
#else
    unsigned long        usersp;    /* Copy from PDA */
    unsigned short        es;
    unsigned short        ds;
    unsigned short        fsindex;
    unsigned short        gsindex;
#endif
#ifdef CONFIG_X86_32
    unsigned long        ip;
#endif
#ifdef CONFIG_X86_64
    unsigned long        fs;
#endif
    unsigned long        gs;
    /* Save middle states of ptrace breakpoints */
    struct perf_event    *ptrace_bps[HBP_NUM];
    /* Debug status used for traps, single steps, etc... */
    unsigned long           debugreg6;
    /* Keep track of the exact dr7 value set by the user */
    unsigned long           ptrace_dr7;
    /* Fault info: */
    unsigned long        cr2;
    unsigned long        trap_no;
    unsigned long        error_code;
    /* floating point and extended processor state */
    unsigned long        has_fpu;
    struct fpu        fpu;
#ifdef CONFIG_X86_32
    /* Virtual 86 mode info */
    struct vm86_struct __user *vm86_info;
    unsigned long        screen_bitmap;
    unsigned long        v86flags;
    unsigned long        v86mask;
    unsigned long        saved_sp0;
    unsigned int        saved_fs;
    unsigned int        saved_gs;
#endif
    /* IO permissions: */
    unsigned long        *io_bitmap_ptr;
    unsigned long        iopl;
    /* Max allowed port in the bitmap, in bytes: */
    unsigned        io_bitmap_max;
};

On Linux, the gs register is used to hold the address of the TLS area. (On Windows, the fs register holds the address of the TEB structure.)

See: http://www.linuxidc.com/Linux/2012-06/64079p2.htm

Linux's glibc uses the GS register to access TLS; in other words, the segment GS points to is the thread's TEB (in Windows terminology), i.e. its TLS. The benefit is that the data stored in the TLS can be accessed efficiently without issuing a system call every time, although a syscall-based approach would also work. This is possible because Intel's rules for the segment registers are fairly loose, so GS, FS and the like can be used for almost anything, including direct TLS access. In the end, when a thread starts, glibc points GS at the 6th GDT entry and relies entirely on the segmentation mechanism to address the TLS; from then on, accessing TLS data is as efficient as any other user-space memory access.
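A quick user-space illustration of that efficiency: a __thread variable on 32-bit Linux compiles down to an ordinary %gs-relative memory access, with no system call on the access path (the variable name below is made up for the example):

#include <stdio.h>

static __thread int per_thread_counter;   /* one instance per thread */

int main(void)
{
    /* For a TLS variable defined in the executable itself, gcc typically
     * emits a plain %gs-relative load such as
     *     movl %gs:per_thread_counter@ntpoff, %eax
     * i.e. it goes through the segment glibc set up at thread start. */
    per_thread_counter++;
    printf("%d\n", per_thread_counter);
    return 0;
}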

The code below saves the current contents of the CPU's gs register back into the prev thread structure:

    /*
     * Save away %gs. No need to save %fs, as it was saved on the
     * stack on entry.  No need to save %es and %ds, as those are
     * always kernel segments while inside the kernel.  Doing this
     * before setting the new TLS descriptors avoids the situation
     * where we temporarily have non-reloadable segments in %fs
     * and %gs.  This could be an issue if the NMI handler ever
     * used %fs or %gs (it does not today), or if the kernel is
     * running inside of a hypervisor layer.
     */
    lazy_save_gs(prev->gs);

Next:

    /*
     * Load the per-thread Thread-Local Storage descriptor.
     */
    load_TLS(next, cpu);

load_TLS() updates the GDT entries that describe this thread's TLS segments:

#define load_TLS(t, cpu)            native_load_tls(t, cpu)

static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
    struct desc_struct *gdt = get_cpu_gdt_table(cpu);
    unsigned int i;

    for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
        gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}

It first obtains the GDT of the current CPU:

struct gdt_page {
    struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));

DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);

static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
    return per_cpu(gdt_page, cpu).gdt;
}

The per_cpu mechanism guarantees that every CPU has its own copy of critical data structures.

See: http://www.unixresources.net/linux/clf/linuxK/archive/00/00/47/91/479165.html

In that function, a private data area is allocated for each CPU and the contents of the .data.percpu section are copied into it, so that every CPU gets its own copy. Because the data has been moved from __per_cpu_start into each CPU's private area, a per-CPU variable can no longer be accessed at its original address; an offset has to be added, namely the offset of that CPU's private area relative to __per_cpu_start, which for CPU i is __per_cpu_offset[i]. The new address of any variable in the area is then easy to compute: per_cpu__runqueues, for example, becomes per_cpu__runqueues + __per_cpu_offset[i].
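As a usage sketch, a per-CPU variable is declared with DEFINE_PER_CPU and accessed through per_cpu(), which internally applies exactly this offset adjustment (the variable and function below are hypothetical, for illustration only):

#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(unsigned long, my_counter);   /* one copy per CPU */

static void bump_local_counter(void)
{
    int cpu = get_cpu();            /* disable preemption and get our CPU id */
    per_cpu(my_counter, cpu)++;     /* roughly *(&my_counter + __per_cpu_offset[cpu]) */
    put_cpu();                      /* re-enable preemption */
}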

That concludes the detour into load_TLS; back to __switch_to().


    /*
     * Now maybe handle debug registers and/or IO bitmaps
     */
    if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
             task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
        __switch_to_xtra(prev_p, next_p, tss);

This handles the debug registers and the I/O bitmap:

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
              struct tss_struct *tss)
{
    struct thread_struct *prev, *next;

    prev = &prev_p->thread;
    next = &next_p->thread;

    if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
        test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
        unsigned long debugctl = get_debugctlmsr();

        debugctl &= ~DEBUGCTLMSR_BTF;
        if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
            debugctl |= DEBUGCTLMSR_BTF;

        update_debugctlmsr(debugctl);
    }

    if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
        test_tsk_thread_flag(next_p, TIF_NOTSC)) {
        /* prev and next are different */
        if (test_tsk_thread_flag(next_p, TIF_NOTSC))
            hard_disable_TSC();
        else
            hard_enable_TSC();
    }

    if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
        /*
         * Copy the relevant range of the IO bitmap.
         * Normally this is 128 bytes or less:
         */
        memcpy(tss->io_bitmap, next->io_bitmap_ptr,
               max(prev->io_bitmap_max, next->io_bitmap_max));
    } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
        /*
         * Clear any possible leftover bits:
         */
        memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
    }
    propagate_user_return_notify(prev_p, next_p);
}
Original post: https://www.cnblogs.com/long123king/p/3501853.html