Mac操作系统XNU内核(八)系统调用过程代码简单分析

(一)首先,系统调用有两种方式:

  •     0x80、0x81、0x82三个中断号;
  •     专门指令(至少分Intel架构和ARM架构),比如SYSENTER/SYSCALL

(二)话分两头,先说中断向量方式

  这是中断向量定义的部分代码:

INTERRUPT(0x7d)
INTERRUPT(0x7e)
USER_TRAP(0x7f, idt64_dtrace_ret) /* Required by dtrace "fasttrap" */

USER_TRAP_SPC(0x80,idt64_unix_scall)
USER_TRAP_SPC(0x81,idt64_mach_scall)
USER_TRAP_SPC(0x82,idt64_mdep_scall)

INTERRUPT(0x83)
INTERRUPT(0x84)
INTERRUPT(0x85)
INTERRUPT(0x86)

  (BSD风格的系统调用,中断号就是0x80)

  触发中断以及后面的逻辑,都在汇编文件idt64.s中实现,下面简单看看:

/*
 * System call handlers.
 * These are entered via a syscall interrupt. The system call number in %rax
 * is saved to the error code slot in the stack frame. We then branch to the
 * common state saving code.
 */
        
#ifndef UNIX_INT
#error NO UNIX INT!!!
#endif
Entry(idt64_unix_scall)
    swapgs                /* switch to kernel gs (cpu_data) */
    pushq    %rax            /* save system call number */
    PUSH_FUNCTION(HNDL_UNIX_SCALL)
    pushq    $(UNIX_INT)

  接下来执行PUSH_FUNCTION(HNDL_UNIX_SCALL),先展开PUSH_FUNCTION看看:

/*
 * PUSH_FUNCTION(func)
 *
 * Pushes the address of `func` onto the stack while leaving %rax unchanged.
 *
 * The active variant (#if 1) works in five steps:
 *   1. reserve an 8-byte slot on the stack;
 *   2. save %rax above that slot's new top;
 *   3. load the RIP-relative address of `func` into %rax;
 *   4. store it into the reserved slot, which now sits at 8(%rsp);
 *   5. restore %rax, leaving the address of `func` on top of the stack.
 *
 * Every line except the last must end with a backslash so that the whole
 * instruction sequence belongs to the macro body.
 */
#if 1
#define PUSH_FUNCTION(func)              \
    sub    $8, %rsp            ;\
    push    %rax                ;\
    leaq    func(%rip), %rax        ;\
    movq    %rax, 8(%rsp)            ;\
    pop    %rax
#else
#define PUSH_FUNCTION(func) pushq func
#endif

  系统调用号,在寄存器RAX,接下来看看HNDL_UNIX_SCALL:

Entry(hndl_unix_scall)

        TIME_TRAP_UENTRY

    movq    %gs:CPU_ACTIVE_THREAD,%rcx    /* get current thread     */
    movq    TH_TASK(%rcx),%rbx        /* point to current task  */
    incl    TH_SYSCALLS_UNIX(%rcx)        /* increment call count   */

    /* Check for active vtimers in the current task */
    TASK_VTIMER_CHECK(%rbx,%rcx)

    sti

    CCALL1(unix_syscall, %r15)
    /*
     * always returns through thread_exception_return
     */

  主要有一行:unix_syscall,看看unix_syscall函数的定义:

/*
 * Function:    unix_syscall
 *
 * Inputs:    regs    - pointer to i386 save area
 *
 * Outputs:    none
 */
void
unix_syscall(x86_saved_state_t *state)
{
    thread_t        thread;
    void            *vt;
    unsigned int        code;
    struct sysent        *callp;

    int            error;
    vm_offset_t        params;
    struct proc        *p;
    struct uthread        *uthread;
    x86_saved_state32_t    *regs;
    boolean_t        is_vfork;

    assert(is_saved_state32(state));
    regs = saved_state32(state);
#if DEBUG
    if (regs->eax == 0x800)
        thread_exception_return();
#endif
    thread = current_thread();
    uthread = get_bsdthread_info(thread);

    /* Get the approriate proc; may be different from task's for vfork() */
    is_vfork = uthread->uu_flag & UT_VFORK;
    if (__improbable(is_vfork != 0))
        p = current_proc();
    else 
        p = (struct proc *)get_bsdtask_info(current_task());

    /* Verify that we are not being called from a task without a proc */
    if (__improbable(p == NULL)) {
        regs->eax = EPERM;
        regs->efl |= EFL_CF;
        task_terminate_internal(current_task());
        thread_exception_return();
        /* NOTREACHED */
    }

    code = regs->eax & I386_SYSCALL_NUMBER_MASK;
    DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
                              code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
    params = (vm_offset_t) (regs->uesp + sizeof (int));

    regs->efl &= ~(EFL_CF);

    callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

    if (__improbable(callp == sysent)) {
        code = fuword(params);
        params += sizeof(int);
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
    }

.........

  通过寄存器中的数据得到code,再通过code取得数组sysent中的系统调用函数,交给callp;后面的代码冗长,这里就不全部贴出来咯。

  (关于sysent数组,改天详述)

  (三)再说系统调用专用指令方式(以Intel架构为例)

  SYSENTER用于32位,SYSCALL用于64位,只说SYSCALL吧,先看汇编:

Entry(hi64_syscall)
Entry(idt64_syscall)
L_syscall_continue:
    swapgs                /* Kapow! get per-cpu data area */
    mov    %rsp, %gs:CPU_UBER_TMP    /* save user stack */
    mov    %gs:CPU_UBER_ISF, %rsp    /* switch stack to pcb */

    /*
     * Save values in the ISF frame in the PCB
     * to cons up the saved machine state.
     */
    movl    $(USER_DS), ISF64_SS(%rsp)    
    movl    $(SYSCALL_CS), ISF64_CS(%rsp)    /* cs - a pseudo-segment */
    mov    %r11, ISF64_RFLAGS(%rsp)    /* rflags */
    mov    %rcx, ISF64_RIP(%rsp)        /* rip */
    mov    %gs:CPU_UBER_TMP, %rcx
    mov    %rcx, ISF64_RSP(%rsp)        /* user stack */
    mov    %rax, ISF64_ERR(%rsp)        /* err/rax - syscall code */
    movq    $(T_SYSCALL), ISF64_TRAPNO(%rsp)    /* trapno */
    leaq    HNDL_SYSCALL(%rip), %r11;
    movq    %r11, ISF64_TRAPFN(%rsp)
    mov    ISF64_RFLAGS(%rsp), %r11    /* Avoid leak, restore R11 */
    jmp    L_dispatch_U64            /* this can only be 64-bit */

  主要看看HNDL_SYSCALL:

/*
 * 64bit Tasks
 * System call entries via syscall only:
 *
 *    r15     x86_saved_state64_t
 *    rsp     kernel stack
 *
 *    both rsp and r15 are 16-byte aligned
 *    interrupts disabled
 *    direction flag cleared
 */

Entry(hndl_syscall)
    TIME_TRAP_UENTRY

    movq    %gs:CPU_ACTIVE_THREAD,%rcx    /* get current thread     */
    movq    TH_TASK(%rcx),%rbx        /* point to current task  */

    /* Check for active vtimers in the current task */
    TASK_VTIMER_CHECK(%rbx,%rcx)

    /*
     * We can be here either for a mach, unix machdep or diag syscall,
     * as indicated by the syscall class:
     */
    movl    R64_RAX(%r15), %eax        /* syscall number/class */
    movl    %eax, %edx
    andl    $(SYSCALL_CLASS_MASK), %edx    /* syscall class */
    cmpl    $(SYSCALL_CLASS_MACH<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_mach_scall64)
    cmpl    $(SYSCALL_CLASS_UNIX<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_unix_scall64)
    cmpl    $(SYSCALL_CLASS_MDEP<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_mdep_scall64)
    cmpl    $(SYSCALL_CLASS_DIAG<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_diag_scall64)

    /* Syscall class unknown */
    sti
    CCALL3(i386_exception, $(EXC_SYSCALL), %rax, $1)
    /* no return */

  可以看到,这里根据保存下来的RAX中的系统调用类别位(先与SYSCALL_CLASS_MASK相与,再和各个SYSCALL_CLASS_*常量比较)区分4种系统调用,BSD风格的系统调用只是其中1种,另外还有3种:mach syscall、machdep syscall、diag syscall;

  如果是BSD风格系统调用,那么就继续执行hndl_unix_scall64:

Entry(hndl_unix_scall64)
    incl    TH_SYSCALLS_UNIX(%rcx)        /* increment call count   */
    sti

    CCALL1(unix_syscall64, %r15)
    /*
     * always returns through thread_exception_return
     */

  只有一个函数调用,unix_syscall64,接下来看看这个函数的定义:

void
unix_syscall64(x86_saved_state_t *state)
{
    thread_t    thread;
    unsigned int    code;
    struct sysent    *callp;
    void        *uargp;
    int        args_in_regs;
    int        error;
    struct proc    *p;
    struct uthread    *uthread;
    x86_saved_state64_t *regs;

    assert(is_saved_state64(state));
    regs = saved_state64(state);
#if    DEBUG
    if (regs->rax == 0x2000800)
        thread_exception_return();
#endif
    thread = current_thread();
    uthread = get_bsdthread_info(thread);

    /* Get the approriate proc; may be different from task's for vfork() */
    if (__probable(!(uthread->uu_flag & UT_VFORK)))
        p = (struct proc *)get_bsdtask_info(current_task());
    else 
        p = current_proc();

    /* Verify that we are not being called from a task without a proc */
    if (__improbable(p == NULL)) {
        regs->rax = EPERM;
        regs->isf.rflags |= EFL_CF;
        task_terminate_internal(current_task());
        thread_exception_return();
        /* NOTREACHED */
    }
    args_in_regs = 6;

    code = regs->rax & SYSCALL_NUMBER_MASK;
    DEBUG_KPRINT_SYSCALL_UNIX(
        "unix_syscall64: code=%d(%s) rip=%llx\n",
        code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
    callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
    uargp = (void *)(&regs->rdi);

    if (__improbable(callp == sysent)) {
            /*
         * indirect system call... system call number
         * passed as 'arg0'
         */
            code = regs->rdi;
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
        uargp = (void *)(&regs->rsi);
        args_in_regs = 5;
    }

..........

  可以看到这里首先从x86_saved_state_t中取得系统调用号code,然后从数组sysent中得到系统调用函数,给callp;再后面是一些参数处理,和callp的执行。

  接下去就到了具体的系统调用函数。

  (大概介绍如上,有人拍砖吗?一起了解啊~)

原文地址:https://www.cnblogs.com/andypeker/p/4385802.html