select源码分析(linux2.6.11)

本文以tcp poll为例子来分析select的源码,下面是函数调用顺序。
select--->sys_select--->do_select--->sock_poll--->tcp_poll
/*
 * sys_select - kernel entry point of the select() call chain analysed here.
 *
 * n    : number of descriptors to examine. A negative value fails with
 *        -EINVAL; a value above current->files->max_fdset is clamped to it.
 * inp, outp, exp : user-space read / write / exception sets. They are read
 *        with get_fd_set() and the result sets are handed back through
 *        set_fd_set().
 * tvp  : optional user-space timeout. When NULL the timeout stays at
 *        MAX_SCHEDULE_TIMEOUT.
 *
 * Returns the value produced by do_select() on success, or a negative
 * error: -EINVAL (negative n or negative time fields), -ENOMEM (bitmap
 * allocation failed), -EFAULT (set_fd_set() failed), -ERESTARTNOHAND
 * (do_select() returned 0 while a signal is pending), or whatever
 * verify_area() / __get_user() / get_fd_set() reported.
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
    fd_set_bits fds;
    char *bits;
    long timeout;
    int ret, size, max_fdset;

    timeout = MAX_SCHEDULE_TIMEOUT;
    if (tvp) {
        time_t sec, usec;

        /* Check the user pointer, then fetch both fields; the first failure wins. */
        if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp)))
            || (ret = __get_user(sec, &tvp->tv_sec))
            || (ret = __get_user(usec, &tvp->tv_usec)))
            goto out_nofds;

        ret = -EINVAL;
        if (sec < 0 || usec < 0)
            goto out_nofds;

        /*
         * Convert to ticks: usec goes through ROUND_UP() with a granularity
         * of 1000000/HZ and every second adds HZ. When sec is not below
         * MAX_SELECT_SECONDS the timeout is left at MAX_SCHEDULE_TIMEOUT.
         */
        if ((unsigned long) sec < MAX_SELECT_SECONDS) {
            timeout = ROUND_UP(usec, 1000000/HZ);
            timeout += sec * (unsigned long) HZ;
        }
    }

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fdset can increase, so grab it once to avoid race */
    max_fdset = current->files->max_fdset;
    if (n > max_fdset)
        n = max_fdset;

    /*
     * One buffer is carved into six consecutive regions of `size` bytes:
     * the three request bitmaps followed by the three result bitmaps.
     * NOTE(review): presumably select_bits_alloc(size) reserves room for
     * all six regions - confirm against its definition.
     */
    ret = -ENOMEM;
    size = FDS_BYTES(n);
    bits = select_bits_alloc(size);
    if (!bits)
        goto out_nofds;
    fds.in      = (unsigned long *)  bits;
    fds.out     = (unsigned long *) (bits +   size);
    fds.ex      = (unsigned long *) (bits + 2*size);
    fds.res_in  = (unsigned long *) (bits + 3*size);
    fds.res_out = (unsigned long *) (bits + 4*size);
    fds.res_ex  = (unsigned long *) (bits + 5*size);

  /* Copy the read, write and exception bits of every fd of interest from user space into the kernel. */
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

  /* The main work happens in this function. */
    ret = do_select(n, &fds, &timeout);

    /*
     * Unless the STICKY_TIMEOUTS personality bit is set, write the timeout
     * value left by do_select() back to user space, converted from ticks to
     * seconds and microseconds. The put_user() results are not checked.
     */
    if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
        time_t sec = 0, usec = 0;
        if (timeout) {
            sec = timeout / HZ;
            usec = timeout % HZ;
            usec *= (1000000/HZ);
        }
        put_user(sec, &tvp->tv_sec);
        put_user(usec, &tvp->tv_usec);
    }

    if (ret < 0)
        goto out;
    if (!ret) {
        /* Nothing was ready: -ERESTARTNOHAND if a signal is pending, otherwise 0. */
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    /* Hand the three result bitmaps back to the caller; any failure becomes -EFAULT. */
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    /* Reached only after the bitmap buffer has been allocated. */
    select_bits_free(bits, size);
out_nofds:
    return ret;
}
/*
 * do_select - core scanning loop behind sys_select().
 *
 * n       : descriptor bound from the caller; replaced by the value that
 *           max_select_fd() returns.
 * fds     : kernel-side bitmaps - in/out/ex are the requests, res_in/res_out/
 *           res_ex receive the results.
 * timeout : in/out. Read on entry; on return it holds the value left after
 *           the last schedule_timeout() call.
 *
 * Returns the number of (fd, event class) bits recorded in the result
 * bitmaps, 0 when nothing became ready, a negative value coming from
 * max_select_fd(), or table.error when it is non-zero.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i;
    long __timeout = *timeout;

    spin_lock(&current->files->file_lock);
    retval = max_select_fd(n, fds);
    spin_unlock(&current->files->file_lock);

    if (retval < 0)
        return retval;
    n = retval;

    poll_initwait(&table);
    wait = &table.pt;
    /* With a zero timeout no poll table is handed to the poll callbacks. */
    if (!__timeout)
        wait = NULL;
    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        /* Mark the current process as interruptibly sleeping; it has not actually been scheduled out yet. */
        set_current_state(TASK_INTERRUPTIBLE);

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        /* One iteration per bitmap word; i (the fd number) is advanced inside the body. */
        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            struct file_operations *f_op = NULL;
            struct file *file = NULL;

            /* A word with no bit of interest is skipped in one step instead of wasting time on each bit. */
            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            if (all_bits == 0) {
                i += __NFDBITS;
                continue;
            }

            /* Walk over every bit of interest in this word. */
            for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                if (i >= n)
                    break;
                if (!(bit & all_bits))
                    continue;
                file = fget(i);
                if (file) {
                    f_op = file->f_op;
                    mask = DEFAULT_POLLMASK;
                    if (f_op && f_op->poll)
                        /* Call the poll function: it puts the current process on the wait queue and installs the
                           wake-up function (the driver calls it to wake the process when data arrives), and it
                           reports whether the fd is readable, writable or in an exceptional state
                           (for sockets, sock_poll is installed through socket_file_ops).
                           Once retval is non-zero, NULL is passed instead of the poll table. */
                        mask = (*f_op->poll)(file, retval ? NULL : wait);
                    fput(file);
                    /* Readable */
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                    }
                    /* Writable */
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                    }
                    /* Exceptional condition */
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                    }
                }
                /**
                * Reschedule the process if necessary.
                */
                cond_resched();
            }
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
        }
        /* After the scan, check retval: if anything is readable, writable or exceptional it is non-zero
           and the endless loop is left. Later passes no longer hand a poll table to the callbacks. */
        wait = NULL;
        if (retval || !__timeout || signal_pending(current))
            break;
        if(table.error) {
            retval = table.error;
            break;
        }
        /* Nothing of interest was ready. If select was called with an infinite wait, the call below
           schedules the current process out. When the driver receives data it calls the wake-up function
           installed by the poll function, which wakes this process so that it re-checks the bits of interest. */
        __timeout = schedule_timeout(__timeout);
    }
    __set_current_state(TASK_RUNNING);

    poll_freewait(&table);

    /*
     * Up-to-date the caller timeout.
     */
    *timeout = __timeout;
    return retval;
}
/*
 * poll file operation for sockets. No kernel lock is held here.
 *
 * Errors cannot be reported back to poll, so the result is only ever an
 * event mask. The work is forwarded to the poll handler of the socket's
 * protocol operations (tcp_poll in the example followed by this article).
 */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
    struct socket *sock = SOCKET_I(file->f_dentry->d_inode);

    return sock->ops->poll(file, sock, wait);
}
/*
 * tcp_poll - build the poll event mask for a TCP socket.
 *
 * file, wait : passed straight to poll_wait() together with sk->sk_sleep.
 * sock       : the socket whose struct sock (sock->sk) is inspected.
 *
 * Returns the result of tcp_listen_poll() for a socket in TCP_LISTEN state,
 * otherwise a mask assembled from POLLERR, POLLHUP, POLLIN | POLLRDNORM,
 * POLLOUT | POLLWRNORM and POLLPRI according to the checks below.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
    unsigned int mask;
    struct sock *sk = sock->sk;
    struct tcp_sock *tp = tcp_sk(sk);

    /* Add the current process to the wait queue, together with its wake-up function. */
    poll_wait(file, sk->sk_sleep, wait);
    if (sk->sk_state == TCP_LISTEN)
        return tcp_listen_poll(sk, wait);

    mask = 0;
    if (sk->sk_err)
        mask = POLLERR;

    /* Both directions shut down, or the socket is closed: report hang-up. */
    if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
        mask |= POLLHUP;
    /* A receive-side shutdown is reported as readable. */
    if (sk->sk_shutdown & RCV_SHUTDOWN)
        mask |= POLLIN | POLLRDNORM;

    /* Connected? */
    if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        /* Potential race condition. If read of tp below will
         * escape above sk->sk_state, we can be illegally awaken
         * in SYN_* states. */
        if ((tp->rcv_nxt != tp->copied_seq) &&
            (tp->urg_seq != tp->copied_seq ||
             tp->rcv_nxt != tp->copied_seq + 1 ||
             sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
            mask |= POLLIN | POLLRDNORM;

        /* Writability is only considered while the send side is not shut down. */
        if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
            if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
                mask |= POLLOUT | POLLWRNORM;
            } else {  /* send SIGIO later */
                set_bit(SOCK_ASYNC_NOSPACE,
                    &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                /* Race breaker. If space is freed after
                 * wspace test but before the flags are set,
                 * IO signal will be lost.
                 */
                if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
                    mask |= POLLOUT | POLLWRNORM;
            }
        }

        if (tp->urg_data & TCP_URG_VALID)
            mask |= POLLPRI;
    }
    return mask;
}
/*
 * Where the waiting is really set up; called once for every monitored file.
 *
 * Stores (filp, wait_address) in the next free entry of the poll_wqueues
 * that embeds _p and queues the current process on wait_address. A new page
 * of entries is obtained when there is no page yet or the current one is
 * full; if that fails, -ENOMEM is recorded in the poll_wqueues error field,
 * the task state is set back to TASK_RUNNING and nothing is queued.
 */
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
{
    struct poll_wqueues *pwq = container_of(_p, struct poll_wqueues, pt);
    struct poll_table_page *page = pwq->table;
    struct poll_table_entry *slot;

    /* No page yet, or the current page has no room left: chain in a fresh one. */
    if (!page || POLL_TABLE_FULL(page)) {
        struct poll_table_page *fresh;

        fresh = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!fresh) {
            pwq->error = -ENOMEM;
            __set_current_state(TASK_RUNNING);
            return;
        }
        fresh->entry = fresh->entries;
        fresh->next = page;
        pwq->table = fresh;
        page = fresh;
    }

    /* Claim the next free entry of the page. */
    slot = page->entry;
    page->entry = slot + 1;

    get_file(filp);
    slot->filp = filp;
    slot->wait_address = wait_address;

    /* Put the current process on the wait queue; the entry carries the wake-up function. */
    init_waitqueue_entry(&slot->wait, current);
    add_wait_queue(wait_address, &slot->wait);
}
/**
 * Non-exclusive waiters are woken by default_wake_function, which is a
 * simple wrapper around try_to_wake_up: the task stored in the wait queue
 * entry is passed on together with mode and sync. The key argument is not
 * used.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
    return try_to_wake_up(curr->task, mode, sync);
}
/**
 * Wakes up a sleeping or stopped process by setting its state to TASK_RUNNING
 * and inserting it into a CPU run queue.
 * p     - descriptor of the process to be woken.
 * state - mask of the process states that may be woken.
 * sync  - flag that keeps the woken process from preempting the process
 *         currently running on the local CPU.
 *
 * Returns 1 when the process was activated by this call, 0 otherwise.
 */
static int try_to_wake_up(task_t * p, unsigned int state, int sync)
{
    int cpu, this_cpu, success = 0;
    unsigned long flags;
    long old_state;
    runqueue_t *rq;
#ifdef CONFIG_SMP
    unsigned long load, this_load;
    struct sched_domain *sd;
    int new_cpu;
#endif

    /**
     * Call task_rq_lock to disable interrupts and take the lock of the run queue of the CPU the
     * process belongs to (this may differ from the current CPU's run queue, and the process being
     * woken may not be on the queue at all).
     */
    rq = task_rq_lock(p, &flags);
    schedstat_inc(rq, ttwu_cnt);
    old_state = p->state;
    /**
     * Only processes whose state matches `state` are woken. If the state of the process is not
     * contained in `state`, leave immediately; this wake-up has no effect.
     * Example: a signal does not wake a process in TASK_UNINTERRUPTIBLE state.
     */
    if (!(old_state & state))
        goto out;

    /**
     * If the process already belongs to a run queue, jump to out_running, where its state is
     * changed to TASK_RUNNING before leaving.
     */
    if (p->array)
        goto out_running;

    cpu = task_cpu(p);
    this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
    /**
     * On SMP, check whether the process being woken should be migrated from the run queue of the
     * CPU it last ran on to the run queue of another CPU.
     */

    /**
     * The task being woken is currently running on a CPU, so migration need not be considered.
     */
    if (unlikely(task_running(rq, p)))
        goto out_activate;

    /**
     * Prefer to run the process on the CPU it is already assigned to.
     */
    new_cpu = cpu;

    /**
     * If the process's CPU is the CPU of the current process, or the process being woken is not
     * allowed to run on the current CPU, jump to out_set_cpu.
     */
    if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
        goto out_set_cpu;

    load = source_load(cpu);
    this_load = target_load(this_cpu);

    /*
     * If sync wakeup then subtract the (maximum possible) effect of
     * the currently running task from the load of the current CPU:
     */
    if (sync)
        this_load -= SCHED_LOAD_SCALE;

    /* Don't pull the task off an idle CPU to a busy one */
    /**
     * If the load of the CPU holding the woken task is smaller than the load of the current CPU,
     * also jump to out_set_cpu.
     */
    if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
        goto out_set_cpu;

    /**
     * Try to migrate the process to the local CPU.
     */
    new_cpu = this_cpu; /* Wake to this CPU if we can */

    /*
     * Scan domains for affine wakeup and passive balancing
     * possibilities.
     */
    for_each_domain(this_cpu, sd) {
        unsigned int imbalance;
        /*
         * Start passive balancing when half the imbalance_pct
         * limit is reached.
         */
        imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;

        if ((sd->flags & SD_WAKE_AFFINE) &&
                !task_hot(p, rq->timestamp_last_tick, sd)) {
            /*
             * This domain has SD_WAKE_AFFINE and p is cache cold
             * in this domain.
             */
            if (cpu_isset(cpu, sd->span)) {
                schedstat_inc(sd, ttwu_wake_affine);
                goto out_set_cpu;
            }
        } else if ((sd->flags & SD_WAKE_BALANCE) &&
                imbalance*this_load <= 100*load) {
            /*
             * This domain has SD_WAKE_BALANCE and there is
             * an imbalance.
             */
            if (cpu_isset(cpu, sd->span)) {
                schedstat_inc(sd, ttwu_wake_balance);
                goto out_set_cpu;
            }
        }
    }

    new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
    schedstat_inc(rq, ttwu_attempts);
    new_cpu = wake_idle(new_cpu, p);
    if (new_cpu != cpu) {
        schedstat_inc(rq, ttwu_moved);
        set_task_cpu(p, new_cpu);
        task_rq_unlock(rq, &flags);
        /* might preempt at this point */
        /* The lock was dropped, so the state and queue checks are repeated after re-locking. */
        rq = task_rq_lock(p, &flags);
        old_state = p->state;
        if (!(old_state & state))
            goto out;
        if (p->array)
            goto out_running;

        this_cpu = smp_processor_id();
        cpu = task_cpu(p);
    }

out_activate:
#endif /* CONFIG_SMP */
    /**
     * If the state was TASK_UNINTERRUPTIBLE, decrement nr_uninterruptible and set activated to -1
     * to record the fact that the process was woken from TASK_UNINTERRUPTIBLE state.
     */
    if (old_state == TASK_UNINTERRUPTIBLE) {
        rq->nr_uninterruptible--;
        /*
         * Tasks on involuntary sleep don't earn
         * sleep_avg beyond just interactive state.
         */
        p->activated = -1;
    }

    /*
     * Sync wakeups (i.e. those types of wakeups where the waker
     * has indicated that it will leave the CPU in short order)
     * don't trigger a preemption, if the woken up task will run on
     * this cpu. (in this case the 'I will reschedule' promise of
     * the waker guarantees that the freshly woken up task is going
     * to be considered on this CPU.)
     */
    /**
     * The activate_task function performs the following steps in order:
     *     1: Call sched_clock to obtain the current timestamp; if the target CPU is not the local
     *        CPU, the drift of the timer interrupt is compensated as well.
     *     2: Call recalc_task_prio to compute the dynamic priority of the process.
     *     3: Set activated as the situation requires.
     *     4: Set the timestamp of the process.
     *     5: Insert the process into the set of processes.
     */
    activate_task(p, rq, cpu == this_cpu);
    /**
     * If the target CPU is not the local CPU, or the SYNC flag is not set, check whether the
     * dynamic priority of the new process is higher than that of the run queue's current process.
     */
    if (!sync || cpu != this_cpu) {
        if (TASK_PREEMPTS_CURR(p, rq))/* The process has a higher priority than the current process of its queue, so preemption is needed. */
            /**
             * The resched_task function carries out the preemption.
             * On a uniprocessor it merely sets the TIF_NEED_RESCHED flag.
             * On a multiprocessor it may send an IPI to force the CPU to reschedule.
             */
            resched_task(rq->curr);
    }
    success = 1;

out_running:
    /**
     * Set the process state to TASK_RUNNING; note that two code paths arrive here.
     */
    p->state = TASK_RUNNING;
out:
    /**
     * Re-enable interrupts and release the run queue lock.
     */
    task_rq_unlock(rq, &flags);

    /**
     * Returns 0 when the process was not woken; otherwise returns 1, the process was woken.
     */
    return success;
}

当底层驱动收到数据后,会产生中断信号,调用 default_wake_function函数来唤醒对应的进程,唤醒后进程继续do_select来检查关心的bit位。至于驱动具体是如何通知上层的,还需要进一步学习与分析。

原文地址:https://www.cnblogs.com/jaydenhpj/p/5121030.html