poll系统调用的内核态实现机制分析

版权所有，转载请标明出处 All right reserved，Copyright by 徐行而至浅唱而归

前面已经比较详尽的分析了系统调用引发的内核执行过程，本文将继续分析一下linux2.6.38内核源码中poll函数（与select实现类似）的实现。

通过前面的分析，我们知道，应用程序中的open、read、write函数系统调用都会触发软中断异常，从而进入异常处理，在异常处理中将会获取用户态传入的系统调用号，根据系统调用号在系统调用表中索引出实际的系统调用处理函数，如内核里的sys_open、sys_read、sys_write函数，而内核里的这些函数又会对应到驱动程序里的open、read、write函数。

poll机制也不例外，用户空间里调用poll函数或者select函数时，都会调用到内核空间的sys_poll或者sys_select函数。下面以sys_poll来分析用户空间poll的实现：

用户空间调用:poll
	内核: asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,long timeout);
	即SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,long, timeout_msecs)         //以前分析过，实际为宏

其实现如下：

fs/select.c

/*
 * sys_poll: kernel entry point for the poll() system call.
 *
 * @ufds:          user-space array of struct pollfd to monitor
 * @nfds:          number of entries in @ufds
 * @timeout_msecs: timeout in milliseconds; a negative value means
 *                 "no timeout" (do_sys_poll() is given a NULL timespec)
 *
 * Returns whatever do_sys_poll() returns, except that -EINTR is turned
 * into -ERESTART_RESTARTBLOCK after the restart block has been filled in
 * so that the call can be resumed through do_restart_poll().
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		long, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

	/* Only build a timespec when the caller supplied a timeout. */
	if (timeout_msecs >= 0) {
		to = &end_time;
		/* Split the millisecond value into seconds + nanoseconds. */
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		/*
		 * Interrupted: record the arguments in the current thread's
		 * restart block so do_restart_poll() can re-issue the call.
		 */
		restart_block = &current_thread_info()->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			/* Save the timespec computed above for the restart. */
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

为了使结构简要直观，我们只列出调用关系框架：

用户空间:poll
	内核: asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,long timeout);
	即SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,long, timeout_msecs)         //以前分析过，实际为宏
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));//配置超时时间
		do_sys_poll(ufds, nfds, to);
			poll_initwait(&table);//初始化等待队列
				init_poll_funcptr(&pwq->pt, __pollwait);
					pt->qproc = qproc; /*table->pt->qproc= __pollwait;详见注释1-2*/
			do_poll(nfds, head, &table, end_time);
				for (;;)
				{
					for (; pfd != pfd_end; pfd++)//依次遍历待监测的每个文件描述符(pollfd)
					{
						if (do_pollfd(pfd, pt))        /*do_pollfd会调用驱动poll函数，poll里面的poll_wait最终调用pt->qproc函数（即__pollwait）将进程中可能引起待监测内容状态变化的等待队列链表头挂载到查询表中，除此以外，poll还会根据条件判断事件是否发生，发生则返回真，详见注释1-1*/
						{
							count++;       //记录已有事件发生的文件描述符个数
							 pt = NULL;   
						}
					}
					if (!count)   /*若count为0(表示无等待的事件发生)*/
					{
						count = wait->error;
						if (signal_pending(current))     /*判断是否为信号唤醒*/
						count = -EINTR;
					}
				/*若count不为0（有等待的事件发生、出错或被信号打断）或者timed_out不为0（已超时），则退出循环，不再休眠*/
					if (count || timed_out)                                                               
						break;
				/*上述条件不满足时进入休眠，若有等待的事件发生了，超时或收到信号则唤醒*/
					if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
						timed_out = 1;
				}
			poll_freewait(&table); //清除poll_wqueues占用的资源，包括加入到查询表中的等待队列链表头

注释1-1：

do_pollfd(pfd, pt)

mask = file->f_op->poll(file, pwait);/*调用驱动程序中的poll函数，若poll驱动函数返回值不为0则会使count++*/

调用关系如下：

do_pollfd(pfd, pt)
	mask = file->f_op->poll(file, pwait); //即调用驱动中的poll函数（在写驱动程序时编写poll函数）
		poll_wait //其作用为挂载当前进程中可能引起待监测内容状态变化的等待队列链表头到poll_table查询表中，具体实现如下
			p->qproc(filp, wait_address, p);（p->qproc = __pollwait前面已有初始化）
			__pollwait(struct file *filp, wait_queue_head_t *wait_address,poll_table *p)（上面展开即为本函数调用）
			{
				entry->wait_address = wait_address; //挂载进程等待队列链表头到查询表
				add_wait_queue(wait_address, &entry->wait);
			}

除此以外，poll驱动函数还会根据条件判断事件是否发生，发生则返回真，退出休眠；

下面是一个poll驱动函数的例子：

/*
 * Example driver poll handler.
 *
 * Hands button_waitq to poll_wait() together with the caller's poll
 * table, then reports readiness: POLLIN | POLLRDNORM when ev_press is
 * non-zero, 0 otherwise.
 */
static unsigned forth_drv_poll(struct file *file, poll_table *wait)
{
	/* Add the button_waitq wait-queue head to the poll table. */
	poll_wait(file, &button_waitq, wait);

	/* No event pending: nothing to report. */
	if (!ev_press)
		return 0;

	/* Event pending: data is readable. */
	return POLLIN | POLLRDNORM;
}

注释1-2：

1. 此处主要目的为初始化函数指针，使其指向__pollwait函数，在poll驱动函数中将调用poll_wait，而该函数中将调用__pollwait函数将当前进程挂到等待队列中。

2. __pollwait函数中有这么两句：entry->wait_address = wait_address; add_wait_queue(wait_address, &entry->wait);用于将当前进程中可能引起监测内容变化的等待队列链表头挂到查询表中，table->pt->qproc= __pollwait就是初始化table->pt->qproc函数指针。

从上述分析可知，poll即使只有一个描述符就绪，也要遍历整个集合。如果集合中活跃的描述符很少，遍历过程的开销就会变得很大，而如果集合中大部分的描述符都是活跃的，遍历过程的开销又可以忽略。因此集合中大部分的描述符都是活跃的情况下，poll的使用效率高。每次用户空间调用poll函数时，都要将全部的数据复制到内核，复制数据增加了开销。

本文参考：http://watter1985.iteye.com/blog/1614039

poll系统调用的内核态实现机制分析

版权所有，转载请标明出处 All right reserved，Copyright by 徐行而至 浅唱而归

版权所有，转载请标明出处 All right reserved，Copyright by 徐行而至浅唱而归