Linux中的进程调度(三)

上次主要分析了进入wake_up_new_task()后，调用effective_prio来获得调度器所关心的优先级，现在接着往下走。在执行完effective_prio后，会执行以下语句：

	p->prio = effective_prio(p);

	if (!p->sched_class->task_new || !current->se.on_rq) {
		activate_task(rq, p, 0);
	} else {
		/*
		 * Let the scheduling class do new task startup
		 * management (if any):
		 */
		p->sched_class->task_new(rq, p);
		inc_nr_running(p, rq);
	}
	check_preempt_curr(rq, p);
	task_rq_unlock(rq, &flags);

!p->sched_class->task_new的情况只有在当前进程为idle进程时才会发生（可以去查看sched_idletask.c，该调度类的task_new函数就为空），这里我们先不管它，直接进入else语句块。注意到sched_fair.c里面有如下代码：

/*
 * All the scheduling class methods:
 *
 * Method table of the fair scheduling class. Every hook is bound to its
 * *_fair handler, except check_preempt_curr, which is bound to
 * check_preempt_wakeup. The .next member points at idle_sched_class.
 */
static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

	/* The load-balancing hooks are only populated when CONFIG_SMP is set. */
#ifdef CONFIG_SMP
	.load_balance		= load_balance_fair,
	.move_one_task		= move_one_task_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	/* Hook invoked as p->sched_class->task_new(rq, p) for a new task. */
	.task_new		= task_new_fair,
};

也就是说，对于sched_fair调度类，它的task_new函数就是task_new_fair。首先，调用sched_class->task_new函数来将新进程加入可运行队列。深入这个函数的源码：

/*
 * Share the fairness runtime between parent and child, thus the
 * total amount of pressure for CPU stays equal - new tasks
 * get a chance to run but frequent forkers are not allowed to
 * monopolize the CPU. Note: the parent runqueue is locked,
 * the child is not running yet.
 *
 * @rq: runqueue the new task is being added to
 * @p:  the newly created task
 *
 * Steps: bring the running entity's accounting up to date, place the
 * child's scheduling entity, optionally swap vruntime with the running
 * entity, enqueue the child and ask for a reschedule.
 */
static void task_new_fair(struct rq *rq, struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);
	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
	int this_cpu = smp_processor_id();

	sched_info_queued(p);

	/* Refresh the running entity's runtime before placing the child. */
	update_curr(cfs_rq);
	/*
	 * NOTE(review): place_entity() is not shown here; presumably it picks
	 * the child's starting vruntime and the final argument (1) marks this
	 * as an initial placement - confirm against its definition.
	 */
	place_entity(cfs_rq, se, 1);

	/* 'curr' will be NULL if the child belongs to a different group */
	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
			curr && curr->vruntime < se->vruntime) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 *
		 * After the swap the child holds the smaller of the two
		 * vruntime values.
		 */
		swap(curr->vruntime, se->vruntime);
	}

	/* Queue the child, then flag the task currently on rq for rescheduling. */
	enqueue_task_fair(rq, p, 0);
	resched_task(rq->curr);
}

先是通过smp_processor_id()获得当前内核运行在哪个CPU上（其具体实现是在系统开机时划定一个区域，专门存放per_cpu变量，细节以后再研究）。然后，调用sched_info_queued(p)。如果CONFIG_SCHEDSTATS和CONFIG_TASK_DELAY_ACCT都没有定义，这个函数是一个空函数；如果定义了CONFIG_SCHEDSTATS或者CONFIG_TASK_DELAY_ACCT，那么这个函数如下：

/*
 * Stamp the task with the clock of its runqueue at the moment it is queued.
 *
 * The timestamp is written only when sched_info_on() is true and
 * t->sched_info.last_queued is still zero, so an existing stamp is never
 * overwritten.
 */
static inline void sched_info_queued(struct task_struct *t)
{
	if (unlikely(sched_info_on()))
		if (!t->sched_info.last_queued)
			t->sched_info.last_queued = task_rq(t)->clock;
}

也就是在last_queued尚未设置时，把进程所在运行队列的当前时钟记录为该进程的入队时间（last_queued）。记录这个究竟有什么用呢？Google上找了一下，很少有讲，不过有看到如下说明： If you say Y here, additional code will be inserted into the scheduler and related routines to collect statistics about scheduler behavior and provide them in /proc/schedstat. These stats may be useful for both tuning and debugging the scheduler. If you aren't debugging the scheduler or trying to tune a specific application, you can say N to avoid the very slight overhead this adds. 也就是说，这些统计信息只是用来调试调度器，或者针对某个特定应用做调优。那我们这里不管它。接着刚才的路线往下走，会继续调用update_curr(cfs_rq)，进入到这个函数中去看看究竟做了些什么：

/*
 * Charge the time elapsed since curr->exec_start to the entity currently
 * running on @cfs_rq, then restart the measurement from the present clock.
 * Returns immediately when cfs_rq->curr is NULL.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	/* Time base is the clock of the runqueue that owns this cfs_rq. */
	u64 now = rq_of(cfs_rq)->clock;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);

	/* Fold the elapsed time into curr's statistics and vruntime. */
	__update_curr(cfs_rq, curr, delta_exec);
	/* The next call measures from this point onwards. */
	curr->exec_start = now;

	/* Only entities that are tasks are passed to cpuacct_charge(). */
	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		cpuacct_charge(curtask, delta_exec);
	}
}

先看 u64 now = rq_of(cfs_rq)->clock; cfs_rq不已经是一个运行队列了么？怎么还能rq_of? 原来，为了不只是在单纯的进程之间平衡调度，还需要考虑不同用户之间的进程调度保持相对平衡，不能发生一个用户的进程大量得到执行，而另一个用户的进程很少得到调度执行的情况，所以，新版本的内核又对若干进程进行了分组，各个平行的组之间也要保持相对平衡。所以，在cfs_rq上还会有一层数据结构来维持这个操作（这个数据结构每个CPU只有一个，具体维持方法随着代码的深入再做分析）。然后，计算delta_exec的值，注意这里使用的是rq的clock减去当前sched_entity(也就是当前进程)的开始运行时间。而且注意到上面的rq_of在不同的预编译选项下，行为是不一样的，而这个预编译条件正好是CONFIG_FAIR_GROUP_SCHED，所以，个人认为这里还藏着些什么，先把整个过程走一遍，等回过头来再慢慢推敲。计算完delta_exec，也就是实际的运行时间(这个时间不是虚拟意义上的时间)后，将其作为参数，调用__update_curr：

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 *
 * @cfs_rq:     queue whose min_vruntime is refreshed
 * @curr:       entity being charged
 * @delta_exec: real (unscaled) time to charge to @curr
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;
	u64 vruntime;

	/* Statistics only: track the largest single delta seen so far. */
	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

	/* sum_exec_runtime accumulates the unscaled delta. */
	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	/*
	 * vruntime advances by the raw delta when the entity's weight equals
	 * NICE_0_LOAD; for any other weight the delta is first passed through
	 * calc_delta_fair().
	 */
	delta_exec_weighted = delta_exec;
	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
							&curr->load);
	}
	curr->vruntime += delta_exec_weighted;

	/*
	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
	 * value tracking the leftmost vruntime in the tree.
	 */
	/*
	 * NOTE(review): first_fair() is not shown here; presumably it is
	 * non-zero when the tree has a leftmost entity - confirm.
	 */
	if (first_fair(cfs_rq)) {
		vruntime = min_vruntime(curr->vruntime,
				__pick_next_entity(cfs_rq)->vruntime);
	} else
		vruntime = curr->vruntime;

	/* Combined with the previous value so min_vruntime never moves back. */
	cfs_rq->min_vruntime =
		max_vruntime(cfs_rq->min_vruntime, vruntime);
}

代码注释中说是重新统计一下当前进程的vruntime，具体是怎么统计的？ schedstat_set和上面讲到的编译调度器信息有关，也是由预编译选项决定，对调度器的功能上没有什么作用，不去管它。将curr的sum_exec_runtime加上delta_exec，这里全都是实际时间，也就是有的地方所讲的wall clock。同样，schedstat_add()做调度器debug和编译信息用，不去管它。下面就要开始比较重要的工作了。如果当前进程的weight等于nice值为零时的weight，虚拟时间和实际时间重合(也就是说虚拟时间以nice值为零，也就是优先级是120的进程为准，然后做一定伸缩)。if语句里的unlikely是一个宏，将这个宏展开后是gcc编译器的一个优化手法，总的来说，就是可以当作unlikely并不存在，但是，这个语句指出，这个if条件在大多数情况下是不成立的，编译时可以将此语句尽量往后放，少做出一些分支判断，从而加快运行速度。（顺便运行了一下top，发现确实如此，系统中绝大多数进程的NICE值都为0，也就是说，那个if语句确实会很少被执行到）那万一当前进程的优先级不是120呢，那就只好进入if语句块，去执行calc_delta_fair了，这个函数的源码如下：

/*
 * Scale @delta_exec for an entity with load @lw by delegating to
 * calc_delta_mine() with NICE_0_LOAD as the weight argument.
 */
static inline unsigned long
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}

/*
 * Scale @delta_exec by @weight and by the cached inverse of lw->weight.
 *
 * NOTE(review): this appears to approximate delta_exec * weight / lw->weight
 * with a fixed-point reciprocal; WMULT_CONST, WMULT_SHIFT and SRR() are
 * defined elsewhere - confirm against their definitions.
 *
 * The result is clamped to LONG_MAX before being returned.
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/* Compute and cache the inverse weight the first time it is needed. */
	if (unlikely(!lw->inv_weight))
		lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	/*
	 * For a large product the SRR() step is applied in two halves of
	 * WMULT_SHIFT/2, one before and one after multiplying by inv_weight;
	 * otherwise a single SRR() by WMULT_SHIFT follows the multiplication.
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}

这里的代码比较难懂，推了会也没从原理上推出来，而且好像用到了好几个近似值，不过可以代入实际数据进行验证，至于编码者是怎么想到这样写我就不清楚了。总结下，它的作用就是返回一个值，这个值等于：实际运行时间＊（NICE值为0时的weight/当前进程的weight）。从这里可以看出，如果一个进程的weight越大（其实就是优先级数值越小、NICE值越小，即优先级越高），那么，返回的值就越小，也就是说，对于相同的实际执行时间，最后落实到进程的虚拟运行时间上时，优先级高的落实的少，优先级低的落实的多。而调度器就是根据虚拟运行时间的多少进行调度的，所以说，优先级越高，实际获得的运行时间就越长，虽然他们的虚拟运行时间看起来是一样的。在完成了这最重要的一步后，__update_curr还做了另外一件事，就是更新cfs_rq中的min_vruntime值(这个值我们下面会看到它的作用)，这个值取的是：当前进程的虚拟运行时间与cfs_rq所对应的红黑树的最左节点所对应进程的虚拟运行时间之间的较小值　和　原有的min_vruntime　之间的　较大值。如果当前进程一直在创建子进程的话，那么curr->vruntime会不断增长，从而vruntime会取到红黑树中最左孩子的值，然后再取它和原有的min_vruntime之间较大者，不会使min_vruntime一直增长，从而造成这个进程对系统资源的垄断。以上便是对update_curr(cfs_rq)函数的分析。总的来说，它的作用就是根据当前进程的运行时间，结合它的权重，更新它的虚拟运行时间，顺便设置了cfs_rq中min_vruntime的值，为下一步加入新进程做好准备。