Linux中的进程调度(四)

先把我们最初开始的函数写上，前面写的太长，不再看下原始代码可能回不去了。

/*
 * Share the fairness runtime between parent and child, thus the
 * total amount of pressure for CPU stays equal - new tasks
 * get a chance to run but frequent forkers are not allowed to
 * monopolize the CPU. Note: the parent runqueue is locked,
 * the child is not running yet.
 *
 * @rq: runqueue the new task is enqueued on.
 * @p:  the newly created task; its entity (p->se) gets an initial
 *      vruntime from place_entity() and is then enqueued.
 *
 * Finishes by calling resched_task() on rq->curr.
 */
static void task_new_fair(struct rq *rq, struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);
	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
	int this_cpu = smp_processor_id();

	sched_info_queued(p);

	/* Refresh the run-time statistics of the current entity first. */
	update_curr(cfs_rq);
	/* initial == 1: the "new entity" path of place_entity() is taken. */
	place_entity(cfs_rq, se, 1);

	/* 'curr' will be NULL if the child belongs to a different group */
	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
			curr && curr->vruntime < se->vruntime) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 *
		 * The swap only happens when the child's vruntime is the
		 * larger one, so afterwards the child holds the smaller value.
		 */
		swap(curr->vruntime, se->vruntime);
	}

	/* wakeup == 0: enqueue_entity() will not re-place p's own entity. */
	enqueue_task_fair(rq, p, 0);
	resched_task(rq->curr);
}

上次已经分析过update_curr的代码，现在接着往下走，去看place_entity()

/*
 * place_entity - compute and store the vruntime at which @se is placed.
 *
 * @cfs_rq:  queue whose min_vruntime serves as the starting point.
 * @se:      entity being placed; se->vruntime is overwritten on return.
 * @initial: non-zero for a newly created entity (task_new_fair() passes 1),
 *           zero on the wakeup path (enqueue_entity() passes 0).
 */
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime;

	/* Baseline: the queue's current minimum vruntime. */
	vruntime = cfs_rq->min_vruntime;

	if (sched_feat(TREE_AVG)) {
		/*
		 * Average the baseline with the vruntime of the entity
		 * returned by __pick_last_entity(), if there is one.
		 */
		struct sched_entity *last = __pick_last_entity(cfs_rq);
		if (last) {
			vruntime += last->vruntime;
			vruntime >>= 1;
		}
	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
		/* Otherwise optionally add half of one vruntime slice. */
		vruntime += sched_vslice(cfs_rq)/2;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice_add(cfs_rq, se);

	if (!initial) {
		/* sleeps up to a single latency don't count. */
		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
			vruntime -= sysctl_sched_latency;

		/* ensure we never gain time by being placed backwards. */
		vruntime = max_vruntime(se->vruntime, vruntime);
	}

	se->vruntime = vruntime;
}

其中的if else if语句块在默认情况下是都不成立的，不成立的原因很简单，初始化时就这么写的，这里就不啰嗦了。其中

if (initial && sched_feat(START_DEBIT))

在默认情况下还是成立的，所以进去看一下

/*
 * Vruntime slice computed as if @se had already been added to @cfs_rq:
 * the queue weight is increased by se->load.weight and the task count
 * by one before delegating to __sched_vslice().
 */
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return __sched_vslice(cfs_rq->load.weight + se->load.weight,
			cfs_rq->nr_running + 1);
}

注释会对我们有帮助～

/*
 * We calculate the vruntime slice.
 *
 * vs = s/w = p/rw
 *
 * @rq_weight:  total load weight used as the divisor.
 * @nr_running: number of entities, forwarded to __sched_period().
 *
 * Returns __sched_period(nr_running) * NICE_0_LOAD / rq_weight.
 *
 * NOTE(review): there is no guard against rq_weight == 0 here;
 * presumably callers guarantee a non-zero weight - confirm.
 */
static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
{
	u64 vslice = __sched_period(nr_running);

	vslice *= NICE_0_LOAD;
	/* do_div() leaves the quotient in vslice. */
	do_div(vslice, rq_weight);

	return vslice;
}

再进入到__sched_period中去

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 *
 * @nr_running: number of runnable entities to size the period for.
 *
 * Returns sysctl_sched_latency unchanged while nr_running does not exceed
 * sched_nr_latency; beyond that the period is scaled by
 * nr_running / sched_nr_latency.
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	/* Stretch the period only when the queue is over-populated. */
	if (unlikely(nr_running > nr_latency)) {
		period *= nr_running;
		do_div(period, nr_latency);
	}

	return period;
}

注释里写的已经很清楚了，如果进程太多的话，一个调度周期就被分得太细，这样纯粹调度花费的时间所占的比重就已经很大，所以要适当把调度周期放大些，放大是根据进程数量按比例来的。然后根据新调整的调度周期，将新建进程的虚拟运行时间增加一点（个人感觉这也是考虑到如果不增加的话，如果一个进程不断的fork子进程，会造成CPU垄断）。由于在调用place_entity时最后一个参数为1，所以if(!initial)不成立，直接将计算出来的虚拟时间赋给p的虚拟运行时间。这便是place_entity完成的功能。主要就是给新进程分配一个初始的虚拟运行时间，虽然它还没有运行，就像它的名字，place_entity，给新进程找到一个合适的地方，以便接下来将其放入红黑树。接下来再回到task_new_fair中去，place_entity执行完毕后进入一个if条件语句，当满足以下条件时该语句块会被执行： 1)系统定义了创建进程后子进程先执行 2)当前进程所在的CPU就是子进程所被分配到的CPU 3)curr不为空(注释中说明是如果父子进程不属于同一个调度组时curr可能为空，这个在后面会研究) 4)curr的虚拟运行时间小于新进程的虚拟运行时间。如果满足这几个条件，会将父子进程的虚拟运行时间交换，为什么要交换呢？很明显，条件中说，子进程要先运行，而此时子进程的虚拟运行时间又比父进程的虚拟运行时间大，调度器每次都会选一个虚拟运行时间最小的进程来运行，所以当然要将他们两个交换了。这一切准备好之后(主要是记录更新了当前进程的虚拟运行时间，给新创建的子进程增加了适当的虚拟运行时间），便可以将新创建的进程插入红黑树以待调度了。于是在task_new_fair里就调用了enqueue_task_fair函数。

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 *
 * @rq:     runqueue the task is being added to (not referenced directly
 *          in this body).
 * @p:      task whose entity (p->se) is enqueued.
 * @wakeup: forwarded to enqueue_entity() for p's own entity only.
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	/*
	 * Presumably for_each_sched_entity() walks from se up through its
	 * parent entities when group scheduling is enabled - confirm against
	 * the macro definition.
	 */
	for_each_sched_entity(se) {
		/* Stop at the first entity that is already queued. */
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, wakeup);
		/* Every entity after the first is enqueued with wakeup == 1. */
		wakeup = 1;
	}
}

如果在有组调度的情况下，这个函数不仅会将当前子进程加入红黑树，还会通过for_each_sched_entity继续处理其上层尚未入队的调度实体，具体我们后面再分析。总之，它的主要动作就是调用enqueue_entity:

/*
 * enqueue_entity - account for @se on @cfs_rq and insert it into the tree.
 *
 * @cfs_rq: queue receiving the entity.
 * @se:     entity to enqueue.
 * @wakeup: non-zero re-places the entity via place_entity(..., 0) and
 *          calls enqueue_sleeper() before it is queued.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	if (wakeup) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	/* The entity that is cfs_rq->curr is not inserted into the tree. */
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	account_entity_enqueue(cfs_rq, se);
}

在做了一些必要的更新信息后，主要动作是调用__enqueue_entity

/*
 * Enqueue an entity into the rb-tree:
 *
 * @cfs_rq: queue owning the tree (tasks_timeline) and the rb_leftmost cache.
 * @se:     entity to insert; its run_node is linked into the tree.
 *
 * Entities are ordered by entity_key(); an entity whose key equals an
 * existing one is placed to the right of it.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	s64 key = entity_key(cfs_rq, se);
	/* Stays 1 only if the descent never goes right. */
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (key < entity_key(cfs_rq, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	/* Link the node at the slot found above, then rebalance. */
	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

这个函数很容易读懂：先是找到要插入的位置（如果插入位置是最左边的结点，那么需要更新rb_leftmost这个缓存)，然后链入结点，并调整结点颜色以保证红黑树的性质，以防子树高度过于不平衡。具体算法可参见算法导论，很容易懂。不过在红黑树的结点表示上，这里用到了一个技巧：把所有结点的地址按4字节对齐，这样，每个结点的首地址最后两个比特位肯定为零，由于红黑树的颜色状态只有两个，要么红要么黑，所以编码者就利用了最低位来表示当前结点的颜色。这种节省内存的方式很巧妙～～将进程插入红黑树后，就可以通过内核在系统调用返回用户空间时进行一次调度了。

	enqueue_task_fair(rq, p, 0);
	resched_task(rq->curr);

进入resched_task中去：

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 *
 * @p: task to flag; the lock of p's runqueue must already be held
 *     (asserted below, not taken here).
 *
 * NOTE(review): the #else/#endif matching CONFIG_SMP lies outside
 * this excerpt.
 */
#ifdef CONFIG_SMP

/* Fallback definition when tsk_is_polling is not already provided. */
#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif

static void resched_task(struct task_struct *p)
{
	int cpu;

	assert_spin_locked(&task_rq(p)->lock);

	/* Already flagged: nothing more to do. */
	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
		return;

	set_tsk_thread_flag(p, TIF_NEED_RESCHED);

	/* A task on this CPU needs no cross-CPU notification. */
	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	/*
	 * Presumably a polling task notices the flag by itself, so the
	 * reschedule request is only sent otherwise - confirm.
	 */
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

首先用assert_spin_locked确认调用者已经持有该进程所在运行队列的锁（这里只是检查，并不加锁）。

if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
		return;

这一句很有意思，如果当前进程的TIF_NEED_RESCHED标志已经置位，那么便可以直接返回了，test_tsk_thread_flag是怎么工作的呢？一步步往下深入：

 /*
  * Test thread flag @flag of task @tsk by forwarding to
  * test_ti_thread_flag() on the task's thread_info.
  */
 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
 {
     return test_ti_thread_flag(task_thread_info(tsk), flag);
 }

  /*
   * Test bit number @flag in ti->flags via test_bit() and return
   * its result.
   */
  static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
  {
      return test_bit(flag,&ti->flags);
  }

 /*
  * test_bit - test bit @nr of the bitmap at @addr.
  *
  * Dispatches at compile time: when @nr is a compile-time constant
  * (__builtin_constant_p) constant_test_bit() is used, otherwise
  * variable_test_bit(). Each continuation line must end in a
  * backslash so the whole expression belongs to the macro body.
  */
 #define test_bit(nr,addr) \
 (__builtin_constant_p(nr) ? \
  constant_test_bit((nr),(addr)) : \
  variable_test_bit((nr),(addr)))

当前，flag标志应该是一个变量,（谁说的C语言里面没有多态？),接着往里面走：

 /*
  * variable_test_bit - test bit @nr of the bitmap at @addr when @nr is
  * not a compile-time constant.
  *
  * btl copies the selected bit into the carry flag; sbbl %0,%0 then
  * computes oldbit - oldbit - CF, so the result is 0 when the bit is
  * clear and -1 (non-zero) when it is set. The two instructions are
  * separated by "\n\t" inside the asm template.
  *
  * ADDR is expected to be provided by the surrounding header.
  */
 static inline int variable_test_bit(int nr, const volatile unsigned long * addr)
 {
     int oldbit;

     __asm__ __volatile__(
         "btl %2,%1\n\tsbbl %0,%0"
         :"=r" (oldbit)
         :"m" (ADDR),"Ir" (nr));
     return oldbit;
 }

这里是汇编语句。btl的功能是测试某个数的特定位是0还是1，测试结果放到CF标志位里（具体可见intel的用户手册）。然后，sbbl %0, %0，也就是说，让oldbit与oldbit的值带借位相减，即oldbit-oldbit-CF，如果CF标志位是0，也就是刚才位测试的结果是0的话，当然最后返回的oldbit也是0了；如果CF标志位是1，那么返回的就不一样了，返回的值为-1，多么巧妙。如果经测试，TIF_NEED_RESCHED还没有置位，那么就将其置位。

      set_tsk_thread_flag(p, TIF_NEED_RESCHED);

 /*
  * Set thread flag @flag of task @tsk by forwarding to
  * set_ti_thread_flag() on the task's thread_info.
  */
 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
 {
     set_ti_thread_flag(task_thread_info(tsk), flag);
 }

  /* Set bit number @flag in ti->flags via set_bit(). */
  static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
  {
      set_bit(flag,&ti->flags);
  }

这里所有的涉及平台相关的汇编指令全是以Intel x86为例

  /*
   * set_bit - set bit @nr of the bitmap at @addr using btsl.
   *
   * The memory operand is read-write ("+m"); @nr may be an immediate
   * or a register ("Ir"). LOCK_PREFIX presumably expands to the x86
   * lock prefix so the update is atomic - confirm against its definition.
   * ADDR is expected to be provided by the surrounding header.
   */
  static inline void set_bit(int nr, volatile unsigned long * addr)
  {
      __asm__ __volatile__( LOCK_PREFIX
          "btsl %1,%0"
          :"+m" (ADDR)
          :"Ir" (nr));
  }

btsl就不说了，这个指令的功能就是置位，而且是原子的。置好位后,resched_task的任务只剩下对多处理器存在的情况下任务的处理了，先留着，把整个调度过程走一遍再回过头来分析。