Process Scheduling in Linux (Part 6)

From here on we analyze the policies related to load balancing.
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	spin_lock(&rq->lock);
	update_rq_clock(rq);
	update_cpu_load(rq);
	curr->sched_class->task_tick(rq, curr, 0);
	spin_unlock(&rq->lock);

#ifdef CONFIG_SMP
	rq->idle_at_tick = idle_cpu(cpu);// check whether this cpu's runqueue is effectively empty (only the idle task)
	trigger_load_balance(rq, cpu);
#endif
}
As we can see, at the end of every timer-tick the scheduler checks whether a round of load balancing is needed. Let's step into trigger_load_balance; the name alone gives a rough idea of what it does.
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing,
 * if the whole system is idle.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
	/*
	 * If we were in the nohz mode recently and busy at the current
	 * scheduler tick, then check if we need to nominate new idle
	 * load balancer.
	 */
	if (rq->in_nohz_recently && !rq->idle_at_tick) {
		rq->in_nohz_recently = 0;

		if (atomic_read(&nohz.load_balancer) == cpu) {
			cpumask_clear_cpu(cpu, nohz.cpu_mask);
			atomic_set(&nohz.load_balancer, -1);
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			/*
			 * simple selection for now: Nominate the
			 * first cpu in the nohz list to be the next
			 * ilb owner.
			 *
			 * TBD: Traverse the sched domains and nominate
			 * the nearest cpu in the nohz.cpu_mask.
			 */
			int ilb = cpumask_first(nohz.cpu_mask);

			if (ilb < nr_cpu_ids)
				resched_cpu(ilb);
		}
	}

	/*
	 * If this cpu is idle and doing idle load balancing for all the
	 * cpus with ticks stopped, is it time for that to stop?
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
		resched_cpu(cpu);
		return;
	}

	/*
	 * If this cpu is idle and the idle load balancing is done by
	 * someone else, then no need raise the SCHED_SOFTIRQ
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
	    cpumask_test_cpu(cpu, nohz.cpu_mask))
		return;
#endif
	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);
}
Ignoring the CONFIG_NO_HZ parts, this function simply checks whether the current jiffies value has reached rq->next_balance; if it has, raise_softirq is called to raise a softirq. Raising one is simple: the bit corresponding to SCHED_SOFTIRQ is set in the pending mask, and when softirqs are processed each bit is checked and, if set, the corresponding handler runs. Searching the source with cscope turns up the following:
 #ifdef CONFIG_SMP
     open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 #endif
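As a rough illustration of the raise/dispatch pattern just described (this is not the kernel's actual softirq implementation; the pending mask, handler table and names below are simplified stand-ins), here is a minimal userspace sketch in C:

#include <stdio.h>

/* Hypothetical softirq numbers, loosely mirroring the kernel's enum. */
enum { TIMER_SOFTIRQ, SCHED_SOFTIRQ, NR_SOFTIRQS };

static unsigned int pending;                    /* one bit per softirq       */
static void (*handlers[NR_SOFTIRQS])(void);     /* filled by open_softirq()  */

static void open_softirq(int nr, void (*fn)(void)) { handlers[nr] = fn; }
static void raise_softirq(int nr)                  { pending |= 1u << nr;  }

/* Called "later" (in the kernel: on interrupt exit or in ksoftirqd). */
static void do_softirq(void)
{
	for (int nr = 0; nr < NR_SOFTIRQS; nr++)
		if (pending & (1u << nr)) {
			pending &= ~(1u << nr);
			handlers[nr]();
		}
}

static void run_rebalance_domains(void) { puts("rebalance_domains() would run here"); }

int main(void)
{
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
	raise_softirq(SCHED_SOFTIRQ);   /* what trigger_load_balance() does     */
	do_softirq();                   /* handler runs only if the bit is set  */
	return 0;
}

The real kernel keeps the pending mask per CPU and runs the handlers on interrupt exit or in ksoftirqd, but the bit-set/bit-test structure is the same.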
So this softirq is registered specifically for SMP configurations. Following this lead, let's look at the implementation of run_rebalance_domains.
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	enum cpu_idle_type idle = this_rq->idle_at_tick ?
						CPU_IDLE : CPU_NOT_IDLE;

	rebalance_domains(this_cpu, idle);

#ifdef CONFIG_NO_HZ
	/*
	 * If this cpu is the owner for idle load balancing, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */
	if (this_rq->idle_at_tick &&
	    atomic_read(&nohz.load_balancer) == this_cpu) {
		struct rq *rq;
		int balance_cpu;

		for_each_cpu(balance_cpu, nohz.cpu_mask) {
			if (balance_cpu == this_cpu)
				continue;

			/*
			 * If this cpu gets work to do, stop the load balancing
			 * work being done for other cpus. Next load
			 * balancing owner will pick it up.
			 */
			if (need_resched())
				break;

			rebalance_domains(balance_cpu, CPU_IDLE);

			rq = cpu_rq(balance_cpu);
			if (time_after(this_rq->next_balance, rq->next_balance))
				this_rq->next_balance = rq->next_balance;
		}
	}
#endif
}
Ignoring CONFIG_NO_HZ again, this function merely derives the idle argument from the current CPU's state (running the idle task or something else) and then calls rebalance_domains.
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;
	cpumask_var_t tmp;

	/* Fails alloc?  Rebalancing probably not a priority right now. */
	if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
		return;

	for_each_domain(cpu, sd) {// for the domain containing this cpu and each of its parent domains
		if (!(sd->flags & SD_LOAD_BALANCE))// skip domains that explicitly opt out of load balancing
			continue;

		interval = sd->balance_interval;// the domain's balancing period
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;// stretch the period when this cpu is busy

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);// convert milliseconds to jiffies
		if (unlikely(!interval))
			interval = 1;
		if (interval > HZ*NR_CPUS/10)// clamp the interval
			interval = HZ*NR_CPUS/10;

		need_serialize = sd->flags & SD_SERIALIZE;

		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {// time to actually balance
			if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
				/*
				 * We've pulled tasks over so either we're no
				 * longer idle, or one of our SMT siblings is
				 * not idle.
				 */
				idle = CPU_NOT_IDLE;// the kernel comment above explains this
			}
			sd->last_balance = jiffies;// record when we last balanced
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {// work out when the next balancing pass is due
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;

	free_cpumask_var(tmp);
}
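To make the interval arithmetic above concrete, here is a small standalone sketch; HZ, NR_CPUS, balance_interval and busy_factor are assumed example values rather than values from any real configuration, and msecs_to_jiffies is re-implemented just for the demo:

#include <stdio.h>

#define HZ      250            /* assumed tick rate  */
#define NR_CPUS 8              /* assumed CPU count  */

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;          /* round up */
}

int main(void)
{
	unsigned long balance_interval = 64;    /* ms, typical sd default   */
	unsigned int  busy_factor      = 32;    /* typical sd->busy_factor  */
	int           cpu_busy         = 1;     /* idle != CPU_IDLE         */

	unsigned long interval = balance_interval;
	if (cpu_busy)
		interval *= busy_factor;        /* busy CPUs balance less often */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)       /* upper clamp */
		interval = HZ * NR_CPUS / 10;

	printf("effective balance interval: %lu jiffies (%lu ms)\n",
	       interval, interval * 1000 / HZ);
	return 0;
}

With these numbers a busy CPU rebalances this domain only about every 800 ms (after the clamp), while an idle one would use the raw 64 ms period.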
The key function is load_balance.
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance, struct cpumask *cpus)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;

	cpumask_setall(cpus);// start with every cpu set in the candidate mask

	/*
	 * When power savings policy is enabled for the parent domain, idle
	 * sibling can pick up load irrespective of busy siblings. In this case,
	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
	 * portraying it as CPU_NOT_IDLE.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))// on a plain SMP machine SD_SHARE_CPUPOWER (an SMT property) is never set
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);// update statistics

redo:
	update_shares(sd);// refresh the shares of every task group taking part in scheduling within this domain
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);// find the busiest scheduling group in this domain

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(all_pinned)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

			spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the migration_thread, if the curr
			 * task on busiest cpu can't be moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					      &busiest->curr->cpus_allowed)) {
				spin_unlock_irqrestore(&busiest->lock, flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			spin_unlock_irqrestore(&busiest->lock, flags);
			if (active_balance)
				wake_up_process(busiest->migration_thread);

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;
	else
		ld_moved = 0;
out:
	if (ld_moved)
		update_shares(sd);
	return ld_moved;
}

update_shares is worth a closer look.
static void update_shares(struct sched_domain *sd)
{
	u64 now = cpu_clock(raw_smp_processor_id());
	s64 elapsed = now - sd->last_update;

	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {// double-check: only if enough time has passed do we refresh the shares of every task group in this domain
		sd->last_update = now;
		walk_tg_tree(tg_nop, tg_shares_up, sd);
	}
}

If the group shares in this domain do need refreshing, walk_tg_tree is called to do it. tg_nop and tg_shares_up are two function pointers: tg_nop does nothing here, while tg_shares_up performs the actual update.
/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	rcu_read_lock();
	parent = &root_task_group;
down:
	ret = (*down)(parent, data);
	if (ret)
		goto out_unlock;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret)
		goto out_unlock;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out_unlock:
	rcu_read_unlock();

	return ret;
}
The code is a little hard to read; it is easier to draw the tree on paper and trace a pass by hand. In effect it walks the groups bottom-up and left-to-right, updating each group's shares in turn; the actual update happens in tg_shares_up.
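If drawing it on paper is inconvenient, the toy program below (a plain recursive re-implementation, not the kernel's goto-based one, using an invented three-node group tree) prints the visiting order, which makes the role of the down and up callbacks easy to see:

#include <stdio.h>

struct tg {
	const char *name;
	struct tg  *children[4];
};

/* Same contract as walk_tg_tree(): call @down when first entering a node,
 * @up when leaving it for the final time. */
static void walk(struct tg *tg, void (*down)(struct tg *), void (*up)(struct tg *))
{
	down(tg);
	for (int i = 0; tg->children[i]; i++)
		walk(tg->children[i], down, up);
	up(tg);
}

static void tg_nop(struct tg *tg)       { (void)tg; }                          /* stand-in for tg_nop()       */
static void tg_shares_up(struct tg *tg) { printf("update %s\n", tg->name); }   /* stand-in for tg_shares_up() */

int main(void)
{
	struct tg a = { "root/A", { 0 } }, b = { "root/B", { 0 } };
	struct tg root = { "root", { &a, &b, 0 } };

	walk(&root, tg_nop, tg_shares_up);   /* prints root/A, root/B, then root */
	return 0;
}

It prints the two children first and the root last, which is exactly the bottom-up order that tg_shares_up relies on.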
The kernel's own comment also spells out this bottom-up traversal:
/*
 * Re-compute the task group their per cpu shares over the given domain.
 * This needs to be done in a bottom-up fashion because the rq weight of a
 * parent group depends on the shares of its child groups.
 */
static int tg_shares_up(struct task_group *tg, void *data)
{
	unsigned long weight, rq_weight = 0;
	unsigned long shares = 0;
	struct sched_domain *sd = data;
	int i;

	for_each_cpu(i, sched_domain_span(sd)) {// for every cpu spanned by this domain
		/*
		 * If there are currently no tasks on the cpu pretend there
		 * is one of average load so that when a new task gets to
		 * run here it will not get delayed by group starvation.
		 */
		weight = tg->cfs_rq[i]->load.weight;// weight of this group's runqueue on cpu i; these are summed over the domain below
		if (!weight)// no load on this cpu: pretend there is one task of average weight (see the next statement and the comment above)
			weight = NICE_0_LOAD;

		tg->cfs_rq[i]->rq_weight = weight;// note: this is the cfs_rq's rq_weight
		rq_weight += weight;// accumulate the total rq_weight
		shares += tg->cfs_rq[i]->shares;// sum the group's shares over this domain's cpus
	}
	// clamp the result
	if ((!shares && rq_weight) || shares > tg->shares)
		shares = tg->shares;

	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))// on plain SMP this condition holds,
		shares = tg->shares;// so shares simply becomes the group's own shares value

	for_each_cpu(i, sched_domain_span(sd))// with the statistics gathered, loop over the cpus again, this time applying the update
		update_group_shares_cpu(tg, i, shares, rq_weight);

	return 0;
}
Next, look at update_group_shares_cpu.
/*
 * Calculate and set the cpu's group shares.
 */
/*
 * Note the parameters: tg is the group reached during the tree walk above, cpu
 * is cpu i of the domain, sd_shares is the group's shares value, and
 * sd_rq_weight is the sum of the group's runqueue weights over all cpus in the
 * domain.
 */
static void
update_group_shares_cpu(struct task_group *tg, int cpu,
			unsigned long sd_shares, unsigned long sd_rq_weight)
{
	unsigned long shares;
	unsigned long rq_weight;

	if (!tg->se[cpu])
		return;

	rq_weight = tg->cfs_rq[cpu]->rq_weight;

	/*
	 *           Sum shares * rq_weight
	 * shares =  -----------------------
	 *               Sum rq_weight
	 *
	 */
	shares = (sd_shares * rq_weight) / sd_rq_weight;// the kernel comment above already spells this out
	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);// clamp the result

	if (abs(shares - tg->se[cpu]->load.weight) >
			sysctl_sched_shares_thresh) {// to avoid churning, only apply the update when the change exceeds a tunable threshold
		struct rq *rq = cpu_rq(cpu);
		unsigned long flags;

		spin_lock_irqsave(&rq->lock, flags);
		tg->cfs_rq[cpu]->shares = shares;

		__set_se_shares(tg->se[cpu], shares);// in the end the shares value is applied to the group's "se" (scheduler entity)
		spin_unlock_irqrestore(&rq->lock, flags);
	}
}
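A quick numeric check of that formula, using made-up weights: two CPUs in the domain and a group whose tg->shares is 1024. The MIN_SHARES/MAX_SHARES bounds below are illustrative assumptions.

#include <stdio.h>

#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)   /* assumed bounds, for illustration only */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long sd_shares    = 1024;            /* the group's tg->shares            */
	unsigned long rq_weight[2] = { 3072, 1024 };  /* per-cpu cfs_rq weights (made up)  */
	unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];

	for (int cpu = 0; cpu < 2; cpu++) {
		unsigned long shares = sd_shares * rq_weight[cpu] / sd_rq_weight;
		shares = clamp_ul(shares, MIN_SHARES, MAX_SHARES);
		printf("cpu%d gets %lu of %lu shares\n", cpu, shares, sd_shares);
	}
	return 0;
}

CPU 0, carrying three quarters of the group's runqueue weight, ends up with 768 of the 1024 shares, and CPU 1 with 256.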

Note what this calculation means: the cpus spanned by the domain divide the group's shares value among themselves in proportion to load. Whichever cpu carries a larger fraction of the group's total load gets a larger slice of the shares, hence a higher weight and more CPU time when entities are picked; it is worth revisiting the pick_next_task part of the earlier analysis with this in mind. The code of __set_se_shares follows:
static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{
	struct cfs_rq *cfs_rq = se->cfs_rq;
	int on_rq;

	on_rq = se->on_rq;
	if (on_rq)
		dequeue_entity(cfs_rq, se, 0);

	se->load.weight = shares;
	se->load.inv_weight = 0;

	if (on_rq)
		enqueue_entity(cfs_rq, se, 0);
}
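The remove/update/re-insert dance exists because se->load feeds into the runqueue's ordering and accounting, so the weight cannot simply be overwritten while the entity is enqueued. Below is a tiny illustration of the same pattern, with a sorted list standing in for the CFS structures (purely illustrative, not CFS code):

#include <stdio.h>

struct node { unsigned long key; struct node *next; };

static void enqueue(struct node **head, struct node *n)   /* keep the list sorted by key */
{
	while (*head && (*head)->key < n->key)
		head = &(*head)->next;
	n->next = *head;
	*head = n;
}

static void dequeue(struct node **head, struct node *n)   /* unlink n */
{
	while (*head != n)
		head = &(*head)->next;
	*head = n->next;
}

int main(void)
{
	struct node a = { 10, NULL }, b = { 20, NULL }, c = { 30, NULL };
	struct node *head = NULL;
	enqueue(&head, &a); enqueue(&head, &b); enqueue(&head, &c);

	dequeue(&head, &b);      /* like dequeue_entity()          */
	b.key = 5;               /* like updating se->load.weight  */
	enqueue(&head, &b);      /* like enqueue_entity()          */

	for (struct node *n = head; n; n = n->next)
		printf("%lu ", n->key);
	printf("\n");            /* prints: 5 10 30 */
	return 0;
}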

So __set_se_shares itself is simple: the entity is taken off the runqueue, its load is updated, and it is put back. That completes update_shares. Note when it runs: after we have decided that balancing is needed, but before deciding how to balance. Refreshing the per-group load picture of the domain first helps the subsequent choice of scheduling group and of individual tasks. Now return to load_balance and continue.
redo:
	update_shares(sd);// refresh the shares of every task group taking part in scheduling within this domain
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);// find the busiest scheduling group in this domain

	if (*balance == 0)
		goto out_balanced;

	if (!group) {// nothing is busy enough, so no balancing is needed
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, idle, imbalance, cpus);// find the busiest runqueue in that group, i.e. the busiest cpu
	if (!busiest) {// no cpu meets the criteria, so no balancing is needed
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);// update statistics

	ld_moved = 0;// flag: did we move any tasks?
	if (busiest->nr_running > 1) {// tasks are only moved when the busiest cpu has more than one runnable task; as the comment says, moving its only task would just shift the imbalance elsewhere, so we leave it alone
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,// pick tasks from the busiest queue, as dictated by imbalance, and move them to this_rq
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(all_pinned)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
			goto out_balanced;
		}
	}
First, look at find_busiest_group; it is fairly long.
/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum cpu_idle_type idle,
		   int *sd_idle, const struct cpumask *cpus, int *balance)
{
	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
	unsigned long max_pull;
	unsigned long busiest_load_per_task, busiest_nr_running;
	unsigned long this_load_per_task, this_nr_running;
	int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance = 1;
	unsigned long leader_nr_running = 0, min_load_per_task = 0;
	unsigned long min_nr_running = ULONG_MAX;
	struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

	max_load = this_load = total_load = total_pwr = 0;
	busiest_load_per_task = busiest_nr_running = 0;
	this_load_per_task = this_nr_running = 0;

	if (idle == CPU_NOT_IDLE)// load_idx is derived from the idle argument; below it is an important index when looking for the busiest scheduling group (not task group)
		load_idx = sd->busy_idx;// busy_idx defaults to 3
	else if (idle == CPU_NEWLY_IDLE)
		load_idx = sd->newidle_idx;// newidle_idx is 2
	else
		load_idx = sd->idle_idx;// idle_idx is 1

	do {// from here down to "while (group != sd->groups)" is one big loop that walks every scheduling group in this domain looking for the busiest one; the group containing this_cpu does not compete with the others
		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
		int local_group;
		int i;
		int __group_imb = 0;
		unsigned int balance_cpu = -1, first_idle_cpu = 0;
		unsigned long sum_nr_running, sum_weighted_load;
		unsigned long sum_avg_load_per_task;
		unsigned long avg_load_per_task;

		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_cpus(group));// if this_cpu belongs to the group being examined, set local_group

		if (local_group)
			balance_cpu = cpumask_first(sched_group_cpus(group));// while handling the local group, tentatively take the group's first cpu as balance_cpu

		/* Tally up the load of all CPUs in the group */
		sum_weighted_load = sum_nr_running = avg_load = 0;
		sum_avg_load_per_task = avg_load_per_task = 0;

		max_cpu_load = 0;
		min_cpu_load = ~0UL;

		for_each_cpu_and(i, sched_group_cpus(group), cpus) {// for every cpu of this group that is still in the candidate mask
			struct rq *rq = cpu_rq(i);

			if (*sd_idle && rq->nr_running)
				*sd_idle = 0;

			/* Bias balancing toward cpus of our domain */
			if (local_group) {// local group: if cpu i is idle and balance_cpu has not yet been adjusted in this loop...
				if (idle_cpu(i) && !first_idle_cpu) {
					first_idle_cpu = 1;
					balance_cpu = i;// ...make it balance_cpu; i.e. the first idle cpu of the local group becomes balance_cpu, otherwise the group's first cpu keeps that role
				}

				load = target_load(i, load_idx);// accumulate the group's load; each cpu's contribution depends on the load_idx chosen above
			} else {// not the local group
				load = source_load(i, load_idx);// as above
				if (load > max_cpu_load)// track the largest and smallest per-cpu load seen inside this group
					max_cpu_load = load;
				if (min_cpu_load > load)
					min_cpu_load = load;
			}

			avg_load += load;// sum of the loads computed via load_idx
			sum_nr_running += rq->nr_running;// total number of runnable tasks on this group's cpus
			sum_weighted_load += weighted_cpuload(i);// the group's instantaneous weighted load; unlike avg_load there is no history involved (no load_idx)

			sum_avg_load_per_task += cpu_avg_load_per_task(i);// average load per task on this cpu
		}

		/*
		 * First idle cpu or the first cpu(busiest) in this sched group
		 * is eligible for doing load balancing at this and above
		 * domains. In the newly idle case, we will allow all the cpu's
		 * to do the newly idle load balance.
		 */
		if (idle != CPU_NEWLY_IDLE && local_group &&
		    balance_cpu != this_cpu && balance) {
			*balance = 0;
			goto ret;
		}

		total_load += avg_load;// running total of load for the whole domain
		total_pwr += group->__cpu_power;// accumulate the groups' __cpu_power (what exactly __cpu_power represents is not examined here)

		/* Adjust by relative CPU power of the group */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);// scale the summed load by the group's __cpu_power to get the group's final avg_load


		/*
		 * Consider the group unbalanced when the imbalance is larger
		 * than the average weight of two tasks.
		 *
		 * APZ: with cgroup the avg task weight can vary wildly and
		 *      might not be a suitable number - should we keep a
		 *      normalized nr_running number somewhere that negates
		 *      the hierarchy?
		 */
		avg_load_per_task = sg_div_cpu_power(group,
				sum_avg_load_per_task * SCHED_LOAD_SCALE);// likewise, scale the group's avg_load_per_task

		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)// if the spread between the most and least loaded cpu of the group exceeds twice the average per-task load, flag the group as internally imbalanced (__group_imb); its use appears below
			__group_imb = 1;

		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

		if (local_group) {// for the local group only the this_* variables are updated; busiest never points at it
			this_load = avg_load;
			this = group;
			this_nr_running = sum_nr_running;
			this_load_per_task = sum_weighted_load;
		} else if (avg_load > max_load &&
			   (sum_nr_running > group_capacity || __group_imb)) {// the group holds more tasks than its capacity or is internally imbalanced, and its avg_load beats the largest seen so far
			max_load = avg_load;// new maximum
			busiest = group;// remember the busiest group
			busiest_nr_running = sum_nr_running;// and its task count
			busiest_load_per_task = sum_weighted_load;
			group_imb = __group_imb;
		}

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)// the SMT/MC power-savings path is not analyzed here, so we skip a long stretch of code up to the end of the do-while loop
		/*
		 * Busy processors will not participate in power savings
		 * balance.
		 */
		if (idle == CPU_NOT_IDLE ||
				!(sd->flags & SD_POWERSAVINGS_BALANCE))
			goto group_next;

		/*
		 * If the local group is idle or completely loaded
		 * no need to do power savings balance at this domain
		 */
		if (local_group && (this_nr_running >= group_capacity ||
				    !this_nr_running))
			power_savings_balance = 0;

		/*
		 * If a group is already running at full capacity or idle,
		 * don't include that group in power savings calculations
		 */
		if (!power_savings_balance || sum_nr_running >= group_capacity
		    || !sum_nr_running)
			goto group_next;

		/*
		 * Calculate the group which has the least non-idle load.
		 * This is the group from where we need to pick up the load
		 * for saving power
		 */
		if ((sum_nr_running < min_nr_running) ||
		    (sum_nr_running == min_nr_running &&
		     cpumask_first(sched_group_cpus(group)) >
		     cpumask_first(sched_group_cpus(group_min)))) {
			group_min = group;
			min_nr_running = sum_nr_running;
			min_load_per_task = sum_weighted_load /
						sum_nr_running;
		}

		/*
		 * Calculate the group which is almost near its
		 * capacity but still has some space to pick up some load
		 * from other group and save more power
		 */
		if (sum_nr_running <= group_capacity - 1) {
			if (sum_nr_running > leader_nr_running ||
			    (sum_nr_running == leader_nr_running &&
			     cpumask_first(sched_group_cpus(group)) <
			      cpumask_first(sched_group_cpus(group_leader)))) {
				group_leader = group;
				leader_nr_running = sum_nr_running;
			}
		}
group_next:
#endif
		group = group->next;
	} while (group != sd->groups);// every group of the domain has now been visited; if a qualifying busiest group exists, busiest points at it

	if (!busiest || this_load >= max_load || busiest_nr_running == 0)// no candidate found, or the local group is at least as busy as the one found, or the busiest group has no running tasks: no balancing needed
		goto out_balanced;

	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

	if (this_load >= avg_load ||
			100*max_load <= sd->imbalance_pct*this_load)// another bail-out: the local group is already above the domain average, or the busiest group is not ahead of it by at least the imbalance_pct margin
		goto out_balanced;

	busiest_load_per_task /= busiest_nr_running;// average load per task in the busiest group
	if (group_imb)// the busiest group was flagged as internally imbalanced above, so cap the value
		busiest_load_per_task = min(busiest_load_per_task, avg_load);

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load, as either of these
	 * actions would just result in more rebalancing later, and ping-pong
	 * tasks around. Thus we look for the minimum possible imbalance.
	 * Negative imbalances (*we* are more loaded than anyone else) will
	 * be counted as no imbalance for these purposes -- we can't fix that
	 * by pulling tasks to us. Be careful of negative numbers as they'll
	 * appear as very large values with unsigned longs.
	 */
	if (max_load <= busiest_load_per_task)
		goto out_balanced;

	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load(as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
	if (max_load < avg_load) {
		*imbalance = 0;
		goto small_imbalance;
	}

	/* Don't want to pull so many tasks that a group would go idle */
	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

	/* How much load to actually move to equalise the imbalance */
	*imbalance = min(max_pull * busiest->__cpu_power,
				(avg_load - this_load) * this->__cpu_power)
			/ SCHED_LOAD_SCALE;// compute how much load should be moved; the finer details below are not analyzed further here

	/*
	 * if *imbalance is less than the average load per runnable task
	 * there is no gaurantee that any tasks will be moved so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
	if (*imbalance < busiest_load_per_task) {
		unsigned long tmp, pwr_now, pwr_move;
		unsigned int imbn;

small_imbalance:
		pwr_move = pwr_now = 0;
		imbn = 2;
		if (this_nr_running) {
			this_load_per_task /= this_nr_running;
			if (busiest_load_per_task > this_load_per_task)
				imbn = 1;
		} else
			this_load_per_task = cpu_avg_load_per_task(this_cpu);

		if (max_load - this_load + busiest_load_per_task >=
					busiest_load_per_task * imbn) {
			*imbalance = busiest_load_per_task;
			return busiest;
		}

		/*
		 * OK, we don't have enough imbalance to justify moving tasks,
		 * however we may be able to increase total CPU power used by
		 * moving them.
		 */

		pwr_now += busiest->__cpu_power *
				min(busiest_load_per_task, max_load);
		pwr_now += this->__cpu_power *
				min(this_load_per_task, this_load);
		pwr_now /= SCHED_LOAD_SCALE;

		/* Amount of load we'd subtract */
		tmp = sg_div_cpu_power(busiest,
				busiest_load_per_task * SCHED_LOAD_SCALE);
		if (max_load > tmp)
			pwr_move += busiest->__cpu_power *
				min(busiest_load_per_task, max_load - tmp);

		/* Amount of load we'd add */
		if (max_load * busiest->__cpu_power <
				busiest_load_per_task * SCHED_LOAD_SCALE)
			tmp = sg_div_cpu_power(this,
				max_load * busiest->__cpu_power);
		else
			tmp = sg_div_cpu_power(this,
				busiest_load_per_task * SCHED_LOAD_SCALE);
		pwr_move += this->__cpu_power *
				min(this_load_per_task, this_load + tmp);
		pwr_move /= SCHED_LOAD_SCALE;

		/* Move if we gain throughput */
		if (pwr_move > pwr_now)
			*imbalance = busiest_load_per_task;
	}

	return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
		goto ret;

	if (this == group_leader && group_leader != group_min) {
		*imbalance = min_load_per_task;
		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
				cpumask_first(sched_group_cpus(group_leader));
		}
		return group_min;
	}
#endif
ret:
	*imbalance = 0;
	return NULL;
}
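As a concrete example of the imbalance_pct gate among those bail-out checks (all numbers below are hypothetical): with sd->imbalance_pct = 125, the busiest group must carry at least 25% more load than the local group before balancing is considered, so a 20% gap is not enough.

#include <stdio.h>

int main(void)
{
	unsigned long imbalance_pct = 125;   /* typical sd->imbalance_pct         */
	unsigned long this_load     = 2000;  /* local group's adjusted avg_load   */
	unsigned long max_load      = 2400;  /* busiest group's adjusted avg_load */
	unsigned long avg_load      = 2200;  /* domain-wide average               */

	/* Mirrors: if (this_load >= avg_load ||
	 *              100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; */
	if (this_load >= avg_load || 100 * max_load <= imbalance_pct * this_load)
		printf("already balanced enough, skip\n");
	else
		printf("imbalanced: consider pulling load from the busiest group\n");
	return 0;
}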
Following load_balance's call path, the next step is find_busiest_queue, which is easier to understand.
/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
		   unsigned long imbalance, const struct cpumask *cpus)
{
	struct rq *busiest = NULL, *rq;
	unsigned long max_load = 0;
	int i;

	for_each_cpu(i, sched_group_cpus(group)) {
		unsigned long wl;

		if (!cpumask_test_cpu(i, cpus))// skip cpus that are no longer in the candidate mask passed in
			continue;

		rq = cpu_rq(i);
		wl = weighted_cpuload(i);

		if (rq->nr_running == 1 && wl > imbalance)// skip a cpu that runs only one task whose load already exceeds the amount we want to move
			continue;

		if (wl > max_load) {// track the maximum and remember the busiest runqueue
			max_load = wl;
			busiest = rq;
		}
	}

	return busiest;
}
Back in load_balance's call path again, the actual task movement can finally take place.
busiest = find_busiest_queue(group, idle, imbalance, cpus);// find the busiest runqueue in that group, i.e. the busiest cpu
	if (!busiest) {// no cpu meets the criteria, so no balancing is needed
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);// update statistics

	ld_moved = 0;// flag: did we move any tasks?
	if (busiest->nr_running > 1) {// tasks are only moved when the busiest cpu has more than one runnable task; as the comment says, moving its only task would just shift the imbalance elsewhere, so we leave it alone
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);// lock both runqueues at once; deadlock is avoided by always taking the locks in pointer-address order
		ld_moved = move_tasks(this_rq, this_cpu, busiest,// pick tasks from the busiest queue, as dictated by imbalance, and move them to this_rq
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(all_pinned)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
			goto out_balanced;
		}
	}
With both locks held, we enter move_tasks.
/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned)
{
	const struct sched_class *class = sched_class_highest;
	unsigned long total_load_moved = 0;
	int this_best_prio = this_rq->curr->prio;

	do {
		total_load_moved +=
			class->load_balance(this_rq, this_cpu, busiest,
				max_load_move - total_load_moved,
				sd, idle, all_pinned, &this_best_prio);
		class = class->next;

		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
			break;

	} while (class && max_load_move > total_load_moved);

	return total_load_moved > 0;
}
The function is essentially one do-while loop. Initially class points to sched_class_highest, and sched.c contains:
 #define sched_class_highest (&rt_sched_class)
In other words, the first iteration of the loop calls the load_balance method of the rt_sched_class scheduling class. Looking it up in sched_rt.c, we find:
	.load_balance		= load_balance_rt,
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
		unsigned long max_load_move,
		struct sched_domain *sd, enum cpu_idle_type idle,
		int *all_pinned, int *this_best_prio)
{
	/* don't touch RT tasks */
	return 0;
}
As you can see, this is an empty function: this balancing path never migrates RT tasks, so the loop moves on to the next scheduling class, the CFS class, and runs its load_balance implementation.

#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	long rem_load_move = max_load_move;// rem_load_move = remaining load to move
	int busiest_cpu = cpu_of(busiest);// the cpu that owns the busiest runqueue
	struct task_group *tg;

	rcu_read_lock();
	update_h_load(busiest_cpu);// refresh the per-group h_load values

	list_for_each_entry_rcu(tg, &task_groups, list) {
		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
		u64 rem_load, moved_load;

		/*
		 * empty group
		 */
		if (!busiest_cfs_rq->task_weight)
			continue;

		rem_load = (u64)rem_load_move * busiest_weight;
		rem_load = div_u64(rem_load, busiest_h_load + 1);

		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
				rem_load, sd, idle, all_pinned, this_best_prio,
				tg->cfs_rq[busiest_cpu]);

		if (!moved_load)
			continue;

		moved_load *= busiest_h_load;
		moved_load = div_u64(moved_load, busiest_weight + 1);

		rem_load_move -= moved_load;
		if (rem_load_move < 0)
			break;
	}
	rcu_read_unlock();

	return max_load_move - rem_load_move;
}

update_h_load looks much like the shares-update function we saw earlier.
 static void update_h_load(long cpu)
 {
     walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
As mentioned earlier, tg_nop is a no-op, so let's look at tg_load_down.
/*
 * Compute the cpu's hierarchical load factor for each task group.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 */
static int tg_load_down(struct task_group *tg, void *data)// the kernel comment above is fairly clear
{
	unsigned long load;
	long cpu = (long)data;

	if (!tg->parent) {
		load = cpu_rq(cpu)->load.weight;
	} else {
		load = tg->parent->cfs_rq[cpu]->h_load;// the parent level's movable load
		load *= tg->cfs_rq[cpu]->shares;// the group's per-cpu shares, i.e. its weight as seen by the parent level
		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
	}

	tg->cfs_rq[cpu]->h_load = load;

	return 0;
}
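A toy calculation of that top-down propagation, with made-up numbers for a single cpu: the root runqueue weight is 3072, the child group holds 1024 shares on this cpu, and the parent cfs_rq's own weight is 2048.

#include <stdio.h>

int main(void)
{
	/* hypothetical numbers for one cpu */
	unsigned long rq_weight         = 3072; /* cpu_rq(cpu)->load.weight               */
	unsigned long child_shares      = 1024; /* tg->cfs_rq[cpu]->shares                */
	unsigned long parent_cfs_weight = 2048; /* tg->parent->cfs_rq[cpu]->load.weight   */

	unsigned long root_h_load  = rq_weight;                    /* the !tg->parent case */
	unsigned long child_h_load = root_h_load * child_shares
				     / (parent_cfs_weight + 1);    /* the tg->parent case  */

	printf("root  h_load = %lu\n", root_h_load);
	printf("child h_load = %lu\n", child_h_load);
	return 0;
}

The child's h_load works out to roughly 1535, i.e. about half of the root's movable load, matching its roughly half share of the parent's weight.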

Worked through, the code above says: this group's h_load = parent's h_load × this group's shares / the parent cfs_rq's load.weight. In plain words, the parent's movable load is apportioned to each child in proportion to the child's weight within the parent. Once the amount of load to move has been computed for every group on this runqueue, actual tasks can be picked from each group. Back in load_balance_fair:
	update_h_load(busiest_cpu);// refresh the per-group h_load values

	list_for_each_entry_rcu(tg, &task_groups, list) {// for each task group's runqueue on that cpu
		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
		unsigned long busiest_h_load = busiest_cfs_rq->h_load;// the movable load for this group, just computed by update_h_load
		unsigned long busiest_weight = busiest_cfs_rq->load.weight;// this group's runqueue weight
		u64 rem_load, moved_load;

		/*
		 * empty group
		 */
		if (!busiest_cfs_rq->task_weight)
			continue;

		rem_load = (u64)rem_load_move * busiest_weight;
		rem_load = div_u64(rem_load, busiest_h_load + 1);// rem_load = rem_load_move * busiest_weight / (busiest_h_load + 1)

		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
				rem_load, sd, idle, all_pinned, this_best_prio,
				tg->cfs_rq[busiest_cpu]);// this is where the actual moving happens

		if (!moved_load)
			continue;

		moved_load *= busiest_h_load;
		moved_load = div_u64(moved_load, busiest_weight + 1);

		rem_load_move -= moved_load;// after handling one group, account for what was moved and see whether the next group still needs to contribute
		if (rem_load_move < 0)
			break;
	}
	rcu_read_unlock();

	return max_load_move - rem_load_move;
__load_balance_fair looks like this:
static unsigned long
__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		unsigned long max_load_move, struct sched_domain *sd,
		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
		struct cfs_rq *cfs_rq)
{
	struct rq_iterator cfs_rq_iterator;

	cfs_rq_iterator.start = load_balance_start_fair;
	cfs_rq_iterator.next = load_balance_next_fair;
	cfs_rq_iterator.arg = cfs_rq;

	return balance_tasks(this_rq, this_cpu, busiest,
			max_load_move, sd, idle, all_pinned,
			this_best_prio, &cfs_rq_iterator);
}
We still need to go into balance_tasks.
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
	      unsigned long max_load_move, struct sched_domain *sd,
	      enum cpu_idle_type idle, int *all_pinned,
	      int *this_best_prio, struct rq_iterator *iterator)
{
	int loops = 0, pulled = 0, pinned = 0;
	struct task_struct *p;
	long rem_load_move = max_load_move;

	if (max_load_move == 0)
		goto out;

	pinned = 1;

	/*
	 * Start the load-balancing iterator:
	 */
	p = iterator->start(iterator->arg);
next:
	if (!p || loops++ > sysctl_sched_nr_migrate)
		goto out;

	if ((p->se.load.weight >> 1) > rem_load_move ||
	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {// skip this task if its weight is more than twice the remaining load to move, or if it may not be migrated at all
		p = iterator->next(iterator->arg);
		goto next;
	}

	pull_task(busiest, p, this_rq, this_cpu);// OK to move: pull the task over to this_cpu's this_rq
	pulled++;// one more task moved
	rem_load_move -= p->se.load.weight;// shrink the remaining load budget

	/*
	 * We only want to steal up to the prescribed amount of weighted load.
	 */
	if (rem_load_move > 0) {
		if (p->prio < *this_best_prio)
			*this_best_prio = p->prio;
		p = iterator->next(iterator->arg);
		goto next;
	}
out:
	/*
	 * Right now, this is one of only two places pull_task() is called,
	 * so we can safely collect pull_task() stats here rather than
	 * inside pull_task().
	 */
	schedstat_add(sd, lb_gained[idle], pulled);// statistics

	if (all_pinned)
		*all_pinned = pinned;

	return max_load_move - rem_load_move;
}
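A stripped-down model of that selection loop, with a hypothetical task array instead of the runqueue iterator, shows how the weight budget and the "skip tasks heavier than twice the remaining budget" rule interact (names and numbers are invented):

#include <stdio.h>

struct task { const char *name; long weight; int pinned; };

int main(void)
{
	struct task tasks[] = {                 /* pretend contents of the busiest rq */
		{ "heavy",  4096, 0 },
		{ "mid",    1024, 0 },
		{ "pinned",  512, 1 },
		{ "light",   512, 0 },
	};
	long rem_load_move = 1536;              /* the imbalance handed to move_tasks */
	int  pulled = 0;

	for (unsigned i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		struct task *p = &tasks[i];

		/* (weight >> 1) > rem  <=>  weight is more than twice the remaining budget */
		if ((p->weight >> 1) > rem_load_move || p->pinned)
			continue;               /* same effect as the "goto next" path */

		printf("pull %s (weight %ld)\n", p->name, p->weight);
		pulled++;
		rem_load_move -= p->weight;
		if (rem_load_move <= 0)
			break;
	}
	printf("pulled %d tasks, %ld load budget left\n", pulled, rem_load_move);
	return 0;
}

With a budget of 1536, "heavy" is skipped as too large, the pinned task is skipped, and "mid" plus "light" exactly exhaust the budget.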

The code of can_migrate_task:
/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
		     struct sched_domain *sd, enum cpu_idle_type idle,
		     int *all_pinned)
{
	/*
	 * We do not migrate tasks that are:
	 * 1) running (obviously), or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) are cache-hot on their current CPU.
	 */
	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
		schedstat_inc(p, se.nr_failed_migrations_affine);
		return 0;
	}
	*all_pinned = 0;

	if (task_running(rq, p)) {
		schedstat_inc(p, se.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) task is cache cold, or
	 * 2) too many balance attempts have failed.
	 */

	if (!task_hot(p, rq->clock, sd) ||
			sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
		if (task_hot(p, rq->clock, sd)) {
			schedstat_inc(sd, lb_hot_gained[idle]);
			schedstat_inc(p, se.nr_forced_migrations);
		}
#endif
		return 1;
	}

	if (task_hot(p, rq->clock, sd)) {
		schedstat_inc(p, se.nr_failed_migrations_hot);
		return 0;
	}
	return 1;
}
The comments in the code are already very detailed, so no further explanation is needed here. Next, let's look at pull_task.
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)// the kernel comment above says it all
{
	deactivate_task(src_rq, p, 0);// remove p from the source runqueue
	set_task_cpu(p, this_cpu);// point p's bookkeeping at this_cpu; it is not on the new runqueue yet
	activate_task(this_rq, p, 0);// the final step: put p on this_rq
	/*
	 * Note that idle threads have a prio of MAX_PRIO, for this test
	 * to be always true for them.
	 */
	check_preempt_curr(this_rq, p, 0);
}
The set_task_cpu function:

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
	u64 clock_offset;

	clock_offset = old_rq->clock - new_rq->clock;

	trace_sched_migrate_task(p, task_cpu(p), new_cpu);

#ifdef CONFIG_SCHEDSTATS
	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
	if (old_cpu != new_cpu) {
		schedstat_inc(p, se.nr_migrations);
		if (task_hot(p, old_rq->clock, NULL))
			schedstat_inc(p, se.nr_forced2_migrations);
	}
#endif
	p->se.vruntime -= old_cfsrq->min_vruntime -
					 new_cfsrq->min_vruntime;

	__set_task_cpu(p, new_cpu);
}
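One detail near the end deserves attention: vruntime values are only comparable within a single cfs_rq, so the task's lead or lag relative to the old queue's min_vruntime is preserved by re-basing it on the new queue's min_vruntime. A small worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
	/* hypothetical virtual-runtime clocks, in nanoseconds */
	unsigned long long old_min_vruntime = 1000000;  /* old_cfsrq->min_vruntime      */
	unsigned long long new_min_vruntime =  400000;  /* new_cfsrq->min_vruntime      */
	unsigned long long vruntime         = 1003000;  /* p->se.vruntime, 3000 ahead   */

	/* p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; */
	vruntime -= old_min_vruntime - new_min_vruntime;

	printf("vruntime on the new queue: %llu (still %llu ahead of min)\n",
	       vruntime, vruntime - new_min_vruntime);
	return 0;
}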
__set_task_cpu:
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfuly executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
set_task_rq:
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

So after the move, task p still belongs to the same task group as before; it has merely been moved to that group's runqueue on a different CPU. That completes everything triggered by move_tasks: using the per-group load quotas computed earlier, tasks are picked out of each group in turn and moved away. Back in load_balance, the situation is now this: we found the busiest scheduling group in the domain, found the busiest CPU inside it, and used move_tasks to migrate a suitable amount of each task group's load from that runqueue onto this_cpu. All that remains is to check how well that work went.
if (!ld_moved) {// no task was moved
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {// too many consecutive failures (cache_nice_tries, judging by the name, exists to protect cache-hot tasks)

			spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the migration_thread, if the curr
			 * task on busiest cpu can't be moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					      &busiest->curr->cpus_allowed)) {// check whether the reason is that the current task on the busiest cpu is not allowed to run on this_cpu
				spin_unlock_irqrestore(&busiest->lock, flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			spin_unlock_irqrestore(&busiest->lock, flags);
			if (active_balance)// as a last resort, wake migration_thread and have it move tasks synchronously
				wake_up_process(busiest->migration_thread);

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;// reset the balancing interval
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}
What is migration_thread? Every cpu has a migration_thread kernel thread bound to it for exactly this situation; the binding simply consists of setting the cpu mask in that thread's task_struct, which also explains why the code above has to handle the "not allowed to move to this_cpu" case. So what does migration_thread do? sched.c contains the following function, which the thread runs once it has been forked:
/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
 * another runqueue.
 */
static int migration_thread(void *data)
{
	int cpu = (long)data;
	struct rq *rq;

	rq = cpu_rq(cpu);
	BUG_ON(rq->migration_thread != current);

	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		struct migration_req *req;
		struct list_head *head;

		spin_lock_irq(&rq->lock);

		if (cpu_is_offline(cpu)) {
			spin_unlock_irq(&rq->lock);
			goto wait_to_die;
		}

		if (rq->active_balance) {
			active_load_balance(rq, cpu);
			rq->active_balance = 0;
		}

		head = &rq->migration_queue;

		if (list_empty(head)) {
			spin_unlock_irq(&rq->lock);
			schedule();
			set_current_state(TASK_INTERRUPTIBLE);
			continue;
		}
		req = list_entry(head->next, struct migration_req, list);
		list_del_init(head->next);

		spin_unlock(&rq->lock);
		__migrate_task(req->task, cpu, req->dest_cpu);
		local_irq_enable();

		complete(&req->done);
	}
	__set_current_state(TASK_RUNNING);
	return 0;

wait_to_die:
	/* Wait for kthread_stop */
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}
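migration_thread follows a common kernel worker pattern: sleep until something sets a flag or queues work, handle it, and go back to sleep. Below is a rough userspace analogue using pthreads (the flag names mirror the kernel code, but everything else is invented; compile with -lpthread). It is only meant to show the wake-and-drain structure, not the kernel's locking details.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int active_balance;      /* like rq->active_balance        */
static int stop;                /* like kthread_should_stop()     */

static void *migration_worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stop) {
		if (active_balance) {
			active_balance = 0;
			printf("active_load_balance() would run here\n");
		}
		/* nothing left to do: sleep until woken, like schedule()
		 * after set_current_state(TASK_INTERRUPTIBLE) */
		pthread_cond_wait(&wake, &lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	pthread_create(&tid, NULL, migration_worker, NULL);

	pthread_mutex_lock(&lock);      /* like setting busiest->active_balance   */
	active_balance = 1;             /* ...and then wake_up_process()          */
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);

	pthread_mutex_lock(&lock);      /* shut the worker down */
	stop = 1;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	return 0;
}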
In the scenario just described, execution reaches active_load_balance.
/*
 * active_load_balance is run by migration threads. It pushes running tasks
 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
 * running on each physical CPU where possible, and avoids physical /
 * logical imbalances.
 *
 * Called with busiest_rq locked.
 */
static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
	int target_cpu = busiest_rq->push_cpu;
	struct sched_domain *sd;
	struct rq *target_rq;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		return;

	target_rq = cpu_rq(target_cpu);

	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-cpu setup.
	 */
	BUG_ON(busiest_rq == target_rq);

	/* move a task from busiest_rq to target_rq */
	double_lock_balance(busiest_rq, target_rq);
	update_rq_clock(busiest_rq);
	update_rq_clock(target_rq);

	/* Search for an sd spanning us and the target CPU. */
	for_each_domain(target_cpu, sd) {
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				break;
	}

	if (likely(sd)) {
		schedstat_inc(sd, alb_count);

		if (move_one_task(target_rq, target_cpu, busiest_rq,
				  sd, CPU_IDLE))// note move_one_task: only a single task is moved, a gentler action, since we only get here after ordinary balancing was blocked
			schedstat_inc(sd, alb_pushed);
		else
			schedstat_inc(sd, alb_failed);
	}
	double_unlock_balance(busiest_rq, target_rq);
}

Before the thread is woken, push_cpu has already been set to the this_cpu from load_balance; in other words, the move that could not be done at the time is retried later, with the same target cpu. In addition, migration_thread also checks whether migration requests have been queued on the rq and, if so, carries them out as well. How do tasks end up on that queue? Tracing with cscope leads to exec, i.e. the execution path of the sys_execve system call.
Original article: https://www.cnblogs.com/yangce/p/2910096.html