内核源码分析之进程调度机制（基于3.16-rc4）

进程调度所使用到的数据结构：

1.就绪队列

内核为每一个cpu创建一个进程就绪队列，该队列上的进程均由该cpu执行，代码如下（kernel/sched/core.c）。

1 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

定义了一个struct rq结构体数组，每个数组元素是一个就绪队列，对应一个cpu。下面看下struct rq结构体（kernel/sched/sched.h）：

  1 struct rq {
  2     /* runqueue lock: */
  3     raw_spinlock_t lock;
  4 
  5     /*
  6      * nr_running and cpu_load should be in the same cacheline because
  7      * remote CPUs use both these fields when doing load calculation.
  8      */
  9     unsigned int nr_running;
 10 #ifdef CONFIG_NUMA_BALANCING
 11     unsigned int nr_numa_running;
 12     unsigned int nr_preferred_running;
 13 #endif
 14     #define CPU_LOAD_IDX_MAX 5
 15     unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 16     unsigned long last_load_update_tick;
 17 #ifdef CONFIG_NO_HZ_COMMON
 18     u64 nohz_stamp;
 19     unsigned long nohz_flags;
 20 #endif
 21 #ifdef CONFIG_NO_HZ_FULL
 22     unsigned long last_sched_tick;
 23 #endif
 24     int skip_clock_update;
 25 
 26     /* capture load from *all* tasks on this cpu: */
 27     struct load_weight load;
 28     unsigned long nr_load_updates;
 29     u64 nr_switches;
 30 
 31     struct cfs_rq cfs;
 32     struct rt_rq rt;
 33     struct dl_rq dl;
 34 
 35 #ifdef CONFIG_FAIR_GROUP_SCHED
 36     /* list of leaf cfs_rq on this cpu: */
 37     struct list_head leaf_cfs_rq_list;
 38 
 39     struct sched_avg avg;
 40 #endif /* CONFIG_FAIR_GROUP_SCHED */
 41 
 42     /*
 43      * This is part of a global counter where only the total sum
 44      * over all CPUs matters. A task can increase this counter on
 45      * one CPU and if it got migrated afterwards it may decrease
 46      * it on another CPU. Always updated under the runqueue lock:
 47      */
 48     unsigned long nr_uninterruptible;
 49 
 50     struct task_struct *curr, *idle, *stop;
 51     unsigned long next_balance;
 52     struct mm_struct *prev_mm;
 53 
 54     u64 clock;
 55     u64 clock_task;
 56 
 57     atomic_t nr_iowait;
 58 
 59 #ifdef CONFIG_SMP
 60     struct root_domain *rd;
 61     struct sched_domain *sd;
 62 
 63     unsigned long cpu_capacity;
 64 
 65     unsigned char idle_balance;
 66     /* For active balancing */
 67     int post_schedule;
 68     int active_balance;
 69     int push_cpu;
 70     struct cpu_stop_work active_balance_work;
 71     /* cpu of this runqueue: */
 72     int cpu;
 73     int online;
 74 
 75     struct list_head cfs_tasks;
 76 
 77     u64 rt_avg;
 78     u64 age_stamp;
 79     u64 idle_stamp;
 80     u64 avg_idle;
 81 
 82     /* This is used to determine avg_idle's max value */
 83     u64 max_idle_balance_cost;
 84 #endif
 85 
 86 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 87     u64 prev_irq_time;
 88 #endif
 89 #ifdef CONFIG_PARAVIRT
 90     u64 prev_steal_time;
 91 #endif
 92 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 93     u64 prev_steal_time_rq;
 94 #endif
 95 
 96     /* calc_load related fields */
 97     unsigned long calc_load_update;
 98     long calc_load_active;
 99 
100 #ifdef CONFIG_SCHED_HRTICK
101 #ifdef CONFIG_SMP
102     int hrtick_csd_pending;
103     struct call_single_data hrtick_csd;
104 #endif
105     struct hrtimer hrtick_timer;
106 #endif
107 
108 #ifdef CONFIG_SCHEDSTATS
109     /* latency stats */
110     struct sched_info rq_sched_info;
111     unsigned long long rq_cpu_time;
112     /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
113 
114     /* sys_sched_yield() stats */
115     unsigned int yld_count;
116 
117     /* schedule() stats */
118     unsigned int sched_count;
119     unsigned int sched_goidle;
120 
121     /* try_to_wake_up() stats */
122     unsigned int ttwu_count;
123     unsigned int ttwu_local;
124 #endif
125 
126 #ifdef CONFIG_SMP
127     struct llist_head wake_list;
128 #endif
129 };

struct rq

该结构体是本地cpu所有进程组成的就绪队列，在linux内核中，进程被分为普通进程和实时进程，这两种进程的调度策略是不同的，因此在31-32行可以看到rq结构体中又内嵌了struct cfs_rq cfs和struct rt_rq rt两个子就绪队列，分别来组织普通进程和实时进程（普通进程将采用完全公平调度策略cfs，而实时进程将采用实时调度策略），第33行struct dl_rq dl调度空闲进程，暂且不讨论。所以，如果咱们研究的是普通进程的调度，需要关心的就是struct cfs_rq cfs队列；如果研究的是实时进程，就只关心struct rt_rq rt队列。

1.1普通进程的就绪队列struct cfs_rq（kernel/sched/sched.h）

 1 /* CFS-related fields in a runqueue */
 2 struct cfs_rq {
 3     struct load_weight load;
 4     unsigned int nr_running, h_nr_running;
 5 
 6     u64 exec_clock;
 7     u64 min_vruntime;
 8 #ifndef CONFIG_64BIT
 9     u64 min_vruntime_copy;
10 #endif
11 
12     struct rb_root tasks_timeline;
13     struct rb_node *rb_leftmost;
14 
15     /*
16      * 'curr' points to currently running entity on this cfs_rq.
17      * It is set to NULL otherwise (i.e when none are currently running).
18      */
19     struct sched_entity *curr, *next, *last, *skip;
20 
21 #ifdef    CONFIG_SCHED_DEBUG
22     unsigned int nr_spread_over;
23 #endif
24 
25 #ifdef CONFIG_SMP
26     /*
27      * CFS Load tracking
28      * Under CFS, load is tracked on a per-entity basis and aggregated up.
29      * This allows for the description of both thread and group usage (in
30      * the FAIR_GROUP_SCHED case).
31      */
32     unsigned long runnable_load_avg, blocked_load_avg;
33     atomic64_t decay_counter;
34     u64 last_decay;
35     atomic_long_t removed_load;
36 
37 #ifdef CONFIG_FAIR_GROUP_SCHED
38     /* Required to track per-cpu representation of a task_group */
39     u32 tg_runnable_contrib;
40     unsigned long tg_load_contrib;
41 
42     /*
43      *   h_load = weight * f(tg)
44      *
45      * Where f(tg) is the recursive weight fraction assigned to
46      * this group.
47      */
48     unsigned long h_load;
49     u64 last_h_load_update;
50     struct sched_entity *h_load_next;
51 #endif /* CONFIG_FAIR_GROUP_SCHED */
52 #endif /* CONFIG_SMP */
53 
54 #ifdef CONFIG_FAIR_GROUP_SCHED
55     struct rq *rq;    /* cpu runqueue to which this cfs_rq is attached */
56 
57     /*
58      * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
59      * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
60      * (like users, containers etc.)
61      *
62      * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
63      * list is used during load balance.
64      */
65     int on_list;
66     struct list_head leaf_cfs_rq_list;
67     struct task_group *tg;    /* group that "owns" this runqueue */
68 
69 #ifdef CONFIG_CFS_BANDWIDTH
70     int runtime_enabled;
71     u64 runtime_expires;
72     s64 runtime_remaining;
73 
74     u64 throttled_clock, throttled_clock_task;
75     u64 throttled_clock_task_time;
76     int throttled, throttle_count;
77     struct list_head throttled_list;
78 #endif /* CONFIG_CFS_BANDWIDTH */
79 #endif /* CONFIG_FAIR_GROUP_SCHED */
80 };

struct cfs_rq

cfs_rq就绪队列是以红黑树的形式来组织调度实体。第12行tasks_timeline成员就是红黑树的树根。第13行rb_leftmost指向了红黑树最左边的左孩子（下一个可调度的实体）。第19行curr指向当前正运行的实体，next指向将被唤醒的进程，last指向唤醒next进程的进程，next和last用法后边会提到。第55行rq指向了该cfs_rq就绪队列所属的rq队列。

1.2实时进程的就绪队列struct rt_rq（kernel/sched/sched.h）

 1 /* Real-Time classes' related field in a runqueue: */
 2 struct rt_rq {
 3     struct rt_prio_array active;
 4     unsigned int rt_nr_running;
 5 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 6     struct {
 7         int curr; /* highest queued rt task prio */
 8 #ifdef CONFIG_SMP
 9         int next; /* next highest */
10 #endif
11     } highest_prio;
12 #endif
13 #ifdef CONFIG_SMP
14     unsigned long rt_nr_migratory;
15     unsigned long rt_nr_total;
16     int overloaded;
17     struct plist_head pushable_tasks;
18 #endif
19     int rt_queued;
20 
21     int rt_throttled;
22     u64 rt_time;
23     u64 rt_runtime;
24     /* Nests inside the rq lock: */
25     raw_spinlock_t rt_runtime_lock;
26 
27 #ifdef CONFIG_RT_GROUP_SCHED
28     unsigned long rt_nr_boosted;
29 
30     struct rq *rq;
31     struct task_group *tg;
32 #endif
33 };

struct rt_rq

2.调度实体（include/linux/sched.h）

2.1普通进程的调度实体sched_entity

 1 struct sched_entity {
 2     struct load_weight    load;        /* for load-balancing */
 3     struct rb_node        run_node;
 4     struct list_head    group_node;
 5     unsigned int        on_rq;
 6 
 7     u64            exec_start;
 8     u64            sum_exec_runtime;
 9     u64            vruntime;
10     u64            prev_sum_exec_runtime;
11 
12     u64            nr_migrations;
13 
14 #ifdef CONFIG_SCHEDSTATS
15     struct sched_statistics statistics;
16 #endif
17 
18 #ifdef CONFIG_FAIR_GROUP_SCHED
19     int            depth;
20     struct sched_entity    *parent;
21     /* rq on which this entity is (to be) queued: */
22     struct cfs_rq        *cfs_rq;
23     /* rq "owned" by this entity/group: */
24     struct cfs_rq        *my_q;
25 #endif
26 
27 #ifdef CONFIG_SMP
28     /* Per-entity load-tracking */
29     struct sched_avg    avg;
30 #endif
31 };

每个进程描述符中均包含一个该结构体变量，内核使用该结构体来将普通进程组织到采用完全公平调度策略的就绪队列中（struct rq中的cfs队列中，上边提到过），该结构体有两个作用，一是包含有进程调度的信息（比如进程的运行时间，睡眠时间等等，调度程序参考这些信息决定是否调度进程），二是使用该结构体来组织进程，第3行的struct rb_node类型结构体变量run_node是红黑树节点，因此struct sched_entity调度实体将被组织成红黑树的形式，同时意味着普通进程也被组织成红黑树的形式。第18-25行是和组调度有关的成员，需要开启组调度。第20行parent顾名思义指向了当前实体的上一级实体，后边再介绍。第22行的cfs_rq指向了该调度实体所在的就绪队列。第24行my_q指向了本实体拥有的就绪队列（调度组），该调度组（包括组员实体）属于下一个级别，和本实体不在同一个级别，该调度组中所有成员实体的parent域指向了本实体，这就解释了上边的parent，此外，第19行depth代表了此队列（调度组）的深度，每个调度组都比其parent调度组深度大1。内核依赖my_q域实现组调度。

2.2实时进程的调度实体 sched_rt_entity

 1 struct sched_rt_entity {
 2     struct list_head run_list;
 3     unsigned long timeout;
 4     unsigned long watchdog_stamp;
 5     unsigned int time_slice;
 6 
 7     struct sched_rt_entity *back;
 8 #ifdef CONFIG_RT_GROUP_SCHED
 9     struct sched_rt_entity    *parent;
10     /* rq on which this entity is (to be) queued: */
11     struct rt_rq        *rt_rq;
12     /* rq "owned" by this entity/group: */
13     struct rt_rq        *my_q;
14 #endif
15 };

该结构体和上个结构体是类似的，只不过用来组织实时进程，实时进程被组织到struct rq中的rt队列中，上边有提到。每个进程描述符中也包含一个该结构体。该结构体中并未包含struct rb_node类型结构体变量，而在第1行出现了struct list_head类型结构体变量run_list，因此可以看出实时进程的就绪队列是双向链表形式，而不是红黑数的形式。

3.调度类（kernel/sched/sched.h）

 1 struct sched_class {
 2     const struct sched_class *next;
 3 
 4     void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 5     void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 6     void (*yield_task) (struct rq *rq);
 7     bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
 8 
 9     void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
10 
11     /*
12      * It is the responsibility of the pick_next_task() method that will
13      * return the next task to call put_prev_task() on the @prev task or
14      * something equivalent.
15      *
16      * May return RETRY_TASK when it finds a higher prio class has runnable
17      * tasks.
18      */
19     struct task_struct * (*pick_next_task) (struct rq *rq,
20                         struct task_struct *prev);
21     void (*put_prev_task) (struct rq *rq, struct task_struct *p);
22 
23 #ifdef CONFIG_SMP
24     int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
25     void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
26 
27     void (*post_schedule) (struct rq *this_rq);
28     void (*task_waking) (struct task_struct *task);
29     void (*task_woken) (struct rq *this_rq, struct task_struct *task);
30 
31     void (*set_cpus_allowed)(struct task_struct *p,
32                  const struct cpumask *newmask);
33 
34     void (*rq_online)(struct rq *rq);
35     void (*rq_offline)(struct rq *rq);
36 #endif
37 
38     void (*set_curr_task) (struct rq *rq);
39     void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
40     void (*task_fork) (struct task_struct *p);
41     void (*task_dead) (struct task_struct *p);
42 
43     void (*switched_from) (struct rq *this_rq, struct task_struct *task);
44     void (*switched_to) (struct rq *this_rq, struct task_struct *task);
45     void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
46                  int oldprio);
47 
48     unsigned int (*get_rr_interval) (struct rq *rq,
49                      struct task_struct *task);
50 
51 #ifdef CONFIG_FAIR_GROUP_SCHED
52     void (*task_move_group) (struct task_struct *p, int on_rq);
53 #endif
54 };

内核声明了一个调度类sched_class的结构体类型，用来实现不同的调度策略，可以看到该结构体成员都是函数指针，这些指针指向的函数就是调度策略的具体实现，所有和进程调度有关的函数都直接或者间接调用了这些成员函数，来实现进程调度。此外，每个进程描述符中都包含一个指向该结构体类型的指针sched_class，指向了所采用的调度类。下面我们看下完全公平调度策略类的定义和初始化（kernel/sched/fair.c）。

1 const struct sched_class fair_sched_class;

定义了一个全局的调度策略变量。初始化如下：

 1 const struct sched_class fair_sched_class = {
 2     .next            = &idle_sched_class,
 3     .enqueue_task        = enqueue_task_fair,
 4     .dequeue_task        = dequeue_task_fair,
 5     .yield_task        = yield_task_fair,
 6     .yield_to_task        = yield_to_task_fair,
 7 
 8     .check_preempt_curr    = check_preempt_wakeup,
 9 
10     .pick_next_task        = pick_next_task_fair,
11     .put_prev_task        = put_prev_task_fair,
12 
13 #ifdef CONFIG_SMP
14     .select_task_rq        = select_task_rq_fair,
15     .migrate_task_rq    = migrate_task_rq_fair,
16 
17     .rq_online        = rq_online_fair,
18     .rq_offline        = rq_offline_fair,
19 
20     .task_waking        = task_waking_fair,
21 #endif
22 
23     .set_curr_task          = set_curr_task_fair,
24     .task_tick        = task_tick_fair,
25     .task_fork        = task_fork_fair,
26 
27     .prio_changed        = prio_changed_fair,
28     .switched_from        = switched_from_fair,
29     .switched_to        = switched_to_fair,
30 
31     .get_rr_interval    = get_rr_interval_fair,
32 
33 #ifdef CONFIG_FAIR_GROUP_SCHED
34     .task_move_group    = task_move_group_fair,
35 #endif
36 };

可以看到该结构体变量中函数成员很多，它们实现了不同的功能，待会用到时我们再做分析。

4.进程描述符task_struct（include/linux/sched.h）

 1 struct task_struct {
 2     volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
 3     .....
 4     int on_rq;
 5 
 6     int prio, static_prio, normal_prio;
 7     unsigned int rt_priority;
 8     const struct sched_class *sched_class;
 9     struct sched_entity se;
10     struct sched_rt_entity rt;
11 #ifdef CONFIG_CGROUP_SCHED
12     struct task_group *sched_task_group;
13 #endif
14     struct sched_dl_entity dl;
15     .....
16     .....
17     unsigned int policy;
18     .....
19     .....
20     struct sched_info sched_info;
21     .....
22     .....
23 };

只列出了和进程调度有关的成员。第6行三个变量代表了普通进程的三个优先级，第7行的变量代表了实时进程的优先级。关于进程优先级的概念，大家可以看看《深入理解linux内核架构》这本书的进程管理章节。第8-10行就是我们上边提到的那些结构体在进程描述符中的定义。第17行的policy代表了当前进程的调度策略，内核给出了宏定义，它可以在这些宏中取值，关于详细的讲解还是去看《深入理解linux内核架构》这本书的进程管理部分。policy取了什么值，sched_class也应该取相应的调度类指针。

进程调度过程分析：

进程调度过程分为两部分，一是对进程信息进行修改，主要是修改和调度相关的信息，比如进程的运行时间，睡眠时间，进程的状态，cpu的负荷等等，二是进程的切换。和进程调度相关的所有函数中，只有schedule函数是用来进行进程切换的，其他函数都是用来修改进程的调度信息。schedule函数我们在前边的博文中已经探讨过了，这里不再分析。对于其他函数，我们将按照其功能，逐类来分析。

1.scheduler_tick（kernel/sched/core.c ）

 1 void scheduler_tick(void)
 2 {
 3     int cpu = smp_processor_id();
 4     struct rq *rq = cpu_rq(cpu);
 5     struct task_struct *curr = rq->curr;
 6 
 7     sched_clock_tick();
 8 
 9     raw_spin_lock(&rq->lock);
10     update_rq_clock(rq);
11     curr->sched_class->task_tick(rq, curr, 0);
12     update_cpu_load_active(rq);
13     raw_spin_unlock(&rq->lock);
14 
15     perf_event_task_tick();
16 
17 #ifdef CONFIG_SMP
18     rq->idle_balance = idle_cpu(cpu);
19     trigger_load_balance(rq);
20 #endif
21     rq_last_tick_reset(rq);
22 }

该函数被时钟中断处理程序调用，将当前cpu的负载情况记载到运行队列struct rq的某些成员中，并更新当前进程的时间片。第3行获取当前的cpu号，第4行获取当前cpu的就绪队列（每个cpu有一个）rq，第5行从就绪队列中获取当前运行进程的描述符，第10行更新就绪队列中的clock和clock_task成员值，代表当前的时间，一般我们会用到clock_task。第11行进入当前进程的调度类的task_tick函数中，更新当前进程的时间片，不同调度类的该函数实现不同，待会我们分析下完全公平调度类的该函数。第12行更新就绪队列的cpu负载信息。第18行判断当前cpu是否是空闲的，是的话idle_cpu返回1，否则返回0。第19行挂起SCHED_SOFTIRQ软中断函数，去做周期性的负载平衡操作。第21行将最新的时钟滴答数jiffies存入就绪队列的last_sched_tick域中。再来看下task_tick_fair函数（kernel/sched/fair.c）：

 1 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 2 {
 3     struct cfs_rq *cfs_rq;
 4     struct sched_entity *se = &curr->se;
 5 
 6     for_each_sched_entity(se) {
 7         cfs_rq = cfs_rq_of(se);
 8         entity_tick(cfs_rq, se, queued);
 9     }
10 
11     if (numabalancing_enabled)
12         task_tick_numa(rq, curr);
13 
14     update_rq_runnable_avg(rq, 1);
15 }

如果某个进程的调度类采用完全公平调度类的话，那么上个函数scheduler_tick第11行所执行的task_tick函数指针，就指向了本函数。可以回头看看完全公平调度对象的初始化，第24行的赋值语句.task_tick = task_tick_fair。回到本函数，第4行获取当前进程的普通调度实体，将指针存放到se中，第6-8行遍历当前调度实体的上一级实体，以及上上一级实体，以此类推，然后在entity_tick函数中更新调度实体的运行时间等信息。在这里用循环来遍历的原因是当开启了组调度后，调度实体的parent域就存储了它的上一级节点，该实体和它parent指向的实体不是同一级别，因此使用循环就把从当前级别（组）到最顶层的级别遍历完了；如果未选择组支持，则循环只执行一次，仅对当前调度实体进行更新。下面看下entity_tick的代码（kernel/sched/fair.c）：

 1 static void
 2 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 3 {
 4     /*
 5      * Update run-time statistics of the 'current'.
 6      */
 7     update_curr(cfs_rq);
 8 
 9     /*
10      * Ensure that runnable average is periodically updated.
11      */
12     update_entity_load_avg(curr, 1);
13     update_cfs_rq_blocked_load(cfs_rq, 1);
14     update_cfs_shares(cfs_rq);
15 
16 #ifdef CONFIG_SCHED_HRTICK
17     /*
18      * queued ticks are scheduled to match the slice, so don't bother
19      * validating it and just reschedule.
20      */
21     if (queued) {
22         resched_task(rq_of(cfs_rq)->curr);
23         return;
24     }
25     /*
26      * don't let the period tick interfere with the hrtick preemption
27      */
28     if (!sched_feat(DOUBLE_TICK) &&
29             hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
30         return;
31 #endif
32 
33     if (cfs_rq->nr_running > 1)
34         check_preempt_tick(cfs_rq, curr);
35 }

在该函数中对调度实体（进程）的运行时间等信息进行更新。第7行update_curr函数对当前进程的运行时间进行更新，随后分析。第21行如果传进来的参数queued不为0的话，当前进程会被无条件设置重新调度标志（允许被抢占了）。第33-34行如果当前cfs_rq队列等待调度的进程数量大于1，那么就执行check_preempt_tick函数检查当前进程的时间片是否用完，用完的话就需要调度别的进程来运行（具体来说，如果当前进程“真实时间片”用完，该函数给当前进程设置need_resched标志，然后schedule程序就可以重新在就绪队列中调度新的进程），下面分析update_curr函数（kernel/sched/fair.c）：

 1 static void update_curr(struct cfs_rq *cfs_rq)
 2 {
 3     struct sched_entity *curr = cfs_rq->curr;
 4     u64 now = rq_clock_task(rq_of(cfs_rq));
 5     u64 delta_exec;
 6 
 7     if (unlikely(!curr))
 8         return;
 9 
10     delta_exec = now - curr->exec_start;
11     if (unlikely((s64)delta_exec <= 0))
12         return;
13 
14     curr->exec_start = now;
15 
16     schedstat_set(curr->statistics.exec_max,
17               max(delta_exec, curr->statistics.exec_max));
18 
19     curr->sum_exec_runtime += delta_exec;
20     schedstat_add(cfs_rq, exec_clock, delta_exec);
21 
22     curr->vruntime += calc_delta_fair(delta_exec, curr);
23     update_min_vruntime(cfs_rq);
24 
25     if (entity_is_task(curr)) {
26         struct task_struct *curtask = task_of(curr);
27 
28         trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
29         cpuacct_charge(curtask, delta_exec);
30         account_group_exec_runtime(curtask, delta_exec);
31     }
32 
33     account_cfs_rq_runtime(cfs_rq, delta_exec);
34 }

该函数是更新进程运行时间最核心的一个函数。第3行获取当前的调度实体，第4行从就绪队列rq的clock_task成员中获取当前时间，存入now中，该成员我们在scheduler_tick函数中提到过。第10行用当前时间减去进程在上次时钟中断tick中的开始时间得到进程运行的时间间隔，存入delta_exec中。第14行当前时间又成为进程新的开始时间。第19行将进程运行的时间间隔delta_exec累加到调度实体的sum_exec_runtime成员中，该成员代表进程到目前为止运行了多长时间。第20行将进程运行的时间间隔delta_exec也累加到公平调度就绪队列cfs_rq的exec_clock成员中。第22行calc_delta_fair函数很关键，它将进程执行的真实运行时间转换成虚拟运行时间，然后累加到调度实体的vruntime域中，进程的虚拟时间非常重要，完全公平调度策略就是依赖该时间进行调度。关于进程的真实时间和虚拟时间的概念，以及它们之间的转换算法，文章的后面会提到，详细的内容大家可以看看《深入理解linux内核架构》的进程管理章节。第23行更新cfs_rq队列中的最小虚拟运行时间min_vruntime，该时间是就绪队列中所有进程包括当前进程的已运行的最小虚拟时间，只能单调递增，待会我们分析update_min_vruntime函数，该函数比较重要。第25-30行，如果调度单位是进程的话（不是组），会更新进程描述符中的运行时间。第33行更新cfs_rq队列的剩余运行时间，并计算出期望运行时间，必要的话可以对进程重新调度。下面我们先分析update_min_vruntime函数，然后分析calc_delta_fair函数（kernel/sched/fair.c）：

 1 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 2 {
 3     u64 vruntime = cfs_rq->min_vruntime;
 4 
 5     if (cfs_rq->curr)
 6         vruntime = cfs_rq->curr->vruntime;
 7 
 8     if (cfs_rq->rb_leftmost) {
 9         struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
10                            struct sched_entity,
11                            run_node);
12 
13         if (!cfs_rq->curr)
14             vruntime = se->vruntime;
15         else
16             vruntime = min_vruntime(vruntime, se->vruntime);
17     }
18 
19     /* ensure we never gain time by being placed backwards. */
20     cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
21 #ifndef CONFIG_64BIT
22     smp_wmb();
23     cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
24 #endif
25 }

每个cfs_rq队列均有一个min_vruntime成员，装的是就绪队列中所有进程包括当前进程已运行的虚拟时间中最小的那个时间。本函数来更新这个时间。第5-6行如果当前有进程正在执行，将当前进程已运行的虚拟时间保存在vruntime变量中。第8-17行如果就绪队列中有下一个要被调度的进程（由rb_leftmost指针指向），则进入if体，第13-16行从当前进程和下个被调度进程中，选择最小的已运行虚拟时间，保存到vruntime中。第20行从当前队列的min_vruntime域和vruntime变量中，选最大的保存到队列的min_vruntime域中，完成更新。其实第13-17行是最关键的，保证了队列的min_vruntime域中存放的是就绪队列中最小的虚拟运行时间，而第20行的作用仅仅是保证min_vruntime域中的值单调递增，没有别的含义了。队列中的min_vruntime成员非常重要，用于在睡眠进程被唤醒后以及新进程被创建好时，进行虚拟时间补偿或者惩罚，后面会分析到。

1 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
2 {
3     if (unlikely(se->load.weight != NICE_0_LOAD))
4         delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
5 
6     return delta;
7 }

第3行判断当前进程nice值是否为0，如果是的话，直接返回真实运行时间delta（nice0级别的进程真实运行时间和虚拟运行时间值相等）；如果不是的话，第4行将真实时间转换成虚拟时间。下面我们分析__calc_delta函数（kernel/sched/fair.c）：

 1 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 2 {
 3     u64 fact = scale_load_down(weight);
 4     int shift = WMULT_SHIFT;
 5 
 6     __update_inv_weight(lw);
 7 
 8     if (unlikely(fact >> 32)) {
 9         while (fact >> 32) {
10             fact >>= 1;
11             shift--;
12         }
13     }
14 
15     /* hint to use a 32x32->64 mul */
16     fact = (u64)(u32)fact * lw->inv_weight;
17 
18     while (fact >> 32) {
19         fact >>= 1;
20         shift--;
21     }
22 
23     return mul_u64_u32_shr(delta_exec, fact, shift);
24 }

该函数执行了两种算法：要么是delta_exec * weight / lw.weight，要么是(delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT。计算的结果就是转换后的虚拟时间。至此，scheduler_tick函数大致就分析完了，当然还有一些细节没有分析到，进程调度这块非常庞杂，要想把所有函数分析完非常耗费时间和精力，以后如果分析到的话，再慢慢补充。scheduler_tick函数主要更新进程的运行时间，包括物理时间和虚拟时间。

2.进程优先级设置的函数，进程的优先级和调度关系密切，通过上边分析可以看到，计算进程的虚拟运行时间要用到优先级，优先级决定进程权重，权重决定进程虚拟时间的增加速度，最终决定进程可运行时间的长短。权重越大的进程可以执行的时间越长。从effective_prio函数开始（kernel/sched/core.c）：

 1 static int effective_prio(struct task_struct *p)
 2 {
 3     p->normal_prio = normal_prio(p);
 4     /*
 5      * If we are RT tasks or we were boosted to RT priority,
 6      * keep the priority unchanged. Otherwise, update priority
 7      * to the normal priority:
 8      */
 9     if (!rt_prio(p->prio))
10         return p->normal_prio;
11     return p->prio;
12 }

该函数在进程创建时或者在用户的nice系统调用中都会被调用到，来设置进程的活动优先级（进程的三种优先级：活动优先级prio，静态优先级static_prio，普通优先级normal_prio），该函数设计的有一定技巧性，函数的返回值是用来设置进程的活动优先级，但是在函数体中也把进程的普通优先级设置了。第3行设置进程的普通优先级，随后分析normal_prio函数。第9-11行，通过进程的活动优先级可以判断出该进程是不是实时进程，如果是实时进程，则执行11行，返回p->prio，实时进程的活动优先级不变。否则返回p->normal_prio，这说明普通进程的活动优先级等于它的普通优先级。下面我们看看normal_prio函数（kernel/sched/core.c）：

 1 static inline int normal_prio(struct task_struct *p)
 2 {
 3     int prio;
 4 
 5     if (task_has_dl_policy(p))
 6         prio = MAX_DL_PRIO-1;
 7     else if (task_has_rt_policy(p))
 8         prio = MAX_RT_PRIO-1 - p->rt_priority;
 9     else
10         prio = __normal_prio(p);
11     return prio;
12 }

该函数用来设置进程的普通优先级。第5行判断当前进程是不是空闲进程，是的话设置进程的普通优先级为-1（-1是空闲进程的优先级），否则的话第7行判断进程是不是实时进程，是的话设置实时进程的普通优先级为0-99（数字越小优先级越高），可以看到这块减去了p->rt_priority，比较奇怪，这是因为实时进程描述符的rt_priority成员中事先存放了它自己的优先级（数字也是0-99，但在这里数字越大，优先级越高），因此往p->prio中倒换的时候，需要处理一下，MAX_RT_PRIO值为100，因此MAX_RT_PRIO-1-（0，99）就倒换成了（99，0），这仅仅是个小技巧。如果当前进程也不是实时进程（说明是普通进程喽），则第10行将进程的静态优先级存入prio中，然后返回（也就是说普通进程的普通优先级等于其静态优先级）。

总结下，总共有三种进程：空闲进程，实时进程，普通进程；每种进程都包含三种优先级：活动优先级，普通优先级，静态优先级。空闲进程的普通优先级永远-1，实时进程的普通优先级是根据p->rt_priority设定的（0-99），普通进程的普通优先级是根据其静态优先级设定的（100-139）。

3.进程权重设置的函数，上边我们提到，进程的优先级决定进程的权重。权重进而决定进程运行时间的长短。我们先分析和权重相关的数据结构。

struct load_weight（include/linux/sched.h）

1 struct load_weight {
2     unsigned long weight;
3     u32 inv_weight;
4 };

每个进程描述符，调度实体，就绪对列中都包含一个该类型变量，用来保存各自的权重。成员weight中存放进程优先级所对应的权重。成员inv_weight中存放NICE_0_LOAD/weight的结果，这个结果乘以进程运行的时间间隔delta_exec就是进程运行的虚拟时间。因此引入权重就是为了计算进程的虚拟时间。在这里将中间的计算结果保存下来，后边就不用再计算了，直接可以用。

数组prio_to_weight[40]（kernel/sched/sched.h）

 1 static const int prio_to_weight[40] = {
 2  /* -20 */     88761,     71755,     56483,     46273,     36291,
 3  /* -15 */     29154,     23254,     18705,     14949,     11916,
 4  /* -10 */      9548,      7620,      6100,      4904,      3906,
 5  /*  -5 */      3121,      2501,      1991,      1586,      1277,
 6  /*   0 */      1024,       820,       655,       526,       423,
 7  /*   5 */       335,       272,       215,       172,       137,
 8  /*  10 */       110,        87,        70,        56,        45,
 9  /*  15 */        36,        29,        23,        18,        15,
10 };

该数组是普通进程的优先级和权重对应关系。数组元素是权重值，分别是优先级从100-139或者nice值从-20-+19所对应的权重值。nice值（-20-+19）是从用户空间看到的普通进程的优先级，和内核空间的优先级（100-139）一一对应。struct load_weight中的weight成员存放正是这些权重值。

中间结果数组prio_to_wmult[40] （kernel/sched/sched.h）

 1 static const u32 prio_to_wmult[40] = {
 2  /* -20 */     48388,     59856,     76040,     92818,    118348,
 3  /* -15 */    147320,    184698,    229616,    287308,    360437,
 4  /* -10 */    449829,    563644,    704093,    875809,   1099582,
 5  /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 6  /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 7  /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 8  /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 9  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
10 };

该数组元素就是上个数组元素被NICE_0_LOAD除的结果，一一对应。struct load_weight中的inv_weight成员存放正是这些值。

下边我们分析和权重设置相关的函数。从set_load_weight函数开始（kernel/sched/core.c）：

 1 static void set_load_weight(struct task_struct *p)
 2 {
 3     int prio = p->static_prio - MAX_RT_PRIO;
 4     struct load_weight *load = &p->se.load;
 5 
 6     /*
 7      * SCHED_IDLE tasks get minimal weight:
 8      */
 9     if (p->policy == SCHED_IDLE) {
10         load->weight = scale_load(WEIGHT_IDLEPRIO);
11         load->inv_weight = WMULT_IDLEPRIO;
12         return;
13     }
14 
15     load->weight = scale_load(prio_to_weight[prio]);
16     load->inv_weight = prio_to_wmult[prio];
17 }

该函数用来设置进程p的权重。第3行将进程p的静态优先级转换成数组下标（减去100，从100-139--->0-39）。第4行获取当前进程的调度实体中的权重结构体指针，存入load中。第9-12行，如果当前进程是空闲进程，则设置相应的权重和中间计算结果。第15-16行，设置实时进程和普通进程的权重和中间计算结果。

由于就绪队列中也包含权重结构体，所以也要对它们进行设置。使用以下函数（kernel/sched/fair.c）：

1 static inline void update_load_set(struct load_weight *lw, unsigned long w)
2 {
3     lw->weight = w;
4     lw->inv_weight = 0;
5 }

该函数用来设置就绪队列的权重。

1 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
2 {
3     lw->weight += inc;
4     lw->inv_weight = 0;
5 }

当有进程加入就绪队列，使用该函数增加就绪队列的权重。

1 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
2 {
3     lw->weight -= dec;
4     lw->inv_weight = 0;
5 }

当有进程从就绪队列移除时，使用该函数减少就绪队列的权重。就绪队列的load_weight.inv_weight成员总是0，因为不会使用到就绪队列的该域。

4.计算进程延迟周期的相关函数。进程的延迟周期指的是将所有进程执行一遍的时间。当就绪队列中的进程数量不超出规定的时候，内核有一个固定的延迟周期供调度使用，但是当进程数量超出规定以后，就需要对该固定延迟周期进行扩展（不然随着进程越多，每个进程分配的执行时间会越少）。下面看看sched_slice函数（kernel/sched/fair.c）：

 1 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2 {
 3     u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 4 
 5     for_each_sched_entity(se) {
 6         struct load_weight *load;
 7         struct load_weight lw;
 8 
 9         cfs_rq = cfs_rq_of(se);
10         load = &cfs_rq->load;
11 
12         if (unlikely(!se->on_rq)) {
13             lw = cfs_rq->load;
14 
15             update_load_add(&lw, se->load.weight);
16             load = &lw;
17         }
18         slice = __calc_delta(slice, se->load.weight, load);
19     }
20     return slice;
21 }

直接看第18行，__calc_delta用来计算当前进程在延迟周期中可占的时间（相当于“时间片”，就是当前进程可用的时间）。__calc_delta函数很强大，记得前边在entity_tick函数中也调用过该函数（entity_tick--->update_curr--->calc_delta_fair--->__calc_delta），当时使用该函数是为了将进程运行过的物理时间（真实时间）转换成虚拟时间；而在此处，我们用它来计算当前进程在延迟周期中可占的时间（相当于“时间片”）。前边提过，__calc_delta中用到param1 * param2 / param3.weight这个公式（param代表该函数接收的参数），当param2的值固定不变（等于NICE_0_LOAD），param3（代表当前进程的权重）在变化时，该函数是用来转换真实时间和虚拟时间的；当param3（代表当前cfs_rq的权重，cfs_rq->load->weight）的值固定不变，param2在变化（代表当前进程的权重）时，该函数则是用来计算当前进程应该运行的时间。因此第18行计算结果slice就是当前进程应该运行的真实时间。下面再看一个函数sched_vslice（kernel/sched/fair.c）：

1 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
2 {
3     return calc_delta_fair(sched_slice(cfs_rq, se), se);
4 }

该函数用来将进程应该运行的真实时间转换成应该运行的虚拟时间，以供调度使用。可以看到该函数调用了cals_delta_fair来进行时间转换，前边已分析过，不再赘述。

5.选择下一个需要调度的进程。所使用的函数是pick_next_task_fair，代码如下（kernel/sched/fair.c）：

  1 static struct task_struct *
  2 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
  3 {
  4     struct cfs_rq *cfs_rq = &rq->cfs;
  5     struct sched_entity *se;
  6     struct task_struct *p;
  7     int new_tasks;
  8 
  9 again:
 10 #ifdef CONFIG_FAIR_GROUP_SCHED
 11     if (!cfs_rq->nr_running)
 12         goto idle;
 13 
 14     if (prev->sched_class != &fair_sched_class)
 15         goto simple;
 16 
 17     /*
 18      * Because of the set_next_buddy() in dequeue_task_fair() it is rather
 19      * likely that a next task is from the same cgroup as the current.
 20      *
 21      * Therefore attempt to avoid putting and setting the entire cgroup
 22      * hierarchy, only change the part that actually changes.
 23      */
 24 
 25     do {
 26         struct sched_entity *curr = cfs_rq->curr;
 27 
 28         /*
 29          * Since we got here without doing put_prev_entity() we also
 30          * have to consider cfs_rq->curr. If it is still a runnable
 31          * entity, update_curr() will update its vruntime, otherwise
 32          * forget we've ever seen it.
 33          */
 34         if (curr && curr->on_rq)
 35             update_curr(cfs_rq);
 36         else
 37             curr = NULL;
 38 
 39         /*
 40          * This call to check_cfs_rq_runtime() will do the throttle and
 41          * dequeue its entity in the parent(s). Therefore the 'simple'
 42          * nr_running test will indeed be correct.
 43          */
 44         if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 45             goto simple;
 46 
 47         se = pick_next_entity(cfs_rq, curr);
 48         cfs_rq = group_cfs_rq(se);
 49     } while (cfs_rq);
 50 
 51     p = task_of(se);
 52 
 53     /*
 54      * Since we haven't yet done put_prev_entity and if the selected task
 55      * is a different task than we started out with, try and touch the
 56      * least amount of cfs_rqs.
 57      */
 58     if (prev != p) {
 59         struct sched_entity *pse = &prev->se;
 60 
 61         while (!(cfs_rq = is_same_group(se, pse))) {
 62             int se_depth = se->depth;
 63             int pse_depth = pse->depth;
 64 
 65             if (se_depth <= pse_depth) {
 66                 put_prev_entity(cfs_rq_of(pse), pse);
 67                 pse = parent_entity(pse);
 68             }
 69             if (se_depth >= pse_depth) {
 70                 set_next_entity(cfs_rq_of(se), se);
 71                 se = parent_entity(se);
 72             }
 73         }
 74 
 75         put_prev_entity(cfs_rq, pse);
 76         set_next_entity(cfs_rq, se);
 77     }
 78 
 79     if (hrtick_enabled(rq))
 80         hrtick_start_fair(rq, p);
 81 
 82     return p;
 83 simple:
 84     cfs_rq = &rq->cfs;
 85 #endif
 86 
 87     if (!cfs_rq->nr_running)
 88         goto idle;
 89 
 90     put_prev_task(rq, prev);
 91 
 92     do {
 93         se = pick_next_entity(cfs_rq, NULL);
 94         set_next_entity(cfs_rq, se);
 95         cfs_rq = group_cfs_rq(se);
 96     } while (cfs_rq);
 97 
 98     p = task_of(se);
 99 
100     if (hrtick_enabled(rq))
101         hrtick_start_fair(rq, p);
102 
103     return p;
104 
105 idle:
106     new_tasks = idle_balance(rq);
107     /*
108      * Because idle_balance() releases (and re-acquires) rq->lock, it is
109      * possible for any higher priority task to appear. In that case we
110      * must re-start the pick_next_entity() loop.
111      */
112     if (new_tasks < 0)
113         return RETRY_TASK;
114 
115     if (new_tasks > 0)
116         goto again;
117 
118     return NULL;
119 }

pick_next_task_fair

该函数会被赋给公平调度类的pick_next_task成员（.pick_next_task = pick_next_task_fair），在schedule函数中会调用该函数选择下一个需要调用的进程，然后进行进程切换。第11-12行如果当前就绪队列中的进程数量为0，则退出函数。第25-49行对所有的调度组进行遍历，从中选择下一个可调度的进程，而不只局限在当前队列的当前组。第26行获取当前调度实体，第34-37行如果存在一个当前调度实体（进程）并且正在运行，则更新进程的运行时间，否则就绪队列cfs_rq.curr置为null，表示当前无进程运行。第47行获取下一个调度实体，待会来分析该函数。第48行获取下个调度实体所拥有的就绪队列my_q（代表一个调度组），如果调度组非空，则进入下一次循环，在新的就绪队列（调度组）中挑选下一个可调度进程，直到某个实体没有自己的就绪队列为止（遍历完所有的调度组了）。退出循环后，可以发现此时的se是所遍历的最后一个调度组的下个可运行实体。第51行获取se对应的进程描述符，第58-77行，如果当前进程和下一个进程（se所对应的进程）不是同一个的话，则执行if体，第59行将当前进程的调度实体指针装入pse中，第61-72行如果当前进程和下一个调度的进程（se对应的进程）不属于同一调度组，则进入循环。否则，执行第75-76行，将当前进程放回就绪队列，将下个进程从就绪队列中拿出，这两个函数涉及到了就绪队列的出队和入队操作，我们在下边分析。第61-73的循环根据当前实体和下个调度实体的节点深度进行调整（同一个级别的进程才能切换）。下面看看pick_next_entity，put_prev_entity和set_prev_entity函数代码（kernel/sched/fair.c）：

 1 static struct sched_entity *
 2 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 3 {
 4     struct sched_entity *left = __pick_first_entity(cfs_rq);
 5     struct sched_entity *se;
 6 
 7     /*
 8      * If curr is set we have to see if its left of the leftmost entity
 9      * still in the tree, provided there was anything in the tree at all.
10      */
11     if (!left || (curr && entity_before(curr, left)))
12         left = curr;
13 
14     se = left; /* ideally we run the leftmost entity */
15 
16     /*
17      * Avoid running the skip buddy, if running something else can
18      * be done without getting too unfair.
19      */
20     if (cfs_rq->skip == se) {
21         struct sched_entity *second;
22 
23         if (se == curr) {
24             second = __pick_first_entity(cfs_rq);
25         } else {
26             second = __pick_next_entity(se);
27             if (!second || (curr && entity_before(curr, second)))
28                 second = curr;
29         }
30 
31         if (second && wakeup_preempt_entity(second, left) < 1)
32             se = second;
33     }
34 
35     /*
36      * Prefer last buddy, try to return the CPU to a preempted task.
37      */
38     if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
39         se = cfs_rq->last;
40 
41     /*
42      * Someone really wants this to run. If it's not unfair, run it.
43      */
44     if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
45         se = cfs_rq->next;
46 
47     clear_buddies(cfs_rq, se);
48 
49     return se;
50 }

该函数选择下一个调度的实体。第4行将红黑树的最左边实体赋给left，第11-12行如果红黑树的最左边实体为空或者当前实体运行的虚拟时间小于下一个实体（继续当前的实体），把当前实体赋给left，实际上left指向的是下一个要执行的进程（该进程要么还是当前进程，要么是下一个进程），第14行将left赋给se，第20-33行如果se进程是需要跳过的进程（不能被调度），执行if体，如果se进程是当前进程，则选择红黑数最左的进程赋给second，否则se进程已经是最左的进程，就把next指向的进程赋给second（next指向的是刚刚被唤醒的进程），第32行将second再次赋给se，se是挑选出来的下个进程。第38-45，再次判断，要么把last指向的进程赋给se，要么把next指向进程赋给se，内核更倾向于把last赋给se，因为last是唤醒next进程的进程（一般就是当前进程），所以执行last就意味着不用切换进程，效率最高。第47行清理掉next和last域。第31，38，44行使用到的wakeup_preempt_entity函数我们在进程唤醒那块再分析。

 1 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 2 {
 3     /*
 4      * If still on the runqueue then deactivate_task()
 5      * was not called and update_curr() has to be done:
 6      */
 7     if (prev->on_rq)
 8         update_curr(cfs_rq);
 9 
10     /* throttle cfs_rqs exceeding runtime */
11     check_cfs_rq_runtime(cfs_rq);
12 
13     check_spread(cfs_rq, prev);
14     if (prev->on_rq) {
15         update_stats_wait_start(cfs_rq, prev);
16         /* Put 'current' back into the tree. */
17         __enqueue_entity(cfs_rq, prev);
18         /* in !on_rq case, update occurred at dequeue */
19         update_entity_load_avg(prev, 1);
20     }
21     cfs_rq->curr = NULL;
22 }

该函数将当前调度实体放回就绪队列。第7-8行如果当前实体正在运行，更新其时间片。第17行将当前实体加入到就绪队列中，待会分析__enqueue_entity函数。第21行将就绪队列的curr域置为null，因为当前进程已经放回就绪队列了，就表示当前没有进程正在执行了，因此curr为空。

 1 static void
 2 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 3 {
 4     /* 'current' is not kept within the tree. */
 5     if (se->on_rq) {
 6         /*
 7          * Any task has to be enqueued before it get to execute on
 8          * a CPU. So account for the time it spent waiting on the
 9          * runqueue.
10          */
11         update_stats_wait_end(cfs_rq, se);
12         __dequeue_entity(cfs_rq, se);
13     }
14 
15     update_stats_curr_start(cfs_rq, se);
16     cfs_rq->curr = se;
17 #ifdef CONFIG_SCHEDSTATS
18     /*
19      * Track our maximum slice length, if the CPU's load is at
20      * least twice that of our own weight (i.e. dont track it
21      * when there are only lesser-weight tasks around):
22      */
23     if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
24         se->statistics.slice_max = max(se->statistics.slice_max,
25             se->sum_exec_runtime - se->prev_sum_exec_runtime);
26     }
27 #endif
28     se->prev_sum_exec_runtime = se->sum_exec_runtime;
29 }

该函数将下一个被调度实体从就绪队列中取出。第12行实现取出操作，待会分析该函数。第16行将取出的调度实体指针赋给就绪队列的curr，那么此时就有了正在运行的进程了。后边的代码更新当前进程的时间统计信息。

6.就绪队列的入队和出队操作（kernel/sched/fair.c）。

 1 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2 {
 3     struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 4     struct rb_node *parent = NULL;
 5     struct sched_entity *entry;
 6     int leftmost = 1;
 7 
 8     /*
 9      * Find the right place in the rbtree:
10      */
11     while (*link) {
12         parent = *link;
13         entry = rb_entry(parent, struct sched_entity, run_node);
14         /*
15          * We dont care about collisions. Nodes with
16          * the same key stay together.
17          */
18         if (entity_before(se, entry)) {
19             link = &parent->rb_left;
20         } else {
21             link = &parent->rb_right;
22             leftmost = 0;
23         }
24     }
25 
26     /*
27      * Maintain a cache of leftmost tree entries (it is frequently
28      * used):
29      */
30     if (leftmost)
31         cfs_rq->rb_leftmost = &se->run_node;
32 
33     rb_link_node(&se->run_node, parent, link);
34     rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
35 }

该函数实现入队操作。第3行获取就绪队列中红黑树的根节点，将树根指针保存在link中。第12行parent暂时指向树根。第13行获得树根节点的调度实体，保存在entry中。第18-22行，比较要入队的实体中的已运行虚拟时间和树根实体中的该信息，如果前者小的话，就要插入到树的左子树上（link指向树根的左孩子，再次进入循环，类似于递归），否则就要插入到树的右子树上（同上）。这块就将进程的调度策略展现的淋漓尽致：根据进程已运行的虚拟时间来决定进程的调度，红黑树的左子树比右子树要先被调度，已运行的虚拟时间越小的进程越在树的左侧。第30-31行，如果入队的实体最终被插在左孩子上（该入队实体仍是就绪队列上最靠前的实体，下次就会被调用），那么就要让就绪队列的rb_leftmost指向入队实体。rb_leftmost指针始终指向下次要被调度的实体（进程）。最后两行要给红黑数重新着色。

 1 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 2 {
 3     if (cfs_rq->rb_leftmost == &se->run_node) {
 4         struct rb_node *next_node;
 5 
 6         next_node = rb_next(&se->run_node);
 7         cfs_rq->rb_leftmost = next_node;
 8     }
 9 
10     rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
11 }

该函数实现出队操作。第3行判断要出队的实体是不是红黑树最左侧的孩子（rb_leftmost所指向的），如果不是的话，第10行直接删除该出队实体。否则执行if体，第6行找到出队实体的下一个实体（中序遍历的下一个节点，也就是当出队实体删除后，最左边的孩子），赋给next_node。第7行让rb_leftmost指向next_node。在删除掉要出队实体后，下一个需要被调度的实体也就设置好了。

7.睡眠进程被唤醒后抢占当前进程。当某个资源空出来后，等待该资源的进程就会被唤醒，唤醒后也许就要抢占当前进程，因此这块的函数也需要分析（kernel/sched/core.c）。

 1 static int
 2 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 3 {
 4     unsigned long flags;
 5     int cpu, success = 0;
 6 
 7     /*
 8      * If we are going to wake up a thread waiting for CONDITION we
 9      * need to ensure that CONDITION=1 done by the caller can not be
10      * reordered with p->state check below. This pairs with mb() in
11      * set_current_state() the waiting thread does.
12      */
13     smp_mb__before_spinlock();
14     raw_spin_lock_irqsave(&p->pi_lock, flags);
15     if (!(p->state & state))
16         goto out;
17 
18     success = 1; /* we're going to change ->state */
19     cpu = task_cpu(p);
20 
21     if (p->on_rq && ttwu_remote(p, wake_flags))
22         goto stat;
23 
24 #ifdef CONFIG_SMP
25     /*
26      * If the owning (remote) cpu is still in the middle of schedule() with
27      * this task as prev, wait until its done referencing the task.
28      */
29     while (p->on_cpu)
30         cpu_relax();
31     /*
32      * Pairs with the smp_wmb() in finish_lock_switch().
33      */
34     smp_rmb();
35 
36     p->sched_contributes_to_load = !!task_contributes_to_load(p);
37     p->state = TASK_WAKING;
38 
39     if (p->sched_class->task_waking)
40         p->sched_class->task_waking(p);
41 
42     cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
43     if (task_cpu(p) != cpu) {
44         wake_flags |= WF_MIGRATED;
45         set_task_cpu(p, cpu);
46     }
47 #endif /* CONFIG_SMP */
48 
49     ttwu_queue(p, cpu);
50 stat:
51     ttwu_stat(p, cpu, wake_flags);
52 out:
53     raw_spin_unlock_irqrestore(&p->pi_lock, flags);
54 
55     return success;
56 }

该函数会唤醒参数p指定的进程，将进程加入就绪队列中等待调度。第15行判断p进程的状态码和传进来的状态码是否一致，不一致的话函数结束（不一致则说明进程等待的条件未满足）。第19行获取要唤醒进程p原先所在的cpu号。第37行设置要唤醒进程p的状态为TASK_WAKING。第40行执行进程p的调度类中的task_waking函数，该函数指针指向了task_waking_fair函数，该函数主要是对睡眠进程的已运行虚拟时间进行补偿，待会分析该函数。第42行为刚唤醒进程p选择一个合适的就绪队列供其加入，返回就绪队列所在的cpu号。第43行如果进程p所在的原先cpu和为它挑选的cpu不是同一个的话，第45行更改进程p的cpu号。

 1 void wake_up_new_task(struct task_struct *p)
 2 {
 3     unsigned long flags;
 4     struct rq *rq;
 5 
 6     raw_spin_lock_irqsave(&p->pi_lock, flags);
 7 #ifdef CONFIG_SMP
 8     /*
 9      * Fork balancing, do it here and not earlier because:
10      *  - cpus_allowed can change in the fork path
11      *  - any previously selected cpu might disappear through hotplug
12      */
13     set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
14 #endif
15 
16     /* Initialize new task's runnable average */
17     init_task_runnable_average(p);
18     rq = __task_rq_lock(p);
19     activate_task(rq, p, 0);
20     p->on_rq = 1;
21     trace_sched_wakeup_new(p, true);
22     check_preempt_curr(rq, p, WF_FORK);
23 #ifdef CONFIG_SMP
24     if (p->sched_class->task_woken)
25         p->sched_class->task_woken(rq, p);
26 #endif
27     task_rq_unlock(rq, p, &flags);
28 }

该函数用来唤醒刚创建好的进程，而上个函数是用来唤醒睡眠中的进程。第13行为将唤醒的进程p设置合适的cpu。第17行设置进程p的可运行时间长度（类似“时间片”），第19行activate_task函数主要将唤醒的进程p加入就绪队列,并更新队列的时间,初始化进程p的时间等，该函数中的调用关系大致为activate_task->enqueue_task->enqueue_task_fair(p->sched_class->enqueue_task)->enqueue_entity。第22行check_preempt_curr函数调用了check_preempt_wakeup函数，来检查唤醒进程是否能抢占当前进程，下面分析该函数（kernel/sched/fair.c）。

 1 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 2 {
 3     struct task_struct *curr = rq->curr;
 4     struct sched_entity *se = &curr->se, *pse = &p->se;
 5     struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 6     int scale = cfs_rq->nr_running >= sched_nr_latency;
 7     int next_buddy_marked = 0;
 8 
 9     if (unlikely(se == pse))
10         return;
11 
12     /*
13      * This is possible from callers such as move_task(), in which we
14      * unconditionally check_prempt_curr() after an enqueue (which may have
15      * lead to a throttle).  This both saves work and prevents false
16      * next-buddy nomination below.
17      */
18     if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
19         return;
20 
21     if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
22         set_next_buddy(pse);
23         next_buddy_marked = 1;
24     }
25 
26     /*
27      * We can come here with TIF_NEED_RESCHED already set from new task
28      * wake up path.
29      *
30      * Note: this also catches the edge-case of curr being in a throttled
31      * group (e.g. via set_curr_task), since update_curr() (in the
32      * enqueue of curr) will have resulted in resched being set.  This
33      * prevents us from potentially nominating it as a false LAST_BUDDY
34      * below.
35      */
36     if (test_tsk_need_resched(curr))
37         return;
38 
39     /* Idle tasks are by definition preempted by non-idle tasks. */
40     if (unlikely(curr->policy == SCHED_IDLE) &&
41         likely(p->policy != SCHED_IDLE))
42         goto preempt;
43 
44     /*
45      *  do not preempt non-idle tasks (their preemption
46      * is driven by the tick):
47      */
48     if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
49         return;
50 
51     find_matching_se(&se, &pse);
52     update_curr(cfs_rq_of(se));
53     BUG_ON(!pse);
54     if (wakeup_preempt_entity(se, pse) == 1) {
55         /*
56          * Bias pick_next to pick the sched entity that is
57          * triggering this preemption.
58          */
59         if (!next_buddy_marked)
60             set_next_buddy(pse);
61         goto preempt;
62     }
63 
64     return;
65 
66 preempt:
67     resched_task(curr);
68     /*
69      * Only set the backward buddy when the current task is still
70      * on the rq. This can happen when a wakeup gets interleaved
71      * with schedule on the ->pre_schedule() or idle_balance()
72      * point, either of which can * drop the rq lock.
73      *
74      * Also, during early boot the idle thread is in the fair class,
75      * for obvious reasons its a bad idea to schedule back to it.
76      */
77     if (unlikely(!se->on_rq || curr == rq->idle))
78         return;
79 
80     if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
81         set_last_buddy(se);
82 }

第21-24行，如果开启了NEXT_BUDDY并且唤醒的进程不是新进程（而是睡眠进程），那么第22行就将cfs_rq的next指针指向唤醒的进程，并且设置标记。第36行如果当前进程可以被抢占，函数直接返回。否则，第40-42行如果当前进程是空闲进程并且被唤醒的进程不是空闲进程，则跳到preempt处，设置need_resched标志，完成抢占设置。第48行，如果被唤醒进程是空闲进程或者批处理进程，直接返回，这些进程不能抢占别的进程。第51行如果当前进程和被唤醒进程不在同一级别（同一个调度组），则寻找它们的祖先parent，把它们调整到同一级别，才能进行虚拟运行时间的比较，进而决定能否抢占。第54行，对当前进程和被唤醒进程的虚拟运行时间进行比较，可以抢占的话（唤醒进程的虚拟时间小于当前进程）执行if体，跳到preempt处完成抢占。否则所有都不满足的话，当前进程不能被抢占，执行第64行返回，随后分析该函数。第80-81行如果开启了LAST_BUDDY，就将cfs_rq的last指针指向唤醒进程的进程。在pick_next_entity函数中，next和last所指的进程会先于就绪队列的left进程被选择。下面分析下wakeup_preempt_entity函数（kernel/sched/fair.c）。

 1 static int
 2 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 3 {
 4     s64 gran, vdiff = curr->vruntime - se->vruntime;
 5 
 6     if (vdiff <= 0)
 7         return -1;
 8 
 9     gran = wakeup_gran(curr, se);
10     if (vdiff > gran)
11         return 1;
12 
13     return 0;
14 }

该函数是要保证在se实体在抢占curr实体时，curr实体已经运行过一段时间（具体而言，物理时间1ms），第9行wakeup_gran函数是将sysctl_sched_wakeup_granularity的值（1ms）转换成se实体的虚拟时间，保存在gran中，第10行比较vdiff和gran大小，实际上是比较curr->vruntime 和 se->vruntime+gran，因此就是想让curr实体多执行gran时间，才能被抢占。

最后我们再分析下 try_to_wake_up函数中第40行遗留的那个函数指针task_waking，该指针指向了task_waking_fair函数，代码如下（kernel/sched/fair.c）：

 1 static void task_waking_fair(struct task_struct *p)
 2 {
 3     struct sched_entity *se = &p->se;
 4     struct cfs_rq *cfs_rq = cfs_rq_of(se);
 5     u64 min_vruntime;
 6 
 7 #ifndef CONFIG_64BIT
 8     u64 min_vruntime_copy;
 9 
10     do {
11         min_vruntime_copy = cfs_rq->min_vruntime_copy;
12         smp_rmb();
13         min_vruntime = cfs_rq->min_vruntime;
14     } while (min_vruntime != min_vruntime_copy);
15 #else
16     min_vruntime = cfs_rq->min_vruntime;
17 #endif
18 
19     se->vruntime -= min_vruntime;
20     record_wakee(p);
21 }

该函数完成对睡眠进程的虚拟时间补偿。考虑到睡眠时间长时间没有运行，因此第19行从唤醒进程se的已运行虚拟时间中减去就绪队列的最小虚拟时间，做点补偿，让其可以多运行一点时间。

8.新进程的处理函数（kernel/sched/fair.c）：

 1 static void task_fork_fair(struct task_struct *p)
 2 {
 3     struct cfs_rq *cfs_rq;
 4     struct sched_entity *se = &p->se, *curr;
 5     int this_cpu = smp_processor_id();
 6     struct rq *rq = this_rq();
 7     unsigned long flags;
 8 
 9     raw_spin_lock_irqsave(&rq->lock, flags);
10 
11     update_rq_clock(rq);
12 
13     cfs_rq = task_cfs_rq(current);
14     curr = cfs_rq->curr;
15 
16     /*
17      * Not only the cpu but also the task_group of the parent might have
18      * been changed after parent->se.parent,cfs_rq were copied to
19      * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
20      * of child point to valid ones.
21      */
22     rcu_read_lock();
23     __set_task_cpu(p, this_cpu);
24     rcu_read_unlock();
25 
26     update_curr(cfs_rq);
27 
28     if (curr)
29         se->vruntime = curr->vruntime;
30     place_entity(cfs_rq, se, 1);
31 
32     if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
33         /*
34          * Upon rescheduling, sched_class::put_prev_task() will place
35          * 'current' within the tree based on its new key value.
36          */
37         swap(curr->vruntime, se->vruntime);
38         resched_task(rq->curr);
39     }
40 
41     se->vruntime -= cfs_rq->min_vruntime;
42 
43     raw_spin_unlock_irqrestore(&rq->lock, flags);
44 }

该函数在do_fork--->copy_process函数中调用，用来设置新创建进程的虚拟时间信息。第5行获取当前的cpu号，第23行为新进程设置该cpu号。第29行将当前进程（父进程）的虚拟运行时间拷贝给新进程（子进程）。第30行的place_entity函数完成新进程的“时间片”计算以及虚拟时间惩罚，之后将新进程加入红黑数中，待会分析。第32行如果设置了子进程先于父进程运行的标志并且当前进程不为空且当前进程已运行的虚拟时间比新进程小，则执行if体，第37行交换当前进程和新进程的虚拟时间（新进程的虚拟时间变小，就排在了红黑树的左侧，当前进程之前，下次就能被调度），第38行设置重新调度标志。第41行给新进程的虚拟运行时间减去队列的最小虚拟时间来做一点补偿（因为在上边的place_entity函数中给新进程的虚拟时间加了一次min_vruntime，所以在这里要减去），再来看看place_entity函数（kernel/sched/fair.c）：

 1 static void
 2 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 3 {
 4     u64 vruntime = cfs_rq->min_vruntime;
 5 
 6     /*
 7      * The 'current' period is already promised to the current tasks,
 8      * however the extra weight of the new task will slow them down a
 9      * little, place the new task so that it fits in the slot that
10      * stays open at the end.
11      */
12     if (initial && sched_feat(START_DEBIT))
13         vruntime += sched_vslice(cfs_rq, se);
14 
15     /* sleeps up to a single latency don't count. */
16     if (!initial) {
17         unsigned long thresh = sysctl_sched_latency;
18 
19         /*
20          * Halve their sleep time's effect, to allow
21          * for a gentler effect of sleepers:
22          */
23         if (sched_feat(GENTLE_FAIR_SLEEPERS))
24             thresh >>= 1;
25 
26         vruntime -= thresh;
27     }
28 
29     /* ensure we never gain time by being placed backwards. */
30     se->vruntime = max_vruntime(se->vruntime, vruntime);
31 }

该函数完成新进程的“时间片”计算和虚拟时间惩罚，并且将新进程加入就绪队列。第4行将就绪队列的min_vruntime值存入vruntime中，第12-13行，如果initial标志为1的话（说明当前计算的是新进程的时间），将计算出的新进程的虚拟时间片累加到vruntime中，累加到原因是调度系统要保证先把就绪队列中的所有的进程执行一遍之后才能执行新进程，一会具体解释。第16-17行，如果当前计算的不是新进程（睡眠的进程），把一个延迟周期的长度sysctl_sched_latency（6ms）赋给thresh，第24行thresh减半，第26行睡眠进程的虚拟运行时间减去减半后的thresh，因为睡眠进程好长时间未运行，因此要进行虚拟时间补偿，把它已运行的虚拟时间减小一点，使得它能多运行一会。第30行将设置好的虚拟时间保存到进程调度实体的vruntime域。下面解释下为什么要对新进程进行虚拟时间惩罚，其实原因只有一个，就是调度系统要保证将就绪队列中现有的进程执行一遍之后再执行新进程，那么就必须使新进程的 vruntime=cfs_rq->min_vruntime+新进程的虚拟时间片，才能使得新进程插入到红黑树的右边，最后参与调度，不然无法保证所有进程在新进程之前执行。

最后，分析下和调度相关的这些函数执行的时机

前面在介绍这些函数的时候，基本上都提到了会在哪里调用这些函数，最后，我们再系统总结一下：

进程调度分为两个部分：一是进程信息的修改，二是进程切换。进程切换只有一个函数schedule，schedule的运行时机我们最后分析。其它函数的运行时机如下：

1.scheduler_tick函数是在每个时钟中断中被调用，用来更新当前进程运行的时间。

2.effective_prio函数是在创建一个新进程或者用户使用nice系统调用设置进程的优先级时调用，用来设置进程的在内核中优先级（不同于nice值）。

3.set_load_weight函数也是在创建新进程或者用户使用nice()设置进程的优先级时调用，用来设置进程的权重。该函数和2中的函数其实是配套使用的，当设置或者改变了一个进程的优先级时，要么就要为这个进程设置或者改变该优先级对应的权重。

4.sched_slice函数主要是在scheduler_tick->...->check_preempt_tick中调用（别的地方也有），因此也是每个时钟周期调用一次，用来计算当前进程应该执行的“时间片”，进而才能判断进程是否已经超出它的时间片，超出的话就要设置抢占标志，切换别的进程。

5.pick_next_task_fair函数schedule中调用，用来选择下一个要被调度的进程，然后才能切换进程。它的执行时机就是schedule的执行时机，随后分析。

6.__enqueue_entity/__dequeue_entity函数是在需要入就绪队列或者出就绪队列的地方被调用，因此使用它们的地方较多，比如schedule中调用（间接调用），就不一一分析了。

7.try_to_wake_up/wake_up_new_task函数，前者在进程所等待的资源满足时被调用（一般在中断内调用）；后者是在创建好新进程后被调用。都是用来唤醒进程的，前者唤醒睡眠进程，后者唤醒新进程并将进程加入就绪队列。

8.task_fork_fair函数也是在新进程被创建好后调用，用来设置新进程的“时间片”等信息，设置好以后新进程就可以被唤醒了。所以该函数和wake_up_new_task函数调用时机是一样的。

最后，我们分析schedule函数的调用时机。该函数是唯一实现进程切换的函数。

在分析schedule函数的调用时机之前，我们先为大家介绍下“内核控制路径“的概念。所谓内核控制路径，就是由中断或者异常引出的执行路径，说白了，执行中断或者异常处理程序时就处在内核控制路径中，此时代表的也是当前进程。内核控制路径可以嵌套（也就是可以嵌套中断），但是无论嵌套多少，最终都要回到当前进程中，也就是说要从内核控制路径中返回，不可以在内核控制路径中进行进程切换（因此中断处理程序中不允许调用能引起进程睡眠的函数）。说到底，内核要么处在进程中，要么处在内核控制路径中（其实内核控制路径也代表当前进程，因为其有特殊性，所以我们单列出来谈），不会再有别的什么路径了。那么进程切换的时机，或者说schedule函数被调用的时机，也就只可能发生于上述两种路径中。那么，1.当在进程中调用schedule函数时（就是ULK这本书上说的直接调用），表明当前进程因为等待资源或者别的原因需要挂起，主动放弃使用cpu，调用schedule函数切换到别的进程中；2.当在内核控制路径中调用schedule函数时（上边说过了，内核控制路径中不允许切换进程），其实是在内核控制路径返回进程时调用（该时机就是ULK上说的延迟调用），说明有更重要的进程等待执行，需要抢占当前进程，因此在中断处理程序/异常处理程序返回时都要去检查当前进程能否被抢占，可以抢占的话就要调用schedule函数进行进程切换，包括从系统调用中返回用户空间时也要检查（这是统一的，因为系统调用本身也是异常，因此从系统调用中返回相当于从异常处理程序中返回，通过系统调用进入到内核态也可以说是内核控制路径，但是一般不这么叫）当前进程的抢占标志，能发生抢占的话就要去调用schedule函数。至此，进程切换的时机就分析完了。很好记的，要么是进程上下文发生进程切换（主动调用schedule），要么是从中断返回时切换，因此每次中断返回时必须要检查能否发生抢占，包括从系统调用返回也属于这种情形。

至此，进程调度机制咱们就分析完了（其实只分析了CFS调度）。实时进程调度以后再分析！

参考书籍：《深入理解linux内核》

　　　　　《深入理解linux内核架构》

参考文章：blog.csdn.net/wudongxu/article/details/8574737

　　　　　blog.csdn.net/dog250/article/details/5302869

　　　　 chxxxyg.blog.163.com/blog/static/1502811932012912546208/