Linux Kernel Development Study Notes (Chapter 3): Processes

Process Management

Process: a program in the midst of execution.

Thread: an object of activity within a process.

Virtualization mechanisms

Virtualized processor: multiple processes share one processor, each under the illusion that it has the processor to itself.

Virtual memory: the threads within a process share that process's virtual memory.

I. The Process Descriptor and the Task Structure

The kernel keeps processes on a circular doubly linked list called the task list. Each element of the list is a task_struct, known as the process descriptor, defined in the header <linux/sched.h>.

struct task_struct {
    volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
    void *stack;
    atomic_t usage;
    unsigned int flags; /* per process flags, defined below */
    unsigned int ptrace;

#ifdef CONFIG_SMP
    struct task_struct *wake_entry;
    int on_cpu;
#endif
    int on_rq;

    int prio, static_prio, normal_prio;
    unsigned int rt_priority;
    const struct sched_class *sched_class;
    struct sched_entity se;
    struct sched_rt_entity rt;

#ifdef CONFIG_PREEMPT_NOTIFIERS
    /* list of struct preempt_notifier: */
    struct hlist_head preempt_notifiers;
#endif

    /*
     * fpu_counter contains the number of consecutive context switches
     * that the FPU is used. If this is over a threshold, the lazy fpu
     * saving becomes unlazy to save the trap. This is an unsigned char
     * so that after 256 times the counter wraps and the behavior turns
     * lazy again; this to deal with bursty apps that only use FPU for
     * a short time
     */
    unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
    unsigned int btrace_seq;
#endif

    unsigned int policy;
    cpumask_t cpus_allowed;

#ifdef CONFIG_PREEMPT_RCU
    int rcu_read_lock_nesting;
    char rcu_read_unlock_special;
#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU)
    int rcu_boosted;
#endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */
    struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TREE_PREEMPT_RCU
    struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
    struct rt_mutex *rcu_boost_mutex;
#endif /* #ifdef CONFIG_RCU_BOOST */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
    struct sched_info sched_info;
#endif

    struct list_head tasks;
#ifdef CONFIG_SMP
    struct plist_node pushable_tasks;
#endif

    struct mm_struct *mm, *active_mm;
#ifdef CONFIG_COMPAT_BRK
    unsigned brk_randomized:1;
#endif
#if defined(SPLIT_RSS_COUNTING)
    struct task_rss_stat    rss_stat;
#endif
/* task state */
    int exit_state;
    int exit_code, exit_signal;
    int pdeath_signal;  /*  The signal sent when the parent dies  */
    unsigned int group_stop;    /* GROUP_STOP_*, siglock protected */
    /* ??? */
    unsigned int personality;
    unsigned did_exec:1;
    unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
                 * execve */
    unsigned in_iowait:1;


    /* Revert to default priority/policy when forking */
    unsigned sched_reset_on_fork:1;
    unsigned sched_contributes_to_load:1;

    pid_t pid;
    pid_t tgid;

#ifdef CONFIG_CC_STACKPROTECTOR
    /* Canary value for the -fstack-protector gcc feature */
    unsigned long stack_canary;
#endif

    /* 
     * pointers to (original) parent process, youngest child, younger sibling,
     * older sibling, respectively.  (p->father can be replaced with 
     * p->real_parent->pid)
     */
    struct task_struct *real_parent; /* real parent process */
    struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
    /*
     * children/sibling forms the list of my natural children
     */
    struct list_head children;  /* list of my children */
    struct list_head sibling;   /* linkage in my parent's children list */
    struct task_struct *group_leader;   /* threadgroup leader */

    /*
     * ptraced is the list of tasks this task is using ptrace on.
     * This includes both natural children and PTRACE_ATTACH targets.
     * p->ptrace_entry is p's link on the p->parent->ptraced list.
     */
    struct list_head ptraced;
    struct list_head ptrace_entry;

    /* PID/PID hash table linkage. */
    struct pid_link pids[PIDTYPE_MAX];
    struct list_head thread_group;

    struct completion *vfork_done;      /* for vfork() */
    int __user *set_child_tid;      /* CLONE_CHILD_SETTID */
    int __user *clear_child_tid;        /* CLONE_CHILD_CLEARTID */

    cputime_t utime, stime, utimescaled, stimescaled;
    cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
    cputime_t prev_utime, prev_stime;
#endif
    unsigned long nvcsw, nivcsw; /* context switch counts */
    struct timespec start_time;         /* monotonic time */
    struct timespec real_start_time;    /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
    unsigned long min_flt, maj_flt;

    struct task_cputime cputime_expires;
    struct list_head cpu_timers[3];
/* process credentials */
    const struct cred __rcu *real_cred; /* objective and real subjective task
                     * credentials (COW) */
    const struct cred __rcu *cred;  /* effective (overridable) subjective task
                     * credentials (COW) */
    struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */

    char comm[TASK_COMM_LEN]; /* executable name excluding path
                     - access with [gs]et_task_comm (which lock
                       it with task_lock())
                     - initialized normally by setup_new_exec */
/* file system info */
    int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
    struct sysv_sem sysvsem;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
    unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
    struct thread_struct thread;
/* filesystem information */
    struct fs_struct *fs;
/* open file information */
    struct files_struct *files;
/* namespaces */
    struct nsproxy *nsproxy;
/* signal handlers */
    struct signal_struct *signal;
    struct sighand_struct *sighand;

    sigset_t blocked, real_blocked;
    sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
    struct sigpending pending;

    unsigned long sas_ss_sp;
    size_t sas_ss_size;
    int (*notifier)(void *priv);
    void *notifier_data;
    sigset_t *notifier_mask;
    struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
    uid_t loginuid;
    unsigned int sessionid;
#endif
    seccomp_t seccomp;

/* Thread group tracking */
    u32 parent_exec_id;
    u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
 * mempolicy */
    spinlock_t alloc_lock;

#ifdef CONFIG_GENERIC_HARDIRQS
    /* IRQ handler threads */
    struct irqaction *irqaction;
#endif

    /* Protection of the PI data structures: */
    raw_spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
    /* PI waiters blocked on a rt_mutex held by this task */
    struct plist_head pi_waiters;
    /* Deadlock detection and priority inheritance handling */
    struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    /* mutex deadlock detection */
    struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    unsigned int irq_events;
    unsigned long hardirq_enable_ip;
    unsigned long hardirq_disable_ip;
    unsigned int hardirq_enable_event;
    unsigned int hardirq_disable_event;
    int hardirqs_enabled;
    int hardirq_context;
    unsigned long softirq_disable_ip;
    unsigned long softirq_enable_ip;
    unsigned int softirq_disable_event;
    unsigned int softirq_enable_event;
    int softirqs_enabled;
    int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
    u64 curr_chain_key;
    int lockdep_depth;
    unsigned int lockdep_recursion;
    struct held_lock held_locks[MAX_LOCK_DEPTH];
    gfp_t lockdep_reclaim_gfp;
#endif

/* journalling filesystem info */
    void *journal_info;

/* stacked block device info */
    struct bio_list *bio_list;

#ifdef CONFIG_BLOCK
/* stack plugging */
    struct blk_plug *plug;
#endif

/* VM state */
    struct reclaim_state *reclaim_state;

    struct backing_dev_info *backing_dev_info;

    struct io_context *io_context;

    unsigned long ptrace_message;
    siginfo_t *last_siginfo; /* For ptrace use.  */
    struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
    u64 acct_rss_mem1;  /* accumulated rss usage */
    u64 acct_vm_mem1;   /* accumulated virtual memory usage */
    cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
    nodemask_t mems_allowed;    /* Protected by alloc_lock */
    int mems_allowed_change_disable;
    int cpuset_mem_spread_rotor;
    int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
    /* Control Group info protected by css_set_lock */
    struct css_set __rcu *cgroups;
    /* cg_list protected by css_set_lock and tsk->alloc_lock */
    struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
    struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
    struct compat_robust_list_head __user *compat_robust_list;
#endif
    struct list_head pi_state_list;
    struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
    struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
    struct mutex perf_event_mutex;
    struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *mempolicy;    /* Protected by alloc_lock */
    short il_next;
    short pref_node_fork;
#endif
    atomic_t fs_excl;   /* holding fs exclusive resources */
    struct rcu_head rcu;

    /*
     * cache last used pipe for splice
     */
    struct pipe_inode_info *splice_pipe;
#ifdef  CONFIG_TASK_DELAY_ACCT
    struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
    int make_it_fail;
#endif
    struct prop_local_single dirties;
#ifdef CONFIG_LATENCYTOP
    int latency_record_count;
    struct latency_record latency_record[LT_SAVECOUNT];
#endif
    /*
     * time slack values; these are used to round up poll() and
     * select() etc timeout values. These are in nanoseconds.
     */
    unsigned long timer_slack_ns;
    unsigned long default_timer_slack_ns;

    struct list_head    *scm_work_list;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
    /* Index of current stored address in ret_stack */
    int curr_ret_stack;
    /* Stack of return addresses for return function tracing */
    struct ftrace_ret_stack *ret_stack;
    /* time stamp for last schedule */
    unsigned long long ftrace_timestamp;
    /*
     * Number of functions that haven't been traced
     * because of depth overrun.
     */
    atomic_t trace_overrun;
    /* Pause for the tracing */
    atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
    /* state flags for use by tracers */
    unsigned long trace;
    /* bitmask and counter of trace recursion */
    unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
    struct memcg_batch_info {
        int do_batch;   /* incremented when batch uncharge started */
        struct mem_cgroup *memcg; /* target memcg of uncharge */
        unsigned long nr_pages; /* uncharged usage */
        unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
    } memcg_batch;
#endif
#ifdef CONFIG_HAVE_HW_BREAKPOINT
    atomic_t ptrace_bp_refcnt;
#endif
};
task_struct

The task_struct is allocated by the slab allocator. Since 2.6 the slab allocator creates the task_struct dynamically, so the kernel stack itself only needs to hold a small new structure, struct thread_info, placed at the bottom of the stack (for stacks that grow down) or at the top (for stacks that grow up).

It is declared in the header <asm/thread_info.h>; after some digging, the ARM version turns up in arch/arm/include/asm/thread_info.h:

/*
 * low level task data that entry.S needs immediate access to.
 * __switch_to() assumes cpu_context follows immediately after cpu_domain.
 */
struct thread_info {
    unsigned long       flags;      /* low level flags */
    int         preempt_count;  /* 0 => preemptable, <0 => bug */
    mm_segment_t        addr_limit; /* address limit */
    struct task_struct  *task;      /* main task structure */
    struct exec_domain  *exec_domain;   /* execution domain */
    __u32           cpu;        /* cpu */
    __u32           cpu_domain; /* cpu domain */
    struct cpu_context_save cpu_context;    /* cpu context */
    __u32           syscall;    /* syscall number */
    __u8            used_cp[16];    /* thread used copro */
    unsigned long       tp_value;
    struct crunch_state crunchstate;
    union fp_state      fpstate __attribute__((aligned(8)));
    union vfp_state     vfpstate;
#ifdef CONFIG_ARM_THUMBEE
    unsigned long       thumbee_state;  /* ThumbEE Handler Base register */
#endif
    struct restart_block    restart_block;
};
thread_info

At the end of the kernel stack sits the thread_info; its task field holds a pointer to that task's actual task_struct.
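
Because thread_info always sits at a fixed, aligned position at the stack's end, the kernel can locate it from the stack pointer alone and reach the task_struct through it. Below is a minimal sketch of the idea, assuming THREAD_SIZE-aligned kernel stacks and ARM-style inline assembly; the real definitions live in each architecture's headers:

/*
 * Sketch only: mask off the low bits of the stack pointer to find the
 * thread_info at the start of the THREAD_SIZE-aligned kernel stack,
 * then follow its task field to reach the task_struct.
 */
static inline struct thread_info *current_thread_info(void)
{
    unsigned long sp;
    asm ("mov %0, sp" : "=r" (sp));     /* ARM: read the stack pointer */
    return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}

#define get_current() (current_thread_info()->task)
#define current get_current()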

II. The Process Descriptor

The PID is a number, of type pid_t, which in practice is an int. The kernel stores each process's PID in its process descriptor.

The maximum PID (32,768 by default) can be raised by editing <linux/threads.h>, or at runtime by writing to /proc/sys/kernel/pid_max.

2.1 Process States

TASK_RUNNING: the process is runnable, either currently executing or waiting on a run queue.

TASK_INTERRUPTIBLE: the process is sleeping (blocked), waiting for some condition; it wakes when the condition becomes true or when it receives a signal.

TASK_UNINTERRUPTIBLE: like TASK_INTERRUPTIBLE, except the process does not wake up even if it receives a signal.

__TASK_TRACED: the process is being traced by another process, such as a debugger via ptrace.

__TASK_STOPPED: process execution has stopped; the task is neither running nor eligible to run.

2.2 Setting the Process State

The kernel adjusts a given process's state through a function call:

set_task_state(task, state);        /* set the state of task 'task' to 'state' */
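
For the running task itself, set_current_state(state) is equivalent to set_task_state(current, state). Here is a sketch of the usual sleep idiom built on it; 'condition' is a placeholder for whatever event the task is waiting on:

set_current_state(TASK_INTERRUPTIBLE);   /* mark ourselves as sleeping */
while (!condition) {
    schedule();                          /* yield the CPU until woken */
    set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);       /* awake and runnable again */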

Process context: when a program executes a system call or triggers an exception, it enters kernel space. The kernel is then said to be executing on behalf of the process and is in process context.

The process family tree: all processes that share the same parent are called siblings.

Every task_struct contains a pointer to the parent's task_struct, named parent, and a list of child processes, named children.

/* obtain the process descriptor of the parent */
struct task_struct *my_parent = current->parent;

/* iterate over the children in turn */
struct task_struct *task;
struct list_head *list;
list_for_each(list, &current->children) {
    task = list_entry(list, struct task_struct, sibling);
    /* task now points to one of the current process's children */
}

/* walk up the process hierarchy to init */
struct task_struct *task;
for(task = current; task != &init_task; task = task->parent)
    ;

/* traverse every process in the system */
list_entry(task->tasks.next, struct task_struct, tasks)
list_entry(task->tasks.prev, struct task_struct, tasks)

struct task_struct *task;
for_each_process(task) {
    /* print the name and PID of every task */
    printk("%s[%d]\n", task->comm, task->pid);
}
/* note: iterating over every task this way is expensive */
Traversing the task list
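
For reference, the two list_entry() expressions above are exactly what the kernel's next_task() and prev_task() helpers expand to, and for_each_process() is built on top of them:

#define next_task(p) \
    list_entry((p)->tasks.next, struct task_struct, tasks)
#define prev_task(p) \
    list_entry((p)->tasks.prev, struct task_struct, tasks)

#define for_each_process(p) \
    for (p = &init_task; (p = next_task(p)) != &init_task; )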

III. Process Creation

First, fork() creates a child process by copying the current process. The child differs from the parent only in its PID, its PPID, and certain resources and statistics (such as pending signals) that are not inherited.

exec() then reads an executable file, loads it into the address space, and starts running it.

Copy-on-write: a technique that delays, or avoids altogether, the copying of data. Rather than duplicating the whole process address space, parent and child share a single copy.

Data is copied only when it is written to; until then it is shared read-only.
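
The effect is easy to observe from user space. A minimal demo (the variable name and messages are made up for illustration): the child's write forces a private copy of the page, so the parent's value is untouched.

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int x = 1;
    pid_t pid = fork();      /* child starts with a COW view of the parent's pages */

    if (pid == 0) {
        x = 2;               /* first write: the kernel copies the page for the child */
        printf("child  %d: x = %d\n", (int)getpid(), x);
        _exit(0);
    } else if (pid > 0) {
        waitpid(pid, NULL, 0);
        printf("parent %d: x = %d\n", (int)getpid(), x);   /* still prints 1 */
    } else {
        perror("fork");
        return 1;
    }
    return 0;
}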

3.1 fork()

Linux implements fork() via the clone() system call. do_fork(), defined in kernel/fork.c, does most of the work of creation.

That function calls copy_process() and then starts the new process running.

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Do some preliminary argument and permissions checking before we
     * actually start allocating stuff
     */
    if (clone_flags & CLONE_NEWUSER) {
        if (clone_flags & CLONE_THREAD)
            return -EINVAL;
        /* hopefully this check will go away when userns support is
         * complete
         */
        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
                !capable(CAP_SETGID))
            return -EPERM;
    }

    /*
     * When called from kernel_thread, don't do user tracing stuff.
     */
    if (likely(user_mode(regs)))
        trace = tracehook_prepare_clone(clone_flags);

    /* 1) call copy_process(), then set the new process running */

    p = copy_process(clone_flags, stack_start, regs, stack_size,
             child_tidptr, NULL, trace);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;

        trace_sched_process_fork(current, p);

        nr = task_pid_vnr(p);

        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        audit_finish_fork(p);
        tracehook_report_clone(regs, clone_flags, nr, p);

        /*
         * We set PF_STARTING at creation in case tracing wants to
         * use this to distinguish a fully live task from one that
         * hasn't gotten to tracehook_report_clone() yet.  Now we
         * clear it and set the child going.
         */
        p->flags &= ~PF_STARTING;

        wake_up_new_task(p);

        tracehook_report_clone_complete(trace, regs,
                        clone_flags, nr, p);

        if (clone_flags & CLONE_VFORK) {
            freezer_do_not_count();
            wait_for_completion(&vfork);
            freezer_count();
            tracehook_report_vfork_done(p, nr);
        }
    } else {
        nr = PTR_ERR(p);
    }
    return nr;
}


/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    struct pt_regs *regs,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{
    int retval;
    struct task_struct *p;
    int cgroup_callbacks_done = 0;

    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    /*
     * Siblings of global init remain as zombies on exit since they are
     * not reaped by their parent (swapper). To solve this and to avoid
     * multi-rooted process trees, prevent global and container-inits
     * from creating siblings.
     */
    if ((clone_flags & CLONE_PARENT) &&
                current->signal->flags & SIGNAL_UNKILLABLE)
        return ERR_PTR(-EINVAL);

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    /* 1) dup_task_struct() creates a kernel stack, a thread_info structure and
     * a task_struct for the new process, with values identical to those of the
     * current process. At this point the child and parent descriptors are the same.
     */
    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    ftrace_graph_init_task(p);

    rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    
    /* The first check below: has the user exceeded their allowed number of
     * processes? The second: does the user hold the privileges needed to
     * exceed that limit?
     */
    retval = -EAGAIN;
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
            p->real_cred->user != INIT_USER)
            goto bad_fork_free;
    }

    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     * 2) verify that, counting this new child, the number of processes owned
     * by the current user does not exceed its allotted resource limit
     */
    retval = -EAGAIN;
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    /* make sure the exec domain's module is loaded (takes a reference on it) */
    if (!try_module_get(task_thread_info(p)->exec_domain->module))      
        goto bad_fork_cleanup_count;

    p->did_exec = 0;
    delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
    /* 5) copy_flags() updates the flags member of the task_struct:
     * PF_SUPERPRIV, which marks whether the process used superuser privileges,
     * is cleared, and PF_FORKNOEXEC, which marks that the process has not yet
     * called exec(), is set.
     */
    copy_flags(clone_flags, p);
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    rcu_copy_process(p);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    init_sigpending(&p->pending);

    p->utime = cputime_zero;
    p->stime = cputime_zero;
    p->gtime = cputime_zero;
    p->utimescaled = cputime_zero;
    p->stimescaled = cputime_zero;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
    p->prev_utime = cputime_zero;
    p->prev_stime = cputime_zero;
#endif
#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

    p->default_timer_slack_ns = current->timer_slack_ns;

    task_io_accounting_init(&p->ioac);
    acct_clear_integrals(p);

    posix_cpu_timers_init(p);

    do_posix_clock_monotonic_gettime(&p->start_time);
    p->real_start_time = p->start_time;
    monotonic_to_bootbased(&p->real_start_time);
    p->io_context = NULL;
    p->audit_context = NULL;
    if (clone_flags & CLONE_THREAD)
        threadgroup_fork_read_lock(current);
    cgroup_fork(p);
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
     if (IS_ERR(p->mempolicy)) {
         retval = PTR_ERR(p->mempolicy);
         p->mempolicy = NULL;
         goto bad_fork_cleanup_cgroup;
     }
    mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
    p->hardirqs_enabled = 1;
#else
    p->hardirqs_enabled = 0;
#endif
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    p->memcg_batch.do_batch = 0;
    p->memcg_batch.memcg = NULL;
#endif

    /* Perform scheduler related setup. Assign this task to a CPU. */
    sched_fork(p);

    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;

    if ((retval = audit_alloc(p)))
        goto bad_fork_cleanup_policy;
    
    /* In the calls below, resources are either duplicated for the child or
     * shared with the parent, depending on which clone flags are set.
     */
    /* copy all the process information */
    if ((retval = copy_semundo(clone_flags, p)))
        goto bad_fork_cleanup_audit;
    if ((retval = copy_files(clone_flags, p)))
        goto bad_fork_cleanup_semundo;
    if ((retval = copy_fs(clone_flags, p)))
        goto bad_fork_cleanup_files;
    if ((retval = copy_sighand(clone_flags, p)))
        goto bad_fork_cleanup_fs;
    if ((retval = copy_signal(clone_flags, p)))
        goto bad_fork_cleanup_sighand;
    if ((retval = copy_mm(clone_flags, p)))
        goto bad_fork_cleanup_signal;
    if ((retval = copy_namespaces(clone_flags, p)))
        goto bad_fork_cleanup_mm;
    if ((retval = copy_io(clone_flags, p)))
        goto bad_fork_cleanup_namespaces;

    /* set up the child's kernel stack and register contents */
    retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_io;

    /* 6) allocate a valid PID for the new process */
    if (pid != &init_struct_pid) {
        retval = -ENOMEM;
        pid = alloc_pid(p->nsproxy->pid_ns);
        if (!pid)
            goto bad_fork_cleanup_io;
    }

    p->pid = pid_nr(pid);
    p->tgid = p->pid;
    if (clone_flags & CLONE_THREAD)
        p->tgid = current->tgid;

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_all_latency_tracing(p);

    /* ok, now we should be set up.. */
    p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
    p->pdeath_signal = 0;
    p->exit_state = 0;

    /*
     * Ok, make it visible to the rest of the system.
     * We dont wake it up yet.
     */
    p->group_leader = p;
    INIT_LIST_HEAD(&p->thread_group);

    /* Now that the task is set up, run cgroup callbacks if
     * necessary. We need to run them before the task is visible
     * on the tasklist. */
    cgroup_fork_callbacks(p);
    cgroup_callbacks_done = 1;

    /* Need tasklist lock for parent etc handling! */
    write_lock_irq(&tasklist_lock);

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
    } else {
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
    }

    spin_lock(&current->sighand->siglock);

    /*
     * Process group and session signals need to be delivered to just the
     * parent before the fork or both the parent and the child after the
     * fork. Restart if a signal comes in before we add the new process to
     * it's process group.
     * A fatal signal pending means that current will exit, so the new
     * thread can't slip out of an OOM kill (or normal SIGKILL).
      */
    recalc_sigpending();
    if (signal_pending(current)) {
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        retval = -ERESTARTNOINTR;
        goto bad_fork_free_pid;
    }

    if (clone_flags & CLONE_THREAD) {
        current->signal->nr_threads++;
        atomic_inc(&current->signal->live);
        atomic_inc(&current->signal->sigcnt);
        p->group_leader = current->group_leader;
        list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
    }

    /* 8) finish up and return a pointer to the new child */
    if (likely(p->pid)) {
        tracehook_finish_clone(p, clone_flags, trace);

        if (thread_group_leader(p)) {
            if (is_child_reaper(pid))
                p->nsproxy->pid_ns->child_reaper = p;

            p->signal->leader_pid = pid;
            p->signal->tty = tty_kref_get(current->signal->tty);
            attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
            attach_pid(p, PIDTYPE_SID, task_session(current));
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            __this_cpu_inc(process_counts);
        }
        attach_pid(p, PIDTYPE_PID, pid);
        nr_threads++;
    }

    total_forks++;
    spin_unlock(&current->sighand->siglock);
    write_unlock_irq(&tasklist_lock);
    proc_fork_connector(p);
    cgroup_post_fork(p);
    if (clone_flags & CLONE_THREAD)
        threadgroup_fork_read_unlock(current);
    perf_event_fork(p);
    return p;

bad_fork_free_pid:
    if (pid != &init_struct_pid)
        free_pid(pid);
bad_fork_cleanup_io:
    if (p->io_context)
        exit_io_context(p);
bad_fork_cleanup_namespaces:
    exit_task_namespaces(p);
bad_fork_cleanup_mm:
    if (p->mm) {
        task_lock(p);
        if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
            atomic_dec(&p->mm->oom_disable_count);
        task_unlock(p);
        mmput(p->mm);
    }
bad_fork_cleanup_signal:
    if (!(clone_flags & CLONE_THREAD))
        free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
    __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_audit:
    audit_free(p);
bad_fork_cleanup_policy:
    perf_event_free_task(p);
#ifdef CONFIG_NUMA
    mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
    if (clone_flags & CLONE_THREAD)
        threadgroup_fork_read_unlock(current);
    cgroup_exit(p, cgroup_callbacks_done);
    delayacct_tsk_free(p);
    module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
    atomic_dec(&p->cred->user->processes);
    exit_creds(p);
bad_fork_free:
    free_task(p);
fork_out:
    return ERR_PTR(retval);
}


do_fork() and copy_process()

3.2 Creating Threads

Thread creation is similar to process creation, except that clone() is passed flags indicating which resources are to be shared:

clone(CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, 0);
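
For comparison, fork() and vfork() boil down to clone() calls with different flags:

clone(SIGCHLD, 0);                            /* fork() */
clone(CLONE_VFORK | CLONE_VM | SIGCHLD, 0);   /* vfork() */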

These flags are defined in <linux/sched.h>:

CLONE_FILES    Parent and child share open files
CLONE_FS    Parent and child share filesystem information
CLONE_IDLETASK    Set PID to zero (used only by the idle tasks)
CLONE_NEWNS    Create a new namespace for the child
CLONE_PARENT    Child is to have the same parent as its caller
CLONE_PTRACE    Continue tracing the child
CLONE_SETTID    Write the TID back to user space
CLONE_SETTLS    Create a new TLS (thread-local storage) for the child
CLONE_SIGHAND    Parent and child share signal handlers and blocked signals
CLONE_SYSVSEM    Parent and child share System V SEM_UNDO semantics
CLONE_THREAD    Parent and child are placed in the same thread group
CLONE_VFORK    vfork() was used, so the parent will sleep until woken by the child
CLONE_UNTRACED    Do not let the tracing process force CLONE_PTRACE on the child
CLONE_STOP    Start the process in the TASK_STOPPED state
CLONE_CHILD_CLEARTID    Clear the TID in the child
CLONE_CHILD_SETTID    Set the TID in the child
CLONE_PARENT_SETTID    Set the TID in the parent
CLONE_VM    Parent and child share the address space
clone() flags

3.3 Kernel Threads

Kernel threads differ from normal processes in that they have no address space of their own and run only in kernel space; like normal processes, they are schedulable and preemptible.

Running ps -ef on a Linux system shows the kernel threads. The interface is declared in <linux/kthread.h>. A new kernel thread is created with:

struct task_struct *kthread_create(int (*threadfn)(void *data),
    void *data,
    const char namefmt[],
    ...)
threadfn: the function the thread will run
data: the argument passed to that function
namefmt: the thread's name, given as a printf()-style format string with variadic arguments

The newly created thread starts out unrunnable; it will not run until explicitly woken with wake_up_process().

kthread_run() combines the two steps into one call:

struct task_struct *kthread_run(int (*threadfn)(void *data),
    void *data,
    const char namefmt[],
    ...)

Once started, a kernel thread runs until it calls do_exit(), or until some other part of the kernel calls kthread_stop(), passing the task_struct address that was returned at creation:

int kthread_stop(struct task_struct *k)
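
Putting the pieces together, here is a minimal sketch of a kernel thread run from a module; the function name, the thread name "my-worker", and the one-second period are illustrative, not from the book:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *worker;

static int my_thread_fn(void *data)
{
    /* loop until someone calls kthread_stop() on us */
    while (!kthread_should_stop()) {
        /* ... periodic work goes here ... */
        msleep(1000);
    }
    return 0;   /* this return value is what kthread_stop() reports */
}

static int __init my_init(void)
{
    worker = kthread_run(my_thread_fn, NULL, "my-worker");
    return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit my_exit(void)
{
    kthread_stop(worker);   /* blocks until my_thread_fn() returns */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");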

IV. Process Termination

Most of the work of tearing down a task is done by do_exit(), defined in kernel/exit.c.

NORET_TYPE void do_exit(long code)
{
    struct task_struct *tsk = current;
    int group_dead;

    profile_task_exit(tsk);

    WARN_ON(atomic_read(&tsk->fs_excl));
    WARN_ON(blk_needs_flush_plug(tsk));

    if (unlikely(in_interrupt()))
        panic("Aiee, killing interrupt handler!");
    if (unlikely(!tsk->pid))
        panic("Attempted to kill the idle task!");

    /*
     * If do_exit is called because this processes oopsed, it's possible
     * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
     * continuing. Amongst other possible reasons, this is to prevent
     * mm_release()->clear_child_tid() from writing to a user-controlled
     * kernel address.
     */
    set_fs(USER_DS);

    tracehook_report_exit(&code);

    validate_creds_for_do_exit(tsk);

    /*
     * We're taking recursive faults here in do_exit. Safest is to just
     * leave this task alone and wait for reboot.
     */
    if (unlikely(tsk->flags & PF_EXITING)) {
        printk(KERN_ALERT
            "Fixing recursive fault but reboot is needed!\n");
        /*
         * We can do this unlocked here. The futex code uses
         * this flag just to verify whether the pi state
         * cleanup has been done or not. In the worst case it
         * loops once more. We pretend that the cleanup was
         * done as there is no way to return. Either the
         * OWNER_DIED bit is set by now or we push the blocked
         * task into the wait for ever nirwana as well.
         */
        tsk->flags |= PF_EXITPIDONE;
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule();
    }

    exit_irq_thread();

    /* 1) set the PF_EXITING flag in the flags member of the task_struct */
    exit_signals(tsk);  /* sets PF_EXITING */
    /*
     * tsk->flags are checked in the futex code to protect against
     * an exiting task cleaning up the robust pi futexes.
     */
    smp_mb();
    raw_spin_unlock_wait(&tsk->pi_lock);

    if (unlikely(in_atomic()))
        printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
                current->comm, task_pid_nr(current),
                preempt_count());

    acct_update_integrals(tsk);
    /* sync mm's RSS info before statistics gathering */
    if (tsk->mm)
        sync_mm_rss(tsk, tsk->mm);
    /* 2) del_timer_sync() is called to remove any pending kernel timers */
    group_dead = atomic_dec_and_test(&tsk->signal->live);
    if (group_dead) {
        hrtimer_cancel(&tsk->signal->real_timer);
        exit_itimers(tsk->signal);
        if (tsk->mm)
            setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
    }
    acct_collect(code, group_dead);
    if (group_dead)
        tty_audit_exit();
    if (unlikely(tsk->audit_context))
        audit_free(tsk);

    /* 7) store the exit code supplied by exit(), or whatever exit action the
     * kernel mechanism dictated, in the exit_code member of the task_struct;
     * it is kept here for the parent to retrieve */
    tsk->exit_code = code;
    taskstats_exit(tsk, group_dead);

    /* 4) exit_mm() releases the mm_struct held by the process; if no other process is using it, it is freed for good */
    exit_mm(tsk);

    if (group_dead)
        acct_process();
    trace_sched_process_exit(tsk);

    /* 5) if the process is queued waiting on an IPC semaphore, exit_sem() dequeues it */
    exit_sem(tsk);
    /* 6) exit_files() and exit_fs() decrement the usage counts of the file
     * descriptor and filesystem data objects respectively; when a count drops
     * to zero, no process is using that resource and it is released */
    exit_files(tsk);
    exit_fs(tsk);
    check_stack_usage();
    exit_thread();

    /*
     * Flush inherited counters to the parent - before the parent
     * gets woken up by child-exit notifications.
     *
     * because of cgroup mode, must be called before cgroup_exit()
     */
    perf_event_exit_task(tsk);

    cgroup_exit(tsk, 1);

    if (group_dead)
        disassociate_ctty(1);

    module_put(task_thread_info(tsk)->exec_domain->module);

    proc_exit_connector(tsk);

    /*
     * FIXME: do that only when needed, using sched_exit tracepoint
     */
    ptrace_put_breakpoints(tsk);

    /* 8) exit_notify() sends signals to the parent, reparents the task's
     * children to another thread in their thread group or to init, and sets
     * the task's exit state to EXIT_ZOMBIE */
    exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
    task_lock(tsk);
    mpol_put(tsk->mempolicy);
    tsk->mempolicy = NULL;
    task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
    if (unlikely(current->pi_state_cache))
        kfree(current->pi_state_cache);
#endif
    /*
     * Make sure we are holding no locks:
     */
    debug_check_no_locks_held(tsk);
    /*
     * We can do this unlocked here. The futex code uses this flag
     * just to verify whether the pi state cleanup has been done
     * or not. In the worst case it loops once more.
     */
    tsk->flags |= PF_EXITPIDONE;

    if (tsk->io_context)
        exit_io_context(tsk);

    if (tsk->splice_pipe)
        __free_pipe_info(tsk->splice_pipe);

    validate_creds_for_do_exit(tsk);

    preempt_disable();
    exit_rcu();
    /* causes final put_task_struct in finish_task_switch(). */
    tsk->state = TASK_DEAD;
    /* call schedule() to switch to a new process; because a zombie is never
     * scheduled again, this is the last code the task ever executes */
    schedule();
    BUG();
    /* Avoid "noreturn function does return".  */
    for (;;)
        cpu_relax();    /* For when BUG is null */
}
do_exit

4.1 Removing the Process Descriptor

After do_exit() completes, the task_struct is released only once the parent has retrieved the terminated child's information, or has told the kernel that it is not interested.
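
Seen from user space: until the parent reaps the child with wait(), the dead child remains a zombie and its descriptor stays allocated. A minimal illustration:

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();
    if (pid == 0)
        _exit(42);              /* child terminates via the do_exit() path */

    int status;
    waitpid(pid, &status, 0);   /* reaping lets release_task() free the descriptor */
    if (WIFEXITED(status))
        printf("child %d exited with code %d\n", (int)pid, WEXITSTATUS(status));
    return 0;
}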

When the process descriptor is finally to be freed, release_task() is invoked:

void release_task(struct task_struct * p)
{
    struct task_struct *leader;
    int zap_leader;
repeat:
    tracehook_prepare_release_task(p);
    /* don't need to get the RCU readlock here - the process is dead and
     * can't be modifying its own credentials. But shut RCU-lockdep up */
    rcu_read_lock();
    atomic_dec(&__task_cred(p)->user->processes);
    rcu_read_unlock();

    proc_flush_task(p);

    write_lock_irq(&tasklist_lock);
    tracehook_finish_release_task(p);
    /* 1) __exit_signal() calls __unhash_process(), which in turn calls
     * detach_pid() to remove the process from the pidhash and from the task list
     * 2) it also releases any remaining resources the moribund process was
     * using and finalizes statistics and bookkeeping */
    __exit_signal(p);

    /*
     * If we are the last non-leader member of the thread
     * group, and the leader is zombie, then notify the
     * group leader's parent process. (if it wants notification.)
     */
     /* 3) if this is the last non-leader member of the thread group and the
      * leader is already a zombie, release_task() notifies the zombie
      * leader's parent */
    zap_leader = 0;
    leader = p->group_leader;
    if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
        BUG_ON(task_detached(leader));
        do_notify_parent(leader, leader->exit_signal);
        /*
         * If we were the last child thread and the leader has
         * exited already, and the leader's parent ignores SIGCHLD,
         * then we are the one who should release the leader.
         *
         * do_notify_parent() will have marked it self-reaping in
         * that case.
         */
        zap_leader = task_detached(leader);

        /*
         * This maintains the invariant that release_task()
         * only runs on a task in EXIT_DEAD, just for sanity.
         */
        if (zap_leader)
            leader->exit_state = EXIT_DEAD;
    }

    write_unlock_irq(&tasklist_lock);
    /* 4) free the pages containing the process's kernel stack and thread_info,
     * and release the slab cache holding the task_struct */
    release_thread(p);
    call_rcu(&p->rcu, delayed_put_task_struct);

    p = leader;
    if (unlikely(zap_leader))
        goto repeat;
}
release_task
Original post: https://www.cnblogs.com/ch122633/p/9953136.html