Linux内核实现透视---软中断&Tasklet

软中断

首先明确一个概念：这里说的软中断（softirq）不是软件中断（int n 指令）。总的来说，内核在启动时为每一个CPU核创建了一个特殊的内核线程（ksoftirqd），这个线程会不停地检查是否有软中断需要执行，如果有则调用注册的接口函数。这种情况下软中断运行在该内核线程的上下文中；但正如后文所示，软中断也可能在硬中断退出（irq_exit）时直接执行。同一个软中断可能在不同CPU上并发执行。所谓软中断，就是内核利用内核线程配合抽象的数据结构，在合适的时间调用已注册接口的一套软件管理机制。

先看管理软中断的数据结构，因为数据结构最能说明逻辑。内核对软中断抽象的数据结构主要有如下几个部分。

中断服务接口管理

相关数据结构在内核的 kernel/softirq.c 中声明如下：

/*
 * Interrupt status/statistics array with NR_CPUS entries. It is only
 * defined here when the architecture has not provided its own
 * (i.e. __ARCH_IRQ_STAT is not defined).
 */
#ifndef __ARCH_IRQ_STAT
irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
EXPORT_SYMBOL(irq_stat);
#endif

/* Table of softirq handlers: one struct softirq_action slot per softirq number. */
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;


/*
 * One softirq handler slot. It holds only the handler function pointer;
 * the handler receives a pointer to its own slot as argument.
 */
struct softirq_action
{
    void    (*action)(struct softirq_action *);
};

其中的NR_SOFTIRQS由软中断类型的枚举对象提供如下定义：

/*
 * Softirq numbers. Each value is both an index into the handler table and
 * a bit position in the pending bitmask. NR_SOFTIRQS is the last
 * enumerator and therefore equals the number of softirq types.
 */
enum
{
    HI_SOFTIRQ=0,
    TIMER_SOFTIRQ,
    NET_TX_SOFTIRQ,
    NET_RX_SOFTIRQ,
    BLOCK_SOFTIRQ,
    BLOCK_IOPOLL_SOFTIRQ,
    TASKLET_SOFTIRQ,
    SCHED_SOFTIRQ,
    HRTIMER_SOFTIRQ,
    RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */

    NR_SOFTIRQS
};

综上可以知道，内核维护了一个struct softirq_action类型的软中断接口数组，而软中断的状态则由前面的 irq_cpustat_t 类型的数组管理。由定义可以知道状态是和CPU关联的，表示某一个CPU上的软中断状态。下面看看irq_cpustat_t 的定义，也非常简单，主要就是其中的 __softirq_pending成员：这个成员的每一个bit表示一种软中断类型是否悬起（pending），并且低bit对应的软中断类型优先级高。

/*
 * Per-CPU interrupt state. The field that matters for softirqs is
 * __softirq_pending; the remaining fields are counters.
 */
typedef struct {
    unsigned int __softirq_pending; /* bitmask of pending softirqs, one bit per softirq number */
    long idle_timestamp;
    /* Statistics */
    /* Hard interrupt statistics. */
    unsigned int irq_timer_count;
    unsigned int irq_syscall_count;
    unsigned int irq_resched_count;
    unsigned int irq_hv_flush_count;
    unsigned int irq_call_count;
    unsigned int irq_hv_msg_count;
    unsigned int irq_dev_intr_count;

} ____cacheline_aligned irq_cpustat_t;

从后面Tasklet注册软中断的过程可以知道，软中断的注册（open_softirq）就是修改前面定义的softirq_vec数组中对应的表项，这样就完成了软中断的注册。驱动开发人员很少直接使用软中断。

/*
 * Register a softirq handler.
 * @nr:     softirq number, one of the softirq enum values (e.g. TASKLET_SOFTIRQ)
 * @action: handler function invoked when that softirq is processed
 *
 * The return type is spelled out explicitly: implicit int is not valid C
 * since C99, and the function returns nothing.
 */
void open_softirq(int nr, void (*action)(struct softirq_action *));

再看内核在启动时为每个CPU创建的线程操作：

/*
 * Notifier block whose callback is cpu_callback (defined outside this
 * excerpt). It is registered with register_cpu_notifier() in
 * spawn_ksoftirqd().
 */
static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

/*
 * Descriptor of the per-CPU softirq thread, handed to
 * smpboot_register_percpu_thread():
 *  - store:             per-CPU slot that receives the thread's task_struct
 *  - thread_should_run: predicate polled by the generic thread loop
 *  - thread_fn:         work function called when the predicate is true
 *  - thread_comm:       thread name format ("%u" presumably expands to the
 *                       CPU number - confirm in kthread_create_on_cpu)
 */
static struct smp_hotplug_thread softirq_threads = {
    .store            = &ksoftirqd,
    .thread_should_run    = ksoftirqd_should_run,
    .thread_fn        = run_ksoftirqd,
    .thread_comm        = "ksoftirqd/%u",
};

/*
 * Early initcall: registers the CPU notifier and then the per-CPU softirq
 * threads described by softirq_threads. A failure to register the threads
 * triggers BUG_ON(). Always returns 0.
 */
static __init int spawn_ksoftirqd(void)
{
    register_cpu_notifier(&cpu_nfb);

    BUG_ON(smpboot_register_percpu_thread(&softirq_threads));

    return 0;
}
early_initcall(spawn_ksoftirqd);

重点是这个接口函数 smpboot_register_percpu_thread如下：

/**
 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
 * @plug_thread:    Hotplug thread descriptor
 *
 * Creates and starts the threads on all online cpus.
 *
 * Returns 0 on success, or the error from __smpboot_create_thread(); in
 * the error case the threads of @plug_thread are destroyed again and the
 * descriptor is not added to the hotplug_threads list.
 */
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
    unsigned int cpu;
    int ret = 0;

    /* Paired with put_online_cpus() below; brackets the walk over online CPUs. */
    get_online_cpus();
    mutex_lock(&smpboot_threads_lock);
    for_each_online_cpu(cpu) {
        ret = __smpboot_create_thread(plug_thread, cpu);
        if (ret) {
            /* Undo whatever was created so far and bail out. */
            smpboot_destroy_threads(plug_thread);
            goto out;
        }
        /* The thread was created for this CPU; let it run. */
        smpboot_unpark_thread(plug_thread, cpu);
    }
    /* Only a fully successful registration is put on the list. */
    list_add(&plug_thread->list, &hotplug_threads);
out:
    mutex_unlock(&smpboot_threads_lock);
    put_online_cpus();
    return ret;
}

传进来的参数是 softirq_threads。函数先持有在线（即已激活）CPU的引用，然后遍历每个在线CPU调用__smpboot_create_thread，参数同样是前面定义的softirq_threads。继续向下看：

/*
 * Create the per-CPU thread described by @ht for @cpu, unless one is
 * already stored in ht->store for that CPU.
 *
 * Returns 0 on success (or if the thread already exists), -ENOMEM if the
 * bookkeeping structure cannot be allocated, or the error encoded in the
 * pointer returned by kthread_create_on_cpu().
 */
static int
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
    struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
    struct smpboot_thread_data *td;

    /* Thread already exists for this CPU: nothing to do. */
    if (tsk)
        return 0;

    /* Per-thread data passed to smpboot_thread_fn; freed there on stop. */
    td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
    if (!td)
        return -ENOMEM;
    td->cpu = cpu;
    td->ht = ht;

    tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
                    ht->thread_comm);
    if (IS_ERR(tsk)) {
        kfree(td);
        return PTR_ERR(tsk);
    }
    /* Take a reference before publishing the task in the per-CPU slot. */
    get_task_struct(tsk);
    *per_cpu_ptr(ht->store, cpu) = tsk;
    if (ht->create) {
        /*
         * Make sure that the task has actually scheduled out
         * into park position, before calling the create
         * callback. At least the migration thread callback
         * requires that the task is off the runqueue.
         */
        if (!wait_task_inactive(tsk, TASK_PARKED))
            WARN_ON(1);
        else
            ht->create(cpu);
    }
    return 0;
}

可以看到这里通过kthread_create_on_cpu(smpboot_thread_fn, td, cpu, ht->thread_comm)接口在特定CPU上创建了一个内核线程，这里不再往深入看，只需要知道创建了一个绑定CPU的线程。线程函数是smpboot_thread_fn，这个比较重要，需要详细看一下。传入的data是一个struct smpboot_thread_data类型的数据，其中的ht成员保存了softirq_threads。线程开始运行时先关闭抢占，检查是否需要停止当前线程，如果需要则立刻停止；这里一般不需要停止，除非是关机（我的理解）。然后检查是否要暂停（park）：因为用户的软中断接口可能调用阻塞接口而阻塞当前内核线程，所以需要暂停当前线程，最后的恢复也是由用户软中断服务函数完成（我的理解）。（注：从下面的代码看，park/unpark 是由 kthread_should_park() 和 td->status 驱动的，而不是由软中断服务函数触发；smpboot_register_percpu_thread 的注释也说明这类线程与CPU热插拔相关。）最后部分源码注释如下：

/*
 * Generic main loop of a per-CPU hotplug thread.
 * @data: the struct smpboot_thread_data allocated in
 *        __smpboot_create_thread(); it carries the CPU number and the
 *        thread descriptor (for softirqs: softirq_threads).
 *
 * Each iteration handles, in order: a stop request, a park request, a
 * pending status transition (setup / unpark), and finally the actual work
 * via ht->thread_should_run() / ht->thread_fn(). Returns 0 only when the
 * thread is asked to stop.
 */
static int smpboot_thread_fn(void *data)
{
    struct smpboot_thread_data *td = data;
    struct smp_hotplug_thread *ht = td->ht;

    while (1) {
        set_current_state(TASK_INTERRUPTIBLE);
        /* Disable kernel preemption while the checks below are made. */
        preempt_disable();
        /*
         * Stop request: run the optional cleanup callback, free the
         * per-thread data and exit the thread.
         * NOTE(review): the article author guesses this only happens
         * at shutdown - not established by this code.
         */
        if (kthread_should_stop()) {
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            if (ht->cleanup)
                ht->cleanup(td->cpu, cpu_online(td->cpu));
            kfree(td);
            return 0;
        }

        /* Park request: run the park callback once, then park. */
        if (kthread_should_park()) {
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            if (ht->park && td->status == HP_THREAD_ACTIVE) {
                BUG_ON(td->cpu != smp_processor_id());
                ht->park(td->cpu);
                td->status = HP_THREAD_PARKED;
            }
            kthread_parkme();
            /* We might have been woken for stop */
            continue;
        }

        /* From here on the thread must be running on its own CPU. */
        BUG_ON(td->cpu != smp_processor_id());

        /* Check for state change setup */
        switch (td->status) {
        case HP_THREAD_NONE:
            /* First run: call the optional setup callback. */
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            if (ht->setup)
                ht->setup(td->cpu);
            td->status = HP_THREAD_ACTIVE;
            continue;

        case HP_THREAD_PARKED:
            /* Coming back from park: call the optional unpark callback. */
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            if (ht->unpark)
                ht->unpark(td->cpu);
            td->status = HP_THREAD_ACTIVE;
            continue;
        }
        /*
         * Ask the descriptor whether there is work. For the softirq
         * thread this callback is ksoftirqd_should_run (set in
         * softirq_threads); presumably it reports whether softirqs are
         * pending on this CPU.
         */
        if (!ht->thread_should_run(td->cpu)) {
            /*
             * Nothing to do: give up the CPU. The task state was set
             * to TASK_INTERRUPTIBLE at the top of the loop.
             */
            preempt_enable_no_resched();
            schedule();
        } else {
            /*
             * Work is available: call the thread function. For the
             * softirq thread this is run_ksoftirqd.
             */
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            /* Bound to run_ksoftirqd in the softirq_threads initializer. */
            ht->thread_fn(td->cpu);
        }
    }
}

可以看到run_ksoftirqd如下：

/*
 * Work function of the per-CPU softirq thread.
 * @cpu: CPU the thread is bound to (unused here).
 *
 * With local interrupts disabled, checks for pending softirqs. If there
 * are none, interrupts are simply re-enabled. Otherwise the softirqs are
 * processed, interrupts are re-enabled and cond_resched_rcu_qs() is called.
 */
static void run_ksoftirqd(unsigned int cpu)
{
    local_irq_disable();

    /* Nothing pending: just undo the interrupt disable. */
    if (!local_softirq_pending()) {
        local_irq_enable();
        return;
    }

    /*
     * Running on the current (inline) stack is fine here because the
     * thread's task stack is not deep at this point.
     */
    __do_softirq();
    local_irq_enable();
    cond_resched_rcu_qs();
}

关闭本CPU上的硬中断然后执行__do_softirq();这个是软件中断的重点接口如下，注释了一部分：

/*
 * Core softirq dispatcher: runs the handlers of all softirqs pending on
 * the local CPU, lowest bit number first.
 *
 * Callers visible in this file (run_ksoftirqd) disable local interrupts
 * before calling; the function re-enables them while handlers run and
 * disables them again before re-checking the pending mask. If new softirqs
 * were raised meanwhile, it restarts the scan only while all of the
 * following hold, otherwise it wakes the softirq thread instead:
 *   1. jiffies is still before entry time + MAX_SOFTIRQ_TIME;
 *   2. no reschedule is needed (!need_resched());
 *   3. the restart budget MAX_SOFTIRQ_RESTART is not exhausted.
 */
asmlinkage __visible void __do_softirq(void)
{
    unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
    unsigned long old_flags = current->flags;
    int max_restart = MAX_SOFTIRQ_RESTART;
    struct softirq_action *h;
    bool in_hardirq;
    __u32 pending;
    int softirq_bit;

    /*
     * Mask out PF_MEMALLOC as current task context is borrowed for the
     * softirq. A softirq handled such as network RX might set PF_MEMALLOC
     * again if the socket is related to swap
     */
    current->flags &= ~PF_MEMALLOC;
    /* Snapshot the bitmask of pending softirqs. */
    pending = local_softirq_pending();
    account_irq_enter_time(current);
    /* Add SOFTIRQ_OFFSET to the preempt count: softirq processing starts. */
    __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
    in_hardirq = lockdep_softirq_start();

restart:
    /* Reset the pending bitmask before enabling irqs */
    set_softirq_pending(0);
    /* Handlers run with local interrupts enabled. */
    local_irq_enable();
    /* Start at the first slot of the handler array. */
    h = softirq_vec;
    /*
     * ffs() returns the 1-based position of the lowest set bit, or 0 if
     * no bit is set, so lower-numbered softirqs are served first.
     */
    while ((softirq_bit = ffs(pending))) {
        unsigned int vec_nr;
        int prev_count;
        /* Advance to the slot of the lowest pending softirq. */
        h += softirq_bit - 1;
        /* Recover its softirq number from the slot address. */
        vec_nr = h - softirq_vec;
        prev_count = preempt_count();

        kstat_incr_softirqs_this_cpu(vec_nr);

        trace_softirq_entry(vec_nr);
        /* Call the registered handler. */
        h->action(h);
        trace_softirq_exit(vec_nr);
        /* A handler must leave the preempt count as it found it. */
        if (unlikely(prev_count != preempt_count())) {
            pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
                   vec_nr, softirq_to_name[vec_nr], h->action,
                   prev_count, preempt_count());
            preempt_count_set(prev_count);
        }
        /*
         * Step past the slot just handled and shift the handled bit
         * (and everything below it) out of the mask, so the next ffs()
         * is relative to the new position of h.
         */
        h++;
        pending >>= softirq_bit;
    }
    /* Every softirq from the snapshot has now been handled. */
    rcu_bh_qs();
    local_irq_disable();
    /* Check whether new softirqs were raised while handlers were running. */
    pending = local_softirq_pending();
    if (pending) {
        /*
         * Softirqs could otherwise starve ordinary tasks if they keep
         * being raised, so the restart is bounded by time, by
         * need_resched() and by a restart budget (MAX_SOFTIRQ_RESTART
         * is defined outside this excerpt; the article author states
         * it is typically 10).
         */
        if (time_before(jiffies, end) && !need_resched() &&
            --max_restart)
            goto restart;
        /*
         * One of the limits was hit: leave the remaining work to the
         * softirq thread. This matters because this function may be
         * entered from the interrupt exit path (see invoke_softirq()).
         */
        wakeup_softirqd();
    }

    lockdep_softirq_end(in_hardirq);
    account_irq_exit_time(current);
    /* Undo the SOFTIRQ_OFFSET added at entry. */
    __local_bh_enable(SOFTIRQ_OFFSET);
    WARN_ON_ONCE(in_interrupt());
    tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}

注意软中断的处理过程对连续执行的时间进行了限制，这是有原因的：上述软中断处理代码有可能在中断退出路径irq_exit()中被执行，具体的调用链是irq_exit()->invoke_softirq()->__do_softirq()（强制中断线程化时invoke_softirq()改为调用wakeup_softirqd()唤醒ksoftirqd线程），如下（可参考硬中断的分析过程）：

/*
 * Hard interrupt exit path. After dropping HARDIRQ_OFFSET from the
 * preempt count, pending softirqs are processed via invoke_softirq(), but
 * only if the CPU is no longer in any interrupt context (!in_interrupt()).
 */
void irq_exit(void)
{
    /*
     * Unless the architecture guarantees that interrupts are already
     * disabled here, disable them; otherwise just verify that they are.
     */
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
    local_irq_disable();
#else
    WARN_ON_ONCE(!irqs_disabled());
#endif

    account_irq_exit_time(current);
    /* Leave hard interrupt context as far as the preempt count goes. */
    preempt_count_sub(HARDIRQ_OFFSET);
    if (!in_interrupt() && local_softirq_pending())
        invoke_softirq();

    tick_irq_exit();
    rcu_irq_exit();
    trace_hardirq_exit(); /* must be last! */
}

/*
 * Decide how pending softirqs get processed on interrupt exit: when
 * interrupt threading is forced, defer to the softirq thread; otherwise
 * process them right away, on a stack chosen at build time.
 */
static inline void invoke_softirq(void)
{
    /* Forced threading: do not process inline, wake the thread instead. */
    if (force_irqthreads) {
        wakeup_softirqd();
        return;
    }

#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
    /*
     * irq_exit() runs on the irq stack in this configuration; that
     * stack should be nearly empty by now, so it is safe to process
     * softirqs directly on it.
     */
    __do_softirq();
#else
    /*
     * Here irq_exit() runs on the task stack, which may already be
     * deep. Process softirqs on their own stack to avoid an overrun.
     */
    do_softirq_own_stack();
#endif
}

invoke_softirq 的执行过程就是判断一下是否强制中断线程化了（由CONFIG_IRQ_FORCED_THREADING宏进行配置），如果线程化了则直接唤醒ksoftirqd线程。结合前面分析硬中断的响应过程就可以明白，因为此时还处在中断退出路径上，所以才有上面分析的__do_softirq的三个条件的处理机制，就是不希望软中断过分长时间地在中断上下文执行。在中断服务函数被强制线程化的情况下，中断线程本身会先于软中断运行，而软中断是在ksoftirqd线程被调度运行时得到执行的。到此软中断的简单执行过程分析就算完了，至于软中断的管理接口，另一篇博客会来学习。

tasklet

tasklet是基于软中断实现的：软中断维护了一个软中断类型表，其中有两个类型（HI_SOFTIRQ和TASKLET_SOFTIRQ）就是专门留给tasklet的高优先级和普通优先级任务的，如下：

/*
 * Softirq numbers, annotated. Two of them serve tasklets: HI_SOFTIRQ for
 * high-priority tasklets and TASKLET_SOFTIRQ for normal ones (see
 * softirq_init()). NR_SOFTIRQS is the number of softirq types.
 */
enum
{
    HI_SOFTIRQ=0,           /* highest-priority softirq; used for high-priority tasklets */
    TIMER_SOFTIRQ,          /* timer softirq */
    NET_TX_SOFTIRQ,         /* network packet transmit softirq */
    NET_RX_SOFTIRQ,         /* network packet receive softirq */
    BLOCK_SOFTIRQ,          /* block device softirq */
    BLOCK_IOPOLL_SOFTIRQ,   /* block device I/O polling softirq */
    TASKLET_SOFTIRQ,        /* softirq reserved for the tasklet mechanism */
    SCHED_SOFTIRQ,          /* scheduler / load balancing softirq */
    HRTIMER_SOFTIRQ,        /* high-resolution timer softirq */
    RCU_SOFTIRQ,    /* RCU softirq. Preferable RCU should always be the last softirq */

    NR_SOFTIRQS
};

tasklet也是内核实现了一种软件管理机制，所以来看其数据结构。

tasklet描述

/* Descriptor of one tasklet. */
struct tasklet_struct
{
    /* Link to the next tasklet; tasklets are chained into a singly linked list. */
    struct tasklet_struct *next;
    /*
     * State bits:
     * TASKLET_STATE_SCHED - the tasklet has been scheduled and is waiting to run;
     * TASKLET_STATE_RUN   - the tasklet is currently running.
     */
    unsigned long state;
    /* 0: tasklet is enabled; non-zero: tasklet is disabled and must not run. */
    atomic_t count;
    /* Handler function of this tasklet. */
    void (*func)(unsigned long);
    /* Argument passed to the handler function. */
    unsigned long data;
};

除此之外内核还为每个CPU维护了两个tasklet 链表如下，一个是高优先级的tasklet另一个是低优先级的。数据结构和初始化过程如下：

/*
 * Head of a singly linked tasklet list.
 * head: first tasklet, or NULL when the list is empty.
 * tail: address of the pointer that must be written to append a tasklet -
 *       &head for an empty list, otherwise &last->next.
 */
struct tasklet_head {
    struct tasklet_struct *head;
    struct tasklet_struct **tail;
};

/*
 * Per-CPU tasklet lists: tasklet_vec is served by TASKLET_SOFTIRQ and
 * tasklet_hi_vec by HI_SOFTIRQ (both wired up in softirq_init()).
 */
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);

并初始化各自的软中断服务函数并和CPU进行绑定。

/*
 * Boot-time softirq setup: empties both tasklet lists of every possible
 * CPU (tail pointing back at head) and registers the two tasklet softirq
 * handlers.
 */
void __init softirq_init(void)
{
    int cpu;

    for_each_possible_cpu(cpu) {
        struct tasklet_head *normal = &per_cpu(tasklet_vec, cpu);
        struct tasklet_head *high = &per_cpu(tasklet_hi_vec, cpu);

        /* An empty list has its tail pointing at its own head pointer. */
        normal->tail = &normal->head;
        high->tail = &high->head;
    }

    /* Normal-priority tasklets first, then high-priority ones. */
    open_softirq(TASKLET_SOFTIRQ, tasklet_action);
    open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}

注意其中的open_softirq注册了两个软中断，这就是tasklet的主要处理接口。高优先级和普通优先级的操作很相似，所以这里只分析普通优先级的tasklet。

/*
 * Handler of TASKLET_SOFTIRQ: runs the normal-priority tasklets queued on
 * the local CPU.
 * @a: the softirq slot this handler is registered in (unused here).
 *
 * The whole list is detached first, then walked with interrupts enabled.
 * A tasklet that cannot run right now (trylock failed, or it is disabled)
 * is put back on the list and the softirq is raised again.
 */
static void tasklet_action(struct softirq_action *a)
{
    struct tasklet_struct *list;
    /* Local interrupts are off while the per-CPU list is manipulated. */
    local_irq_disable();
    /* Take the list maintained for this CPU... */
    list = __this_cpu_read(tasklet_vec.head);
    /*
     * ...and reset tasklet_vec to the empty state. Since the list is
     * detached here, a tasklet runs once per scheduling unless it is
     * re-queued below.
     */
    __this_cpu_write(tasklet_vec.head, NULL);
    __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
    local_irq_enable();
    /* Process the detached list with interrupts enabled. */
    while (list) {
        struct tasklet_struct *t = list;

        list = list->next;
        /*
         * If the trylock fails, this tasklet is skipped for this round.
         * Presumably that means it is already running on another CPU,
         * which is what keeps a tasklet from running on two CPUs at once
         * (tasklet_trylock is defined outside this excerpt).
         */
        if (tasklet_trylock(t)) {
            /* count == 0 means the tasklet is enabled. */
            if (!atomic_read(&t->count)) {
                /*
                 * Clear TASKLET_STATE_SCHED; if it was not set,
                 * something is badly wrong: BUG().
                 */
                if (!test_and_clear_bit(TASKLET_STATE_SCHED,
                            &t->state))
                    BUG();
                /* Call the tasklet's handler with its argument. */
                t->func(t->data);
                tasklet_unlock(t);
                continue;
            }
            /* Tasklet is disabled (count != 0): unlock without running it. */
            tasklet_unlock(t);
        }

        /*
         * Reached when the tasklet was not run (trylock failed or the
         * tasklet is disabled): append it to the tail of this CPU's
         * list again and re-raise TASKLET_SOFTIRQ so that it is retried
         * in a later pass.
         */
        local_irq_disable();
        t->next = NULL;
        *__this_cpu_read(tasklet_vec.tail) = t;
        __this_cpu_write(tasklet_vec.tail, &(t->next));
        __raise_softirq_irqoff(TASKLET_SOFTIRQ);
        local_irq_enable();
    }
}

通过代码可以清晰地明白，Tasklet的执行过程就是遍历本CPU维护的tasklet单链表，依次调用每个tasklet的func接口。而把tasklet加入到这个链表中，实际上是通过接口tasklet_schedule()完成的：

/*
 * Schedule tasklet @t for execution.
 *
 * Atomically sets TASKLET_STATE_SCHED. If the bit was already set, the
 * tasklet is already queued and nothing more is done; otherwise it is
 * queued via __tasklet_schedule(). This mirrors the test-and-clear of the
 * same bit in tasklet_action().
 */
static inline void tasklet_schedule(struct tasklet_struct *t)
{
    if (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
        return;

    __tasklet_schedule(t);
}
/*
 * Queue tasklet @t on the local CPU's normal-priority list and raise
 * TASKLET_SOFTIRQ. Essentially a list append done with local interrupts
 * disabled; the previous interrupt state is restored afterwards.
 */
void __tasklet_schedule(struct tasklet_struct *t)
{
    unsigned long flags;

    local_irq_save(flags);
    t->next = NULL;
    /* Append t at the tail of this CPU's tasklet_vec list. */
    *__this_cpu_read(tasklet_vec.tail) = t;
    __this_cpu_write(tasklet_vec.tail, &(t->next));
    raise_softirq_irqoff(TASKLET_SOFTIRQ);
    local_irq_restore(flags);
}

到这里软中断和基于软中断的tasklet的工作方式都基本清楚了，最后来看两个接口函数local_bh_disable()/local_bh_enable()。

local_bh_disable()

通过其调用过程就知道它实际上就是操作了内核的数据标记。

/*
 * Disable bottom halves on the local CPU by adding
 * SOFTIRQ_DISABLE_OFFSET to the preempt count (see
 * __local_bh_disable_ip()).
 */
static inline void local_bh_disable(void)
{
    /* Increase the softirq part of the preempt count. */
    __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}
/*
 * Called by local_bh_disable(): adds @cnt to the preempt count.
 * @ip is not used in this variant.
 */
 void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
    preempt_count_add(cnt);
    /* Compiler barrier after the count update. */
    barrier();
}
/*
 * Next step in the call chain (presumably reached from
 * preempt_count_add()): adds @val to the current preempt count.
 */
void __preempt_count_add(int val)
{
    *preempt_count_ptr() += val;
}
/*
 * Returns a pointer to the preempt count, which is stored in the current
 * thread's thread_info.
 */
volatile int *preempt_count_ptr(void)
{
    return &current_thread_info()->preempt_count;
}

local_bh_enable()

同理（注意：下面贴出的仍然是 local_bh_disable() 的调用路径；local_bh_enable() 与之对称，做的是减少 preempt_count 中对应的计数）。

/*
 * NOTE(review): this snippet is local_bh_disable() again, although it
 * appears under the local_bh_enable() heading. It adds
 * SOFTIRQ_DISABLE_OFFSET to the preempt count.
 */
static inline void local_bh_disable(void)
{
    __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * Inline variant of the helper behind local_bh_disable(): adds @cnt to
 * the preempt count. @ip is not used in this variant.
 */
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
    /* Increase the softirq part of the preempt count. */
    preempt_count_add(cnt);
    /* Compiler barrier: keeps the compiler from reordering across this point. */
    barrier();
}

/* Adds @val to the current preempt count. */
static __always_inline void __preempt_count_add(int val)
{
    *preempt_count_ptr() += val;
}

总的来说，软中断的设计是为了解决硬中断在执行期间会关闭本CPU上其他中断的响应、从而降低系统实时性的问题。除此之外，结合软中断的实现细节可以明确：同一个软中断程序可以在不同的CPU上并发执行，而同一个CPU上不会发生软中断之间相互抢占。其次，软中断可能运行在中断上下文，所以软中断中是不能执行阻塞操作的。基于软中断的Tasklet也运行于软中断上下文，除此之外tasklet是不可重入的，这是由tasklet本身的实现决定的。

参考博客：

https://www.cnblogs.com/arnoldlu/p/8659986.html

https://blog.csdn.net/zhangskd/article/details/21992933