理解Linux内核之中断控制

乍一看下边的Linux内核代码，貌似L3389有bug，于是我就饶有兴趣地阅读了一下local_irq_save/local_irq_restore的源代码。

/* linux-4.14.12/mm/slab.c#3389 */

3377  static __always_inline void *
3378  slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3379  {
3380    unsigned long save_flags;
3381    void *objp;
....
3389    local_irq_save(save_flags);
3390    objp = __do_cache_alloc(cachep, flags);
3391    local_irq_restore(save_flags);
....
3399    return objp;
3400  }

在L3380和L3389中，如果local_irq_save()是一个函数，必然存在着bug, 因为需要把save_flags的变量地址传给local_irq_save()才对。

3380      unsigned long save_flags;
....
3389      local_irq_save(save_flags);

L3389是不是该是这样才对啊？

3389      local_irq_save(&save_flags);

但是，local_irq_save()和local_irq_restore()不是函数，而是宏。宏在预处理阶段被原地展开，展开后的代码可以直接对变量save_flags赋值，无需传递其地址，这样就没有bug了。

1. local_irq_save()和local_irq_restore()的实现

/* linux-4.14.12/include/linux/irqflags.h#139 */

105  #ifdef CONFIG_TRACE_IRQFLAGS
...
110  #define local_irq_save(flags)                      \
111     do {                                            \
112             raw_local_irq_save(flags);              \
113             trace_hardirqs_off();                   \
114     } while (0)
115
116
117  #define local_irq_restore(flags)                   \
118     do {                                            \
119             if (raw_irqs_disabled_flags(flags)) {   \
120                     raw_local_irq_restore(flags);   \
121                     trace_hardirqs_off();           \
122             } else {                                \
123                     trace_hardirqs_on();            \
124                     raw_local_irq_restore(flags);   \
125             }                                       \
126     } while (0)
...
135  #else /* !CONFIG_TRACE_IRQFLAGS */
...
139  #define local_irq_save(flags)                              \
140     do {                                                    \
141             raw_local_irq_save(flags);                      \
142     } while (0)
143  #define local_irq_restore(flags) do { raw_local_irq_restore(flags); } while (0)
...
146  #endif /* CONFIG_TRACE_IRQFLAGS */

为简单起见，我们只关注!CONFIG_TRACE_IRQFLAGS分支就好了，

139  #define local_irq_save(flags)                              \
140     do {                                                    \
141             raw_local_irq_save(flags);                      \
142     } while (0)
143  #define local_irq_restore(flags) do { raw_local_irq_restore(flags); } while (0)

于是，我们可以认为，local_irq_save()/local_irq_restore()等同于：

#define local_irq_save(flags)    raw_local_irq_save(flags)
#define local_irq_restore(flags) raw_local_irq_restore(flags)

2. raw_local_irq_save()和raw_local_irq_restore()的实现

/* linux-4.14.12/include/linux/irqflags.h#78 */

78  #define raw_local_irq_save(flags)                   \
79      do {                                            \
80              typecheck(unsigned long, flags);        \
81              flags = arch_local_irq_save();          \
82      } while (0)
83  #define raw_local_irq_restore(flags)                \
84      do {                                            \
85              typecheck(unsigned long, flags);        \
86              arch_local_irq_restore(flags);          \
87      } while (0)

关于宏typecheck()不做解释，因为很直观，就是保证flags的类型必须是unsigned long。于是，raw_local_irq_save()和raw_local_irq_restore()等同于：

#define raw_local_irq_save(flags)       flags = arch_local_irq_save()
#define raw_local_irq_restore(flags)    arch_local_irq_restore(flags)

下面以x86为例说明arch_local_irq_save()和arch_local_irq_restore()这两个函数的实现。

3. arch_local_irq_save()和arch_local_irq_restore()这两个函数在x86上的实现

/* linux-4.14.12/arch/x86/include/asm/irqflags.h#70 */

70  static inline notrace unsigned long arch_local_save_flags(void)
71  {
72      return native_save_fl();
73  }
74
75  static inline notrace void arch_local_irq_restore(unsigned long flags)
76  {
77      native_restore_fl(flags);
78  }
...
111  static inline notrace unsigned long arch_local_irq_save(void)
112  {
113     unsigned long flags = arch_local_save_flags();
114     arch_local_irq_disable();
115     return flags;
116  }

函数arch_local_irq_save()在调用arch_local_save_flags()保存标志寄存器之后还做了一件事，那就是调用arch_local_irq_disable()把中断禁止掉。接下来，我们首先看看native_save_fl()和native_restore_fl()的具体实现。

3.1 native_save_fl()的实现

/* linux-4.14.12/arch/x86/include/asm/irqflags.h#16 */

16  static inline unsigned long native_save_fl(void)
17  {
18      unsigned long flags;
19
20      /*
21       * "=rm" is safe here, because "pop" adjusts the stack before
22       * it evaluates its effective address -- this is part of the
23       * documented behavior of the "pop" instruction.
24       */
25      asm volatile("# __raw_save_flags\n\t"
26                   "pushf ; pop %0"
27                   : "=rm" (flags)
28                   : /* no input */
29                   : "memory");
30
31      return flags;
32  }

这是一段内嵌的汇编代码，后面写一个简单的demo再解释。

3.2 native_restore_fl()的实现

/* linux-4.14.12/arch/x86/include/asm/irqflags.h#34 */

34  static inline void native_restore_fl(unsigned long flags)
35  {
36      asm volatile("push %0 ; popf"
37                   : /* no output */
38                   :"g" (flags)
39                   :"memory", "cc");
40  }

同样，这也是内嵌的汇编代码，后面写一个简单的demo再解释。

3.3 反汇编理解native_save_fl()和native_restore_fl()

foo.c

 1 static inline unsigned long native_save_fl(void)
 2 {
 3         unsigned long flags;
 4 
 5         /*
 6          * "=rm" is safe here, because "pop" adjusts the stack before
 7          * it evaluates its effective address -- this is part of the
 8          * documented behavior of the "pop" instruction.
 9          */
10         asm volatile("# __raw_save_flags\n\t"
11                      "pushf ; pop %0"
12                      : "=rm" (flags)
13                      : /* no input */
14                      : "memory");
15 
16         return flags;
17 }
18 
19 static inline void native_restore_fl(unsigned long flags)
20 {
21         asm volatile("push %0 ; popf"
22                      : /* no output */
23                      :"g" (flags)
24                      :"memory", "cc");
25 }
26 
27 int main(int argc, char *argv[])
28 {
29         unsigned long flags = native_save_fl();
30         native_restore_fl(flags);
31         return 0;
32 }

用gcc编译并反汇编

veli@idorax:/tmp$ gcc -g -Wall -o foo foo.c
veli@idorax:/tmp$
veli@idorax:/tmp$ gdb foo
GNU gdb (Ubuntu 7.11.1-0ubuntu1~16.5) 7.11.1
...<snip>...................................
(gdb) set disassembly-flavor intel
(gdb)
(gdb) disas /m main
Dump of assembler code for function main:
28      {
   0x00000000004004f5 <+0>:     push   rbp
   0x00000000004004f6 <+1>:     mov    rbp,rsp
   0x00000000004004f9 <+4>:     sub    rsp,0x20
   0x00000000004004fd <+8>:     mov    DWORD PTR [rbp-0x14],edi
   0x0000000000400500 <+11>:    mov    QWORD PTR [rbp-0x20],rsi

29              unsigned long flags = native_save_fl();
   0x0000000000400504 <+15>:    call   0x4004d6 <native_save_fl>
   0x0000000000400509 <+20>:    mov    QWORD PTR [rbp-0x8],rax

30              native_restore_fl(flags);
   0x000000000040050d <+24>:    mov    rax,QWORD PTR [rbp-0x8]
   0x0000000000400511 <+28>:    mov    rdi,rax
   0x0000000000400514 <+31>:    call   0x4004e6 <native_restore_fl>

31              return 0;
   0x0000000000400519 <+36>:    mov    eax,0x0

32      }
   0x000000000040051e <+41>:    leave
   0x000000000040051f <+42>:    ret

End of assembler dump.
(gdb) #
(gdb) disas /m native_save_fl
Dump of assembler code for function native_save_fl:
2       {
   0x00000000004004d6 <+0>:     push   rbp
   0x00000000004004d7 <+1>:     mov    rbp,rsp

3               unsigned long flags;
4
5               /*
6                * "=rm" is safe here, because "pop" adjusts the stack before
7                * it evaluates its effective address -- this is part of the
8                * documented behavior of the "pop" instruction.
9                */
10              asm volatile("# __raw_save_flags\n\t"
   0x00000000004004da <+4>:     pushf
   0x00000000004004db <+5>:     pop    rax
   0x00000000004004dc <+6>:     mov    QWORD PTR [rbp-0x8],rax

11                           "pushf ; pop %0"
12                           : "=rm" (flags)
13                           : /* no input */
14                           : "memory");
15
16              return flags;
   0x00000000004004e0 <+10>:    mov    rax,QWORD PTR [rbp-0x8]

17      }
   0x00000000004004e4 <+14>:    pop    rbp
   0x00000000004004e5 <+15>:    ret

End of assembler dump.
(gdb) #
(gdb) disas /m native_restore_fl
Dump of assembler code for function native_restore_fl:
20      {
   0x00000000004004e6 <+0>:     push   rbp
   0x00000000004004e7 <+1>:     mov    rbp,rsp
   0x00000000004004ea <+4>:     mov    QWORD PTR [rbp-0x8],rdi

21              asm volatile("push %0 ; popf"
   0x00000000004004ee <+8>:     push   QWORD PTR [rbp-0x8]
   0x00000000004004f1 <+11>:    popf

22                           : /* no output */
23                           :"g" (flags)
24                           :"memory", "cc");
25      }
   0x00000000004004f2 <+12>:    nop
   0x00000000004004f3 <+13>:    pop    rbp
   0x00000000004004f4 <+14>:    ret

End of assembler dump.
(gdb) q
veli@idorax:/tmp$

根据上面的反汇编代码不难看出，native_save_fl()和native_restore_fl()的实现异常简单。

native_save_fl()

; static inline unsigned long native_save_fl(void)

0x00000000004004da <+4>:        pushf                          ; 把标志寄存器(FLAGS)压入栈(Stack)中
0x00000000004004db <+5>:        pop    rax                     ; 通过出栈操作把标志寄存器的值存入rax中
0x00000000004004dc <+6>:        mov    QWORD PTR [rbp-0x8],rax ; 把rax存入局部变量flags中
0x00000000004004e0 <+10>:       mov    rax,QWORD PTR [rbp-0x8] ; 根据ABI, 返回值总是存于rax中，这里等同于return flags

native_restore_fl()

; static inline void native_restore_fl(unsigned long flags)

0x00000000004004ea <+4>:        mov    QWORD PTR [rbp-0x8],rdi ; 根据ABI, 函数的第一个参数通过寄存器rdi传递
                                                               ; 于是，等同于将第一个参数flags存入一个局部变量中
0x00000000004004ee <+8>:        push   QWORD PTR [rbp-0x8]     ; 等同于将第一个参数flags压入栈中
0x00000000004004f1 <+11>:       popf                           ; 通过出栈操作把flags的值恢复到标志寄存器FLAGS中

注意：标志寄存器FLAGS(16位： flags, 32位： eflags, 64位: rflags)不能直接作为push和pop指令的操作数，要把它压栈或从栈中恢复，必须使用pushf和popf这两个专用指令。

因此，我们可以得出如下结论，(在x86平台上)

local_irq_save()就是把标志寄存器保存到一个局部变量flags中，然后禁止中断;
local_irq_restore()则是通过局部变量flags的值恢复标志寄存器，中断状态随之恢复到保存时的状态（若保存时中断是打开的，则中断重新打开）。

4. arch_local_irq_disable()和arch_local_irq_enable()在x86上的实现

/* linux-4.14.12/arch/x86/include/asm/irqflags.h#80 */

80  static inline notrace void arch_local_irq_disable(void)
81  {
82      native_irq_disable();
83  }
84
85  static inline notrace void arch_local_irq_enable(void)
86  {
87      native_irq_enable();
88  }

/* linux-4.14.12/arch/x86/include/asm/irqflags.h#42 */
42  static inline void native_irq_disable(void)
43  {
44      asm volatile("cli": : :"memory");
45  }
46
47  static inline void native_irq_enable(void)
48  {
49      asm volatile("sti": : :"memory");
50  }

从上面的代码可以看出，在x86中，

arch_local_irq_disable()的实质是执行汇编指令cli
arch_local_irq_enable()的实质则是执行汇编指令sti。

到此为止，我们已经搞清楚了如下4个宏的作用。

local_irq_disable() : 禁止本地中断传递。在x86上，本质上是调用汇编指令cli;
local_irq_enable() : 激活本地中断传递。在x86上，本质上是调用汇编指令sti;
local_irq_save() : 保存本地中断传递的当前状态，然后禁止本地中断传递。在x86上，本质上是调用pushf+pop先保存标志寄存器到一个变量flags中，然后调用汇编指令cli;
local_irq_restore() : 恢复本地中断传递到给定的状态。在x86上，本质上是调用push+popf重置标志寄存器。

更多有关中断控制的细节，请阅读源代码和《Linux Kernel Development》一书的第7章:中断和中断处理。常用的中断控制方法，如下图所示。

If all you have is a hammer, everything looks like a nail. | 如果你拥有的东西就只有一把锤子，那么一切事物在你眼里都看起来是钉子。 （P.S. 保持Open的心态很重要啊）