PROC文件系统

1. seq_file

参考： http://blog.chinaunix.net/uid-26084833-id-1754437.html

seq_file的结构体定义：

   1: struct seq_file {   2:     char *buf;   3:     size_t size;   4:     size_t from;   5:     size_t count;   6:     loff_t index;   7:     loff_t read_pos;   8:     u64 version;   9:     struct mutex lock;  10:     const struct seq_operations *op;  11:     void *private;  12: };

seq_operations的定义：

   1: struct seq_operations {   2:     void * (*start) (struct seq_file *m, loff_t *pos);   3:     void (*stop) (struct seq_file *m, void *v);   4:     void * (*next) (struct seq_file *m, void *v, loff_t *pos);   5:     int (*show) (struct seq_file *m, void *v);   6: };

start函数

用于指定seq_file文件的读开始位置，返回实际读开始位置，如果指定的位置超过文件末尾，应当返回NULL，start函数可以有一个特殊的返回SEQ_START_TOKEN，它用于让show函数输出文件头，但这只能在pos为0时使用；

next函数

用于把seq_file 文件的当前读位置移动到下一个读位置，返回实际的下一个读位置，如果已经到达文件末尾，返回NULL；

stop函数

用于在读完seq_file文件后调用，它类似于文件操作close，用于做一些必要的清理，如释放内存等；

show函数

用于格式化输出，如果成功返回0，否则返回出错码。

我们查看一下用来打印/proc/mounts信息对应的seq_file操作函数：

   1: const struct seq_operations mounts_op = {   2:     .start    = m_start,   3:     .next    = m_next,   4:     .stop    = m_stop,   5:     .show    = show_vfsmnt   6: };

依次来看各个函数的实现：

   1: static void *m_start(struct seq_file *m, loff_t *pos)   2: {   3:     struct proc_mounts *p = m->private;   4:     5:     down_read(&namespace_sem);   6:     return seq_list_start(&p->ns->list, *pos);   7: }   8:  

down_read(&namespace_sem);

用来获取读写信号量（rw_semaphore）namespace_sem的读锁，表示当前有一个读者正在读取namespace相关的信息；在读锁被持有期间，写者无法修改这些数据。

   1: static struct list_head *mount_hashtable __read_mostly;   2: static struct kmem_cache *mnt_cache __read_mostly;   3: static struct rw_semaphore namespace_sem;

namespace_sem用来保护对mount_hashtable的并发读写。

struct proc_mounts *p = m->private;

这里可以知道，给mountinfo使用的seq_file的成员private用来保存proc_mounts结构体指针。

   1: struct proc_mounts {   2:     struct seq_file m; /* must be the first element */   3:     struct mnt_namespace *ns;   4:     struct path root;   5:     int event;   6: };

顾名思义，proc_mounts保存的是我们想要的/proc/mounts信息的数据结构。

那么，proc_mounts结构体中的数据是从哪里得到的呢？

   1: static int mounts_open_common(struct inode *inode, struct file *file,   2:                   const struct seq_operations *op)   3: {   4:     struct task_struct *task = get_proc_task(inode);   5:     struct nsproxy *nsp;   6:     struct mnt_namespace *ns = NULL;   7:     struct path root;   8:     struct proc_mounts *p;   9:     int ret = -EINVAL;  10:    11:     if (task) {  12:         rcu_read_lock();  13:         nsp = task_nsproxy(task);  14:         if (nsp) {  15:             ns = nsp->mnt_ns;  16:             if (ns)  17:                 get_mnt_ns(ns);  18:         }  19:         rcu_read_unlock();  20:         if (ns && get_task_root(task, &root) == 0)  21:             ret = 0;  22:         put_task_struct(task);  23:     }  24:    25:     if (!ns)  26:         goto err;  27:     if (ret)  28:         goto err_put_ns;  29:    30:     ret = -ENOMEM;  31:     p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);  32:     if (!p)  33:         goto err_put_path;  34:    35:     file->private_data = &p->m;  36:     ret = seq_open(file, op);  37:     if (ret)  38:         goto err_free;  39:    40:     p->m.private = p;  41:     p->ns = ns;  42:     p->root = root;  43:     p->event = ns->event;  44:    45:     return 0;  46:    47:  err_free:  48:     kfree(p);  49:  err_put_path:  50:     path_put(&root);  51:  err_put_ns:  52:     put_mnt_ns(ns);  53:  err:  54:     return ret;  55: }

首先看到这段代码

    p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
    if (!p)
        goto err_put_path;

    file->private_data = &p->m;
    ret = seq_open(file, op);
    if (ret)
        goto err_free;

    p->m.private = p;
    p->ns = ns;
    p->root = root;
    p->event = ns->event;

可以确定，proc_mounts结构体是在这里创建并且初始化的。其中最重要的数据ns是怎么来的呢？

    if (task) {
        rcu_read_lock();
        nsp = task_nsproxy(task);
        if (nsp) {
            ns = nsp->mnt_ns;
            if (ns)
                get_mnt_ns(ns);
        }
        rcu_read_unlock();
        if (ns && get_task_root(task, &root) == 0)
            ret = 0;
        put_task_struct(task);
    }

   1: /*   2:  * A structure to contain pointers to all per-process   3:  * namespaces - fs (mount), uts, network, sysvipc, etc.   4:  *   5:  * 'count' is the number of tasks holding a reference.   6:  * The count for each namespace, then, will be the number   7:  * of nsproxies pointing to it, not the number of tasks.   8:  *   9:  * The nsproxy is shared by tasks which share all namespaces.  10:  * As soon as a single namespace is cloned or unshared, the  11:  * nsproxy is copied.  12:  */  13: struct nsproxy {  14:     atomic_t count;  15:     struct uts_namespace *uts_ns;  16:     struct ipc_namespace *ipc_ns;  17:     struct mnt_namespace *mnt_ns;  18:     struct pid_namespace *pid_ns;  19:     struct net          *net_ns;  20: };

这段代码就是通过该proc inode所对应的任务（由get_proc_task(inode)得到，通过/proc/self访问时即当前任务）的nsproxy结构体得到mnt_ns数据，并通过get_mnt_ns增加其引用计数。

那么是哪里调到了mounts_open_common函数呢？

   1: static int mounts_open(struct inode *inode, struct file *file)   2: {   3:     return mounts_open_common(inode, file, &mounts_op);   4: }   5:     6: static const struct file_operations proc_mounts_operations = {   7:     .open        = mounts_open,   8:     .read        = seq_read,   9:     .llseek        = seq_lseek,  10:     .release    = mounts_release,  11:     .poll        = mounts_poll,  12: };

我们又在fs/proc/base.c中有了大发现：

   1: static const struct pid_entry tgid_base_stuff[] = {   2:     DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),   3:     DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),   4:     DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),   5:     DIR("ns",      S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),   6: #ifdef CONFIG_NET   7:     DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),   8: #endif   9:     REG("environ",    S_IRUSR, proc_environ_operations),  10:     INF("auxv",       S_IRUSR, proc_pid_auxv),  11:     ONE("status",     S_IRUGO, proc_pid_status),  12:     ONE("personality", S_IRUGO, proc_pid_personality),  13:     INF("limits",      S_IRUGO, proc_pid_limits),  14: #ifdef CONFIG_SCHED_DEBUG  15:     REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),  16: #endif  17: #ifdef CONFIG_SCHED_AUTOGROUP  18:     REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),  19: #endif  20:     REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),  21: #ifdef CONFIG_HAVE_ARCH_TRACEHOOK  22:     INF("syscall",    S_IRUGO, proc_pid_syscall),  23: #endif  24:     INF("cmdline",    S_IRUGO, proc_pid_cmdline),  25:     ONE("stat",       S_IRUGO, proc_tgid_stat),  26:     ONE("statm",      S_IRUGO, proc_pid_statm),  27:     REG("maps",       S_IRUGO, proc_maps_operations),  28: #ifdef CONFIG_NUMA  29:     REG("numa_maps",  S_IRUGO, proc_numa_maps_operations),  30: #endif  31:     REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),  32:     LNK("cwd",        proc_cwd_link),  33:     LNK("root",       proc_root_link),  34:     LNK("exe",        proc_exe_link),  35:     REG("mounts",     S_IRUGO, proc_mounts_operations),  36:     REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),  37:     REG("mountstats", S_IRUSR, proc_mountstats_operations),  38: #ifdef 
CONFIG_PROC_PAGE_MONITOR  39:     REG("clear_refs", S_IWUSR, proc_clear_refs_operations),  40:     REG("smaps",      S_IRUGO, proc_smaps_operations),  41:     REG("pagemap",    S_IRUGO, proc_pagemap_operations),  42: #endif  43: #ifdef CONFIG_SECURITY  44:     DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),  45: #endif  46: #ifdef CONFIG_KALLSYMS  47:     INF("wchan",      S_IRUGO, proc_pid_wchan),  48: #endif  49: #ifdef CONFIG_STACKTRACE  50:     ONE("stack",      S_IRUGO, proc_pid_stack),  51: #endif  52: #ifdef CONFIG_SCHEDSTATS  53:     INF("schedstat",  S_IRUGO, proc_pid_schedstat),  54: #endif  55: #ifdef CONFIG_LATENCYTOP  56:     REG("latency",  S_IRUGO, proc_lstats_operations),  57: #endif  58: #ifdef CONFIG_PROC_PID_CPUSET  59:     REG("cpuset",     S_IRUGO, proc_cpuset_operations),  60: #endif  61: #ifdef CONFIG_CGROUPS  62:     REG("cgroup",  S_IRUGO, proc_cgroup_operations),  63: #endif  64:     INF("oom_score",  S_IRUGO, proc_oom_score),  65:     REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),  66:     REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),  67: #ifdef CONFIG_AUDITSYSCALL  68:     REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),  69:     REG("sessionid",  S_IRUGO, proc_sessionid_operations),  70: #endif  71: #ifdef CONFIG_FAULT_INJECTION  72:     REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),  73: #endif  74: #ifdef CONFIG_ELF_CORE  75:     REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),  76: #endif  77: #ifdef CONFIG_TASK_IO_ACCOUNTING  78:     INF("io",    S_IRUSR, proc_tgid_io_accounting),  79: #endif  80: #ifdef CONFIG_HARDWALL  81:     INF("hardwall",   S_IRUGO, proc_pid_hardwall),  82: #endif  83: };

这里定义着在每个/proc/[pid]下面的所有目录项

那么/proc/mounts呢，我们查看一下/proc/mounts的信息：

   1: #ls -l /proc   2: ......   3: lrwxrwxrwx  1 root       root               11 2014-01-26 22:11 mounts -> self/mounts   4: ......   5: lrwxrwxrwx  1 root       root               64 2014-01-23 01:22 self -> 10590   6: ......

因此，一切都明了了，/proc/mounts其实是到当前任务的/proc/self/mounts的软链接。

proc_mounts的数据源头，以及生成数据的调用层次问题已经找到了答案，接下来再回过头来看看seq_file。

return seq_list_start(&p->ns->list, *pos);

   1: struct list_head *seq_list_start(struct list_head *head, loff_t pos)   2: {   3:     struct list_head *lh;   4:     5:     list_for_each(lh, head)   6:         if (pos-- == 0)   7:             return lh;   8:     9:     return NULL;  10: }  11: EXPORT_SYMBOL(seq_list_start);

其实很简单，就是返回到双链表head的第pos项的位置指针。如果pos超出了head双链表中的项目数目，就返回NULL。

可见，这是为了读取seq_file中的内容做准备。

对于m_next和m_stop的逻辑也很简单，不再详述。

   1: static void *m_next(struct seq_file *m, void *v, loff_t *pos)   2: {   3:     struct proc_mounts *p = m->private;   4:     5:     return seq_list_next(v, &p->ns->list, pos);   6: }   7:     8: static void m_stop(struct seq_file *m, void *v)   9: {  10:     up_read(&namespace_sem);  11: }

   1: struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)   2: {   3:     struct list_head *lh;   4:     5:     lh = ((struct list_head *)v)->next;   6:     ++*ppos;   7:     return lh == head ? NULL : lh;   8: }   9: EXPORT_SYMBOL(seq_list_next);

总结一下，就是m_start/m_next向外界暴露proc_mounts->ns->list的位置指针，允许外界对其内容进行读取。

m_stop用来当读取结束后做清理工作，这里是恢复namespace_sem信号量。

显示函数

   1: static int show_vfsmnt(struct seq_file *m, void *v)   2: {   3:     struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);   4:     int err = 0;   5:     struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };   6:     7:     if (mnt->mnt_sb->s_op->show_devname) {   8:         err = mnt->mnt_sb->s_op->show_devname(m, mnt);   9:         if (err)  10:             goto out;  11:     } else {  12:         mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");  13:     }  14:     seq_putc(m, ' ');  15:     seq_path(m, &mnt_path, " \t\n\\");  16:     seq_putc(m, ' ');  17:     show_type(m, mnt->mnt_sb);  18:     seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");  19:     err = show_sb_opts(m, mnt->mnt_sb);  20:     if (err)  21:         goto out;  22:     show_mnt_opts(m, mnt);  23:     if (mnt->mnt_sb->s_op->show_options)  24:         err = mnt->mnt_sb->s_op->show_options(m, mnt);  25:     seq_puts(m, " 0 0\n");  26: out:  27:     return err;  28: }

从show函数来看，v指针指向的是vfsmount结构体中的mnt_list成员，show函数先通过list_entry得到包含该成员的vfsmount结构体，再将这个vfsmount的信息（设备名、挂载路径、文件系统类型、挂载选项等）以一定的格式写到seq_file的buffer里面去。

这里可以确定，v实际上是m_start/m_next返回的链表节点，即以下面这个链表头串起来的各个vfsmount的mnt_list成员（其中p = m->private）：

p->ns->list

接下来我们看一下，这些简单的功能(m_start/m_next/m_stop/show_vfsmnt)是怎样发挥作用的：

   1: ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)   2: {   3:     struct seq_file *m = file->private_data;   4:     size_t copied = 0;   5:     loff_t pos;   6:     size_t n;   7:     void *p;   8:     int err = 0;   9:    10:     mutex_lock(&m->lock);  11:    12:     /* Don't assume *ppos is where we left it */  13:     if (unlikely(*ppos != m->read_pos)) {  14:         m->read_pos = *ppos;  15:         while ((err = traverse(m, *ppos)) == -EAGAIN)  16:             ;  17:         if (err) {  18:             /* With prejudice... */  19:             m->read_pos = 0;  20:             m->version = 0;  21:             m->index = 0;  22:             m->count = 0;  23:             goto Done;  24:         }  25:     }  26:    27:     /*  28:      * seq_file->op->..m_start/m_stop/m_next may do special actions  29:      * or optimisations based on the file->f_version, so we want to  30:      * pass the file->f_version to those methods.  31:      *  32:      * seq_file->version is just copy of f_version, and seq_file  33:      * methods can treat it simply as file version.  34:      * It is copied in first and copied out after all operations.  35:      * It is convenient to have it as  part of structure to avoid the  36:      * need of passing another argument to all the seq_file methods.  
37:      */  38:     m->version = file->f_version;  39:     /* grab buffer if we didn't have one */  40:     if (!m->buf) {  41:         m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);  42:         if (!m->buf)  43:             goto Enomem;  44:     }  45:     /* if not empty - flush it first */  46:     if (m->count) {  47:         n = min(m->count, size);  48:         err = copy_to_user(buf, m->buf + m->from, n);  49:         if (err)  50:             goto Efault;  51:         m->count -= n;  52:         m->from += n;  53:         size -= n;  54:         buf += n;  55:         copied += n;  56:         if (!m->count)  57:             m->index++;  58:         if (!size)  59:             goto Done;  60:     }  61:     /* we need at least one record in buffer */  62:     pos = m->index;  63:     p = m->op->start(m, &pos);  64:     while (1) {  65:         err = PTR_ERR(p);  66:         if (!p || IS_ERR(p))  67:             break;  68:         err = m->op->show(m, p);  69:         if (err < 0)  70:             break;  71:         if (unlikely(err))  72:             m->count = 0;  73:         if (unlikely(!m->count)) {  74:             p = m->op->next(m, p, &pos);  75:             m->index = pos;  76:             continue;  77:         }  78:         if (m->count < m->size)  79:             goto Fill;  80:         m->op->stop(m, p);  81:         kfree(m->buf);  82:         m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);  83:         if (!m->buf)  84:             goto Enomem;  85:         m->count = 0;  86:         m->version = 0;  87:         pos = m->index;  88:         p = m->op->start(m, &pos);  89:     }  90:     m->op->stop(m, p);  91:     m->count = 0;  92:     goto Done;  93: Fill:  94:     /* they want more? 
let's try to get some more */  95:     while (m->count < size) {  96:         size_t offs = m->count;  97:         loff_t next = pos;  98:         p = m->op->next(m, p, &next);  99:         if (!p || IS_ERR(p)) { 100:             err = PTR_ERR(p); 101:             break; 102:         } 103:         err = m->op->show(m, p); 104:         if (m->count == m->size || err) { 105:             m->count = offs; 106:             if (likely(err <= 0)) 107:                 break; 108:         } 109:         pos = next; 110:     } 111:     m->op->stop(m, p); 112:     n = min(m->count, size); 113:     err = copy_to_user(buf, m->buf, n); 114:     if (err) 115:         goto Efault; 116:     copied += n; 117:     m->count -= n; 118:     if (m->count) 119:         m->from = n; 120:     else 121:         pos++; 122:     m->index = pos; 123: Done: 124:     if (!copied) 125:         copied = err; 126:     else { 127:         *ppos += copied; 128:         m->read_pos += copied; 129:     } 130:     file->f_version = m->version; 131:     mutex_unlock(&m->lock); 132:     return copied; 133: Enomem: 134:     err = -ENOMEM; 135:     goto Done; 136: Efault: 137:     err = -EFAULT; 138:     goto Done; 139: }

seq_read，显然是用来读取文件内容的，但是其接口并不是seq_file，而是file，这就表明这个接口是把seq_file的实现细节隐藏在了该函数的内部，而对于外面来说，可以通过常用的struct file接口来调用该函数。

因此该函数起到了Adapter的作用。

下面这段是核心代码

    pos = m->index;
    p = m->op->start(m, &pos);
    while (1) {
        err = PTR_ERR(p);
        if (!p || IS_ERR(p))
            break;
        err = m->op->show(m, p);
        if (err < 0)
            break;
        if (unlikely(err))
            m->count = 0;
        if (unlikely(!m->count)) {
            p = m->op->next(m, p, &pos);
            m->index = pos;
            continue;
        }
        if (m->count < m->size)
            goto Fill;
        m->op->stop(m, p);
        kfree(m->buf);
        m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
        if (!m->buf)
            goto Enomem;
        m->count = 0;
        m->version = 0;
        pos = m->index;
        p = m->op->start(m, &pos);
    }
    m->op->stop(m, p);
    m->count = 0;
    goto Done;

~~如果err代表出错，则使用m_next读取下一条，因此控制逻辑在show中，如果没有读完，就返回出错的信息。~~

err < 0, 代表show函数出错，跳出循环，调用stop并把错误码返回给调用者；

err > 0, 代表show函数要求跳过当前这条记录，此时m->count被清零（丢弃本条输出），然后调用next函数来处理下一条；

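err = 0, 代表show函数成功：如果这条记录没有产生任何输出（m->count为0），则调用next函数处理下一条；如果buffer没有被写满（m->count < m->size），则跳转到Fill继续填充更多记录；如果buffer已被写满（说明buffer不够大，记录可能被截断），则将buffer大小调整为原来的2倍，再从m->index处重新调用start尝试重新读。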

其中，m->count代表已经写入m->buf中的字节数目，m->size代表m->buf这个缓冲区的总大小（初始为PAGE_SIZE，不够时每次翻倍）。