PROC文件系统

1. seq_file

参考: http://blog.chinaunix.net/uid-26084833-id-1754437.html

seq_file的结构体定义:

   1: struct seq_file {
   2:     char *buf;
   3:     size_t size;
   4:     size_t from;
   5:     size_t count;
   6:     loff_t index;
   7:     loff_t read_pos;
   8:     u64 version;
   9:     struct mutex lock;
  10:     const struct seq_operations *op;
  11:     void *private;
  12: };

seq_operations的定义:

   1: struct seq_operations {
   2:     void * (*start) (struct seq_file *m, loff_t *pos);
   3:     void (*stop) (struct seq_file *m, void *v);
   4:     void * (*next) (struct seq_file *m, void *v, loff_t *pos);
   5:     int (*show) (struct seq_file *m, void *v);
   6: };

start函数

用于指定seq_file文件的读开始位置,返回实际读开始位置,如果指定的位置超过文件末尾,应当返回NULL,start函数可以有一个特殊的返回SEQ_START_TOKEN,它用于让show函数输出文件头,但这只能在pos为0时使用;

next函数

用于把seq_file 文件的当前读位置移动到下一个读位置,返回实际的下一个读位置,如果已经到达文件末尾,返回NULL;

stop函数

用于在读完seq_file文件后调用,它类似于文件操作close,用于做一些必要的清理,如释放内存等;

show函数

用于格式化输出,如果成功返回0,否则返回出错码。

我们查看一下用来打印/proc/mounts信息对应的seq_file操作函数:

   1: const struct seq_operations mounts_op = {
   2:     .start    = m_start,
   3:     .next    = m_next,
   4:     .stop    = m_stop,
   5:     .show    = show_vfsmnt
   6: };

依次来看各个函数的实现:

   1: static void *m_start(struct seq_file *m, loff_t *pos)
   2: {
   3:     struct proc_mounts *p = m->private;
   4:  
   5:     down_read(&namespace_sem);
   6:     return seq_list_start(&p->ns->list, *pos);
   7: }
   8:  

down_read(&namespace_sem);

用来以读者(共享)方式获取读写信号量namespace_sem:多个读者可以同时持有,但与写者互斥。这里是为了在遍历namespace的挂载链表期间防止其被并发修改,与之配对的up_read在m_stop中调用。

   1: static struct list_head *mount_hashtable __read_mostly;
   2: static struct kmem_cache *mnt_cache __read_mostly;
   3: static struct rw_semaphore namespace_sem;

namespace_sem用来保护对mount_hashtable的并发读写。

struct proc_mounts *p = m->private;

这里可以知道,给/proc/mounts使用的seq_file的成员private用来保存proc_mounts结构体指针。

   1: struct proc_mounts {
   2:     struct seq_file m; /* must be the first element */
   3:     struct mnt_namespace *ns;
   4:     struct path root;
   5:     int event;
   6: };

顾名思义,proc_mounts保存的是我们想要的/proc/mounts信息的数据结构。

那么,proc_mounts结构体中的数据是从哪里得到的呢?

   1: static int mounts_open_common(struct inode *inode, struct file *file,
   2:                   const struct seq_operations *op)
   3: {
   4:     struct task_struct *task = get_proc_task(inode);
   5:     struct nsproxy *nsp;
   6:     struct mnt_namespace *ns = NULL;
   7:     struct path root;
   8:     struct proc_mounts *p;
   9:     int ret = -EINVAL;
  10:  
  11:     if (task) {
  12:         rcu_read_lock();
  13:         nsp = task_nsproxy(task);
  14:         if (nsp) {
  15:             ns = nsp->mnt_ns;
  16:             if (ns)
  17:                 get_mnt_ns(ns);
  18:         }
  19:         rcu_read_unlock();
  20:         if (ns && get_task_root(task, &root) == 0)
  21:             ret = 0;
  22:         put_task_struct(task);
  23:     }
  24:  
  25:     if (!ns)
  26:         goto err;
  27:     if (ret)
  28:         goto err_put_ns;
  29:  
  30:     ret = -ENOMEM;
  31:     p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
  32:     if (!p)
  33:         goto err_put_path;
  34:  
  35:     file->private_data = &p->m;
  36:     ret = seq_open(file, op);
  37:     if (ret)
  38:         goto err_free;
  39:  
  40:     p->m.private = p;
  41:     p->ns = ns;
  42:     p->root = root;
  43:     p->event = ns->event;
  44:  
  45:     return 0;
  46:  
  47:  err_free:
  48:     kfree(p);
  49:  err_put_path:
  50:     path_put(&root);
  51:  err_put_ns:
  52:     put_mnt_ns(ns);
  53:  err:
  54:     return ret;
  55: }

首先看到这段代码

    p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
    if (!p)
        goto err_put_path;

    file->private_data = &p->m;
    ret = seq_open(file, op);
    if (ret)
        goto err_free;

    p->m.private = p;
    p->ns = ns;
    p->root = root;
    p->event = ns->event;

可以确定,proc_mounts结构体是在这里创建并且初始化的。其中最重要的数据ns是怎么来的呢?

    if (task) {
        rcu_read_lock();
        nsp = task_nsproxy(task);
        if (nsp) {
            ns = nsp->mnt_ns;
            if (ns)
                get_mnt_ns(ns);
        }
        rcu_read_unlock();
        if (ns && get_task_root(task, &root) == 0)
            ret = 0;
        put_task_struct(task);
    }

   1: /*
   2:  * A structure to contain pointers to all per-process
   3:  * namespaces - fs (mount), uts, network, sysvipc, etc.
   4:  *
   5:  * 'count' is the number of tasks holding a reference.
   6:  * The count for each namespace, then, will be the number
   7:  * of nsproxies pointing to it, not the number of tasks.
   8:  *
   9:  * The nsproxy is shared by tasks which share all namespaces.
  10:  * As soon as a single namespace is cloned or unshared, the
  11:  * nsproxy is copied.
  12:  */
  13: struct nsproxy {
  14:     atomic_t count;
  15:     struct uts_namespace *uts_ns;
  16:     struct ipc_namespace *ipc_ns;
  17:     struct mnt_namespace *mnt_ns;
  18:     struct pid_namespace *pid_ns;
  19:     struct net          *net_ns;
  20: };
这段代码先通过get_proc_task(inode)找到该proc inode所对应的任务,再由该任务的nsproxy结构体得到mnt_ns数据(并用get_mnt_ns增加其引用计数)。

那么是哪里调到了mounts_open_common函数呢?

   1: static int mounts_open(struct inode *inode, struct file *file)
   2: {
   3:     return mounts_open_common(inode, file, &mounts_op);
   4: }
   5:  
   6: static const struct file_operations proc_mounts_operations = {
   7:     .open        = mounts_open,
   8:     .read        = seq_read,
   9:     .llseek        = seq_lseek,
  10:     .release    = mounts_release,
  11:     .poll        = mounts_poll,
  12: };

我们又在fs/proc/base.c中有了大发现:

   1: static const struct pid_entry tgid_base_stuff[] = {
   2:     DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
   3:     DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
   4:     DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
   5:     DIR("ns",      S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
   6: #ifdef CONFIG_NET
   7:     DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
   8: #endif
   9:     REG("environ",    S_IRUSR, proc_environ_operations),
  10:     INF("auxv",       S_IRUSR, proc_pid_auxv),
  11:     ONE("status",     S_IRUGO, proc_pid_status),
  12:     ONE("personality", S_IRUGO, proc_pid_personality),
  13:     INF("limits",      S_IRUGO, proc_pid_limits),
  14: #ifdef CONFIG_SCHED_DEBUG
  15:     REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
  16: #endif
  17: #ifdef CONFIG_SCHED_AUTOGROUP
  18:     REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
  19: #endif
  20:     REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
  21: #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
  22:     INF("syscall",    S_IRUGO, proc_pid_syscall),
  23: #endif
  24:     INF("cmdline",    S_IRUGO, proc_pid_cmdline),
  25:     ONE("stat",       S_IRUGO, proc_tgid_stat),
  26:     ONE("statm",      S_IRUGO, proc_pid_statm),
  27:     REG("maps",       S_IRUGO, proc_maps_operations),
  28: #ifdef CONFIG_NUMA
  29:     REG("numa_maps",  S_IRUGO, proc_numa_maps_operations),
  30: #endif
  31:     REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
  32:     LNK("cwd",        proc_cwd_link),
  33:     LNK("root",       proc_root_link),
  34:     LNK("exe",        proc_exe_link),
  35:     REG("mounts",     S_IRUGO, proc_mounts_operations),
  36:     REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
  37:     REG("mountstats", S_IRUSR, proc_mountstats_operations),
  38: #ifdef CONFIG_PROC_PAGE_MONITOR
  39:     REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
  40:     REG("smaps",      S_IRUGO, proc_smaps_operations),
  41:     REG("pagemap",    S_IRUGO, proc_pagemap_operations),
  42: #endif
  43: #ifdef CONFIG_SECURITY
  44:     DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
  45: #endif
  46: #ifdef CONFIG_KALLSYMS
  47:     INF("wchan",      S_IRUGO, proc_pid_wchan),
  48: #endif
  49: #ifdef CONFIG_STACKTRACE
  50:     ONE("stack",      S_IRUGO, proc_pid_stack),
  51: #endif
  52: #ifdef CONFIG_SCHEDSTATS
  53:     INF("schedstat",  S_IRUGO, proc_pid_schedstat),
  54: #endif
  55: #ifdef CONFIG_LATENCYTOP
  56:     REG("latency",  S_IRUGO, proc_lstats_operations),
  57: #endif
  58: #ifdef CONFIG_PROC_PID_CPUSET
  59:     REG("cpuset",     S_IRUGO, proc_cpuset_operations),
  60: #endif
  61: #ifdef CONFIG_CGROUPS
  62:     REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  63: #endif
  64:     INF("oom_score",  S_IRUGO, proc_oom_score),
  65:     REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
  66:     REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  67: #ifdef CONFIG_AUDITSYSCALL
  68:     REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
  69:     REG("sessionid",  S_IRUGO, proc_sessionid_operations),
  70: #endif
  71: #ifdef CONFIG_FAULT_INJECTION
  72:     REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
  73: #endif
  74: #ifdef CONFIG_ELF_CORE
  75:     REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
  76: #endif
  77: #ifdef CONFIG_TASK_IO_ACCOUNTING
  78:     INF("io",    S_IRUSR, proc_tgid_io_accounting),
  79: #endif
  80: #ifdef CONFIG_HARDWALL
  81:     INF("hardwall",   S_IRUGO, proc_pid_hardwall),
  82: #endif
  83: };

这里定义着在每个/proc/[pid]下面的所有目录项

那么/proc/mounts呢,我们查看一下/proc/mounts的信息:

   1: #ls -l /proc
   2: ......
   3: lrwxrwxrwx  1 root       root               11 2014-01-26 22:11 mounts -> self/mounts
   4: ......
   5: lrwxrwxrwx  1 root       root               64 2014-01-23 01:22 self -> 10590
   6: ......

因此,一切都明了了,/proc/mounts其实是到当前任务的/proc/self/mounts的软链接。

proc_mounts的数据源头,以及生成数据的调用层次问题已经找到了答案,接下来再回过头来看看seq_file。

return seq_list_start(&p->ns->list, *pos);

   1: struct list_head *seq_list_start(struct list_head *head, loff_t pos)
   2: {
   3:     struct list_head *lh;
   4:  
   5:     list_for_each(lh, head)
   6:         if (pos-- == 0)
   7:             return lh;
   8:  
   9:     return NULL;
  10: }
  11: EXPORT_SYMBOL(seq_list_start);

其实很简单,就是返回到双链表head的第pos项的位置指针。如果pos超出了head双链表中的项目数目,就返回NULL。

可见,这是为了读取seq_file中的内容做准备。

对于m_next和m_stop的逻辑也很简单,不再详述。

   1: static void *m_next(struct seq_file *m, void *v, loff_t *pos)
   2: {
   3:     struct proc_mounts *p = m->private;
   4:  
   5:     return seq_list_next(v, &p->ns->list, pos);
   6: }
   7:  
   8: static void m_stop(struct seq_file *m, void *v)
   9: {
  10:     up_read(&namespace_sem);
  11: }
   1: struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
   2: {
   3:     struct list_head *lh;
   4:  
   5:     lh = ((struct list_head *)v)->next;
   6:     ++*ppos;
   7:     return lh == head ? NULL : lh;
   8: }
   9: EXPORT_SYMBOL(seq_list_next);

总结一下,就是m_start/m_next向外界暴露proc_mounts->ns->list的位置指针,允许外界对其内容进行读取。

m_stop用来当读取结束后做清理工作,这里是调用up_read释放m_start中通过down_read获取的namespace_sem读锁。

显示函数

   1: static int show_vfsmnt(struct seq_file *m, void *v)
   2: {
   3:     struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
   4:     int err = 0;
   5:     struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
   6:  
   7:     if (mnt->mnt_sb->s_op->show_devname) {
   8:         err = mnt->mnt_sb->s_op->show_devname(m, mnt);
   9:         if (err)
  10:             goto out;
  11:     } else {
  12:         mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
  13:     }
  14:     seq_putc(m, ' ');
  15:     seq_path(m, &mnt_path, " \t\n\\");
  16:     seq_putc(m, ' ');
  17:     show_type(m, mnt->mnt_sb);
  18:     seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
  19:     err = show_sb_opts(m, mnt->mnt_sb);
  20:     if (err)
  21:         goto out;
  22:     show_mnt_opts(m, mnt);
  23:     if (mnt->mnt_sb->s_op->show_options)
  24:         err = mnt->mnt_sb->s_op->show_options(m, mnt);
  25:     seq_puts(m, " 0 0\n");
  26: out:
  27:     return err;
  28: }

从show函数来看,它先通过list_entry由v(指向某个vfsmount结构体的mnt_list成员)取得所在的vfsmount结构体,再将该挂载点的信息(设备名、挂载路径、文件系统类型、挂载选项等)以一定的格式写到seq_file的buffer里面去。

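结合m_start/m_next的实现可以确定,v并不是m->private->ns->root,而是seq_list_start/seq_list_next返回的链表节点,即

m->private->ns->list 链表中某个vfsmount的mnt_list成员的地址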

接下来我们看一下,这些简单的功能(m_start/m_next/m_stop/show_vfsmnt)是怎样发挥作用的:

   1: ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
   2: {
   3:     struct seq_file *m = file->private_data;
   4:     size_t copied = 0;
   5:     loff_t pos;
   6:     size_t n;
   7:     void *p;
   8:     int err = 0;
   9:  
  10:     mutex_lock(&m->lock);
  11:  
  12:     /* Don't assume *ppos is where we left it */
  13:     if (unlikely(*ppos != m->read_pos)) {
  14:         m->read_pos = *ppos;
  15:         while ((err = traverse(m, *ppos)) == -EAGAIN)
  16:             ;
  17:         if (err) {
  18:             /* With prejudice... */
  19:             m->read_pos = 0;
  20:             m->version = 0;
  21:             m->index = 0;
  22:             m->count = 0;
  23:             goto Done;
  24:         }
  25:     }
  26:  
  27:     /*
  28:      * seq_file->op->..m_start/m_stop/m_next may do special actions
  29:      * or optimisations based on the file->f_version, so we want to
  30:      * pass the file->f_version to those methods.
  31:      *
  32:      * seq_file->version is just copy of f_version, and seq_file
  33:      * methods can treat it simply as file version.
  34:      * It is copied in first and copied out after all operations.
  35:      * It is convenient to have it as  part of structure to avoid the
  36:      * need of passing another argument to all the seq_file methods.
  37:      */
  38:     m->version = file->f_version;
  39:     /* grab buffer if we didn't have one */
  40:     if (!m->buf) {
  41:         m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
  42:         if (!m->buf)
  43:             goto Enomem;
  44:     }
  45:     /* if not empty - flush it first */
  46:     if (m->count) {
  47:         n = min(m->count, size);
  48:         err = copy_to_user(buf, m->buf + m->from, n);
  49:         if (err)
  50:             goto Efault;
  51:         m->count -= n;
  52:         m->from += n;
  53:         size -= n;
  54:         buf += n;
  55:         copied += n;
  56:         if (!m->count)
  57:             m->index++;
  58:         if (!size)
  59:             goto Done;
  60:     }
  61:     /* we need at least one record in buffer */
  62:     pos = m->index;
  63:     p = m->op->start(m, &pos);
  64:     while (1) {
  65:         err = PTR_ERR(p);
  66:         if (!p || IS_ERR(p))
  67:             break;
  68:         err = m->op->show(m, p);
  69:         if (err < 0)
  70:             break;
  71:         if (unlikely(err))
  72:             m->count = 0;
  73:         if (unlikely(!m->count)) {
  74:             p = m->op->next(m, p, &pos);
  75:             m->index = pos;
  76:             continue;
  77:         }
  78:         if (m->count < m->size)
  79:             goto Fill;
  80:         m->op->stop(m, p);
  81:         kfree(m->buf);
  82:         m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
  83:         if (!m->buf)
  84:             goto Enomem;
  85:         m->count = 0;
  86:         m->version = 0;
  87:         pos = m->index;
  88:         p = m->op->start(m, &pos);
  89:     }
  90:     m->op->stop(m, p);
  91:     m->count = 0;
  92:     goto Done;
  93: Fill:
  94:     /* they want more? let's try to get some more */
  95:     while (m->count < size) {
  96:         size_t offs = m->count;
  97:         loff_t next = pos;
  98:         p = m->op->next(m, p, &next);
  99:         if (!p || IS_ERR(p)) {
 100:             err = PTR_ERR(p);
 101:             break;
 102:         }
 103:         err = m->op->show(m, p);
 104:         if (m->count == m->size || err) {
 105:             m->count = offs;
 106:             if (likely(err <= 0))
 107:                 break;
 108:         }
 109:         pos = next;
 110:     }
 111:     m->op->stop(m, p);
 112:     n = min(m->count, size);
 113:     err = copy_to_user(buf, m->buf, n);
 114:     if (err)
 115:         goto Efault;
 116:     copied += n;
 117:     m->count -= n;
 118:     if (m->count)
 119:         m->from = n;
 120:     else
 121:         pos++;
 122:     m->index = pos;
 123: Done:
 124:     if (!copied)
 125:         copied = err;
 126:     else {
 127:         *ppos += copied;
 128:         m->read_pos += copied;
 129:     }
 130:     file->f_version = m->version;
 131:     mutex_unlock(&m->lock);
 132:     return copied;
 133: Enomem:
 134:     err = -ENOMEM;
 135:     goto Done;
 136: Efault:
 137:     err = -EFAULT;
 138:     goto Done;
 139: }

seq_read,显然是用来读取文件内容的,但是其接口并不是seq_file,而是file,这就表明这个接口是把seq_file的实现细节隐藏在了该函数的内部,而对于外面来说,可以通过常用的struct file接口来调用该函数。

因此该函数起到了Adapter的作用。

下面这段是核心代码

    pos = m->index;
    p = m->op->start(m, &pos);
    while (1) {
        err = PTR_ERR(p);
        if (!p || IS_ERR(p))
            break;
        err = m->op->show(m, p);
        if (err < 0)
            break;
        if (unlikely(err))
            m->count = 0;
        if (unlikely(!m->count)) {
            p = m->op->next(m, p, &pos);
            m->index = pos;
            continue;
        }
        if (m->count < m->size)
            goto Fill;
        m->op->stop(m, p);
        kfree(m->buf);
        m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
        if (!m->buf)
            goto Enomem;
        m->count = 0;
        m->version = 0;
        pos = m->index;
        p = m->op->start(m, &pos);
    }
    m->op->stop(m, p);
    m->count = 0;
    goto Done;

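循环的控制逻辑取决于start/next返回的指针p、show的返回值err以及m->count:

p为NULL或错误指针(IS_ERR), 代表已到达末尾或start/next出错,跳出循环;

err < 0, 代表show出错,跳出循环;

err > 0, 代表show要求跳过这条记录,于是把m->count清零;只要m->count为0(这条记录没有产生输出),就调用m_next读下一条;

err = 0 且 m->count < m->size, 代表这条记录已完整写入buffer,跳到Fill继续填充更多记录;

err = 0 且 m->count == m->size, 代表buffer已被写满(记录可能被截断),于是先调用stop,将buffer大小调整为原来的2倍,再从m->index处重新start并重试。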

其中,m->count代表已经写入m->buf中的字节数目,m->size代表m->buf缓冲区的总大小(初始为PAGE_SIZE,不够时翻倍)。

原文地址:https://www.cnblogs.com/long123king/p/3534989.html