files_struct/fdtable解析

include/linux/fdtable.h

/*
 * Open file table structure
 *
 * Per-"files" state that holds the currently active fd table plus a small
 * built-in table (fdtab and the *_init / fd_array members below) that
 * dup_fd() wires up first and keeps whenever NR_OPEN_DEFAULT slots are
 * enough.
 */
struct files_struct {
  /*
   * read mostly part
   */
    /* atomic counter; dup_fd() starts a fresh copy at 1 (presumably a reference count) */
    atomic_t count;
    /*
     * dup_fd() initialises these to "false" and an empty wait queue;
     * presumably used to serialise fd table expansion — confirm against
     * the table-resizing code, which is outside this excerpt.
     */
    bool resize_in_progress;
    wait_queue_head_t resize_wait;

    /* RCU-published pointer to the table in use: &fdtab or one from alloc_fdtable() */
    struct fdtable __rcu *fdt;
    /* built-in table; dup_fd() points its members at the *_init arrays and fd_array below */
    struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
    /* taken by dup_fd() while it reads the source table and copies its entries */
    spinlock_t file_lock ____cacheline_aligned_in_smp;
    /* set to 0 by dup_fd(); presumably a hint for the next fd to try — confirm */
    unsigned int next_fd;
    /* one long each: bitmap storage backing the built-in table (fdtab) */
    unsigned long close_on_exec_init[1];
    unsigned long open_fds_init[1];
    unsigned long full_fds_bits_init[1];
    /* struct file pointer slots backing the built-in table (fdtab.fd) */
    struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

上述files_struct中最关键的成员是struct fdtable的fdt指针

fork时，如果父进程需要拷贝的fd范围（由sane_fdtable_size()计算得到的open_files）不超过NR_OPEN_DEFAULT，则子进程的fd table会直接使用files_struct里内嵌的fdtab；如果超过NR_OPEN_DEFAULT，则不会使用files_struct里内嵌的fdtab，而是调用alloc_fdtable()另外分配一个fd table。

然后将父进程的fd table拷贝到新fork的进程的fd table。

NR_OPEN_DEFAULT定义为BITS_PER_LONG，一般为64。对于较大进程fork子进程时，父进程此时一般已经open了较多file，比如超过了64，此时就会alloc fd table，而不会使用files_struct里的default fd table；对于小进程fork子进程，此时一般就直接用了files_struct里default fd table。

fs/file.c

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 * errorp will be valid only when the returned files_struct is NULL.
 *
 * @oldf:    source files_struct; its file_lock is taken while the table is
 *           read and copied.
 * @max_fds: forwarded to sane_fdtable_size(); presumably an upper bound on
 *           how many descriptors are carried over — confirm against that
 *           helper, which is outside this excerpt.
 * @errorp:  set to -ENOMEM when an allocation fails, or to -EMFILE when the
 *           table returned by alloc_fdtable() is still too small.
 *
 * Returns the new files_struct (count == 1, every copied struct file
 * pinned with get_file()), or NULL on failure.
 */
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
    struct files_struct *newf;
    struct file **old_fds, **new_fds;
    unsigned int open_files, i;
    struct fdtable *old_fdt, *new_fdt;

    /* default error for the allocation failure paths below */
    *errorp = -ENOMEM;
    newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
    if (!newf)
        goto out;

    atomic_set(&newf->count, 1);

    spin_lock_init(&newf->file_lock);
    newf->resize_in_progress = false;
    init_waitqueue_head(&newf->resize_wait);
    newf->next_fd = 0;
    /*
     * Start with the table embedded in newf: NR_OPEN_DEFAULT slots backed
     * by the *_init bitmaps and fd_array inside the files_struct itself.
     */
    new_fdt = &newf->fdtab;
    new_fdt->max_fds = NR_OPEN_DEFAULT;
    new_fdt->close_on_exec = newf->close_on_exec_init;
    new_fdt->open_fds = newf->open_fds_init;
    new_fdt->full_fds_bits = newf->full_fds_bits_init;
    new_fdt->fd = &newf->fd_array[0];

    /* the source table is only read with oldf->file_lock held */
    spin_lock(&oldf->file_lock);
    old_fdt = files_fdtable(oldf);
    open_files = sane_fdtable_size(old_fdt, max_fds);

    /*
     * Check whether we need to allocate a larger fd array and fd set.
     *
     * This is a retry loop: the lock is dropped around the allocation, so
     * the required size is recomputed after relocking and the loop runs
     * again if the table obtained is no longer big enough.
     */
    while (unlikely(open_files > new_fdt->max_fds)) {
        spin_unlock(&oldf->file_lock);

        /* discard a table allocated by an earlier pass; never the embedded one */
        if (new_fdt != &newf->fdtab)
            __free_fdtable(new_fdt);

        /* alloc_fdtable() is handed open_files - 1, i.e. the last slot index needed */
        new_fdt = alloc_fdtable(open_files - 1);
        if (!new_fdt) {
            *errorp = -ENOMEM;
            goto out_release;
        }

        /* beyond sysctl_nr_open; nothing to do */
        if (unlikely(new_fdt->max_fds < open_files)) {
            __free_fdtable(new_fdt);
            *errorp = -EMFILE;
            goto out_release;
        }

        /*
         * Reacquire the oldf lock and a pointer to its fd table
         * who knows it may have a new bigger fd table. We need
         * the latest pointer.
         */
        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = sane_fdtable_size(old_fdt, max_fds);
    }

    /* still under oldf->file_lock: copy the bitmaps, then the file pointers */
    copy_fd_bitmaps(new_fdt, old_fdt, open_files);

    old_fds = old_fdt->fd;
    new_fds = new_fdt->fd;

    /* i counts down, so (open_files - i) is the slot index 0 .. open_files - 1 */
    for (i = open_files; i != 0; i--) {
        struct file *f = *old_fds++;
        if (f) {
            /* the new table holds its own reference on every copied file */
            get_file(f);
        } else {
            /*
             * The fd may be claimed in the fd bitmap but not yet
             * instantiated in the files array if a sibling thread
             * is partway through open().  So make sure that this
             * fd is available to the new process.
             */
            __clear_open_fd(open_files - i, new_fdt);
        }
        rcu_assign_pointer(*new_fds++, f);
    }
    spin_unlock(&oldf->file_lock);

    /*
     * clear the remainder
     *
     * new_fds now points just past the last copied slot; only the new,
     * not yet published table is touched here, after the lock is dropped.
     */
    memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

    /* publish the fully populated table as newf's current fd table */
    rcu_assign_pointer(newf->fdt, new_fdt);

    return newf;

out_release:
    /* any separately allocated table was already freed (or never obtained) before jumping here */
    kmem_cache_free(files_cachep, newf);
out:
    return NULL;
}

fd table里的内容

include/linux/fdtable.h

/*
 * One fd table: an array of struct file pointers plus the bitmaps that
 * describe it.  It is either embedded in files_struct (fdtab) or built by
 * alloc_fdtable().
 */
struct fdtable {
    /* number of slots in fd[]; alloc_fdtable() keeps it a multiple of BITS_PER_LONG */
    unsigned int max_fds;
    struct file __rcu **fd;      /* current fd array */
    /* per-fd bitmap; presumably marks fds to be closed on exec — confirm */
    unsigned long *close_on_exec;
    /* per-fd bitmap of claimed fds (dup_fd() clears a bit via __clear_open_fd()) */
    unsigned long *open_fds;
    /*
     * BITBIT_SIZE(max_fds) bytes in alloc_fdtable(), i.e. one bit per long
     * of open_fds; presumably set when that long is completely full — confirm.
     */
    unsigned long *full_fds_bits;
    /* RCU callback head; presumably used to defer freeing the table — confirm */
    struct rcu_head rcu;
};

max_fds，表示此fd table最多能容纳多少个fd；

struct file的二重指针fd，这个是指向一个数组，这个数组里的元素是struct file *指针；

open_fds，long型指针，指向一个long型数组，数组中每个元素的每一个bit代表一个fd，如果这个bit为1，表示此fd已被占用（对应的文件已open，或正在open过程中）；

close_on_exec，结构和open_fds一样（同样是每个bit对应一个fd的位图），但含义不同：bit为1表示此fd在exec时需要被关闭；

full_fds_bits，long型数组，每一个bit对应open_fds里的一个元素（long型），表示该元素的所有bit是否都为1，如果都为1，这个bit置上；只要有一个不为1，这个bit将被clear；

根据alloc_fdtable()，可以看到open_fds/close_on_exec/full_fds_bits数组是一块申请分配的，内存layout顺序依次是open_fds/close_on_exec/full_fds_bits

alloc_fdtable()

fs/file.c

static struct fdtable * alloc_fdtable(unsigned int nr)
{
    struct fdtable *fdt;
    void *data;

    /*
     * Figure out how many fds we actually want to support in this fdtable.
     * Allocation steps are keyed to the size of the fdarray, since it
     * grows far faster than any of the other dynamic data. We try to fit
     * the fdarray into comfortable page-tuned chunks: starting at 1024B
     * and growing in powers of two from there on.
     */
    nr /= (1024 / sizeof(struct file *));
    nr = roundup_pow_of_two(nr + 1);
    nr *= (1024 / sizeof(struct file *));
    /*
     * Note that this can drive nr *below* what we had passed if sysctl_nr_open
     * had been set lower between the check in expand_files() and here.  Deal
     * with that in caller, it's cheaper that way.
     *
     * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
     * bitmaps handling below becomes unpleasant, to put it mildly...
     */
    if (unlikely(nr > sysctl_nr_open))
        nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

    fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
    if (!fdt)
        goto out;
    fdt->max_fds = nr;
    data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
    if (!data)
        goto out_fdt;
    fdt->fd = data;

    data = kvmalloc(max_t(size_t,
                 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
                 GFP_KERNEL_ACCOUNT);
    if (!data)
        goto out_arr;
    fdt->open_fds = data;
    data += nr / BITS_PER_BYTE;
    fdt->close_on_exec = data;
    data += nr / BITS_PER_BYTE;
    fdt->full_fds_bits = data;

    return fdt;

out_arr:
    kvfree(fdt->fd);
out_fdt:
    kfree(fdt);
out:
    return NULL;
}

重点看下上述kvmalloc()，这个分配的大小是2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr)，nr表示此fd table要容纳多少个fd，nr/BITS_PER_BYTE，一个bit表示一个fd，所以这个表示容纳nr个fd需要多少个byte；

*2是因为有open_fds和close_on_exec两个大小一样的数组；

BITBIT_SIZE(nr)，这个宏定义如下。假设nr为65*64（64位系统，sizeof(long)为8），则内层BITS_TO_LONGS(65*64)为65，即open_fds需要65个long；外层BITS_TO_LONGS(65)为2，所以BITBIT_SIZE(65*64)的结果为2*8，即为两个long型，这两个long型的每个bit为1表示open_fds里对应的一个元素（long型）所有bit均为1

/*
 * Sizing helpers for a "bitmap of a bitmap".
 * BITBIT_NR(nr):   number of longs needed to hold one bit per long of an
 *                  nr-bit bitmap (BITS_TO_LONGS applied twice).
 * BITBIT_SIZE(nr): the same quantity expressed in bytes.
 */
#define BITBIT_NR(nr)    BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)    (BITBIT_NR(nr) * sizeof(long))