vfs open系统调用flow之link_path

vfs open系统调用flow之link_path_walk()

do_filp_open()里将pathname保存到nameidata里，pathname是open file的完整路径名，调用path_openat，此时是flags是带了LOOKUP_RCU flag的

struct file *do_filp_open(int dfd, struct filename *pathname,
        const struct open_flags *op)
{
    struct nameidata nd;
    int flags = op->lookup_flags;
    struct file *filp;

    set_nameidata(&nd, dfd, pathname);
    filp = path_openat(&nd, op, flags | LOOKUP_RCU);
    if (unlikely(filp == ERR_PTR(-ECHILD)))
        filp = path_openat(&nd, op, flags);
    if (unlikely(filp == ERR_PTR(-ESTALE)))
        filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
    restore_nameidata();
    return filp;
}

path_openat()里主要call了3个函数path_init()、link_path_walk()、do_last()

调用path_init()设置nameidata的path结构体，对于常规文件来说，比如/data/test/test.txt，设置path结构体‘指向’根目录/，即设置path.mnt/path.dentry‘指向’根目录/，为后续的目录解析做准备。path_init()的返回值s即指向open file的完整路径名字串开头。

static struct file *path_openat(struct nameidata *nd,
            const struct open_flags *op, unsigned flags)
{
    struct file *file;
    int error;

    file = alloc_empty_file(op->open_flag, current_cred());
    if (IS_ERR(file))
        return file;

    if (unlikely(file->f_flags & __O_TMPFILE)) {
        error = do_tmpfile(nd, flags, op, file);
    } else if (unlikely(file->f_flags & O_PATH)) {
        error = do_o_path(nd, flags, file);
    } else {
        const char *s = path_init(nd, flags);
        while (!(error = link_path_walk(s, nd)) &&
            (error = do_last(nd, file, op)) > 0) {
            nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
            s = trailing_symlink(nd);
        }
        terminate_walk(nd);
    }
    if (likely(!error)) {
        if (likely(file->f_mode & FMODE_OPENED))
            return file;
        WARN_ON(1);
        error = -EINVAL;
    }
    fput(file);
    if (error == -EOPENSTALE) {
        if (flags & LOOKUP_RCU)
            error = -ECHILD;
        else
            error = -ESTALE;
    }
    return ERR_PTR(error);
}

static int link_path_walk(const char *name, struct nameidata *nd)
{
    int err;

    if (IS_ERR(name))
        return PTR_ERR(name);
    while (*name=='/')
        name++;
    if (!*name)
        return 0;

    /* At this point we know we have a real path component. */
    for(;;) {
        u64 hash_len;
        int type;

        err = may_lookup(nd);
        if (err)
            return err;

        hash_len = hash_name(nd->path.dentry, name); //计算当前目录的hash_len，这个变量高4 byte是当前目录name字串长度，低4byte是当前目录（路径）的hash值，hash值的计算是基于当前目录的父目录dentry（nd->path.dentry）来计算的，所以它跟其目录（路径）dentry是关联的

        type = LAST_NORM;
        if (name[0] == '.') switch (hashlen_len(hash_len)) {
            case 2:
                if (name[1] == '.') {
                    type = LAST_DOTDOT;
                    nd->flags |= LOOKUP_JUMPED;
                }
                break;
            case 1:
                type = LAST_DOT;
        }
        if (likely(type == LAST_NORM)) {
            struct dentry *parent = nd->path.dentry;
            nd->flags &= ~LOOKUP_JUMPED;
            if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {  //to do
                struct qstr this = { { .hash_len = hash_len }, .name = name };
                err = parent->d_op->d_hash(parent, &this);
                if (err < 0)
                    return err;
                hash_len = this.hash_len;
                name = this.name;
            }
        }

        nd->last.hash_len = hash_len; //更新nameidata last结构体，last.hash_len，name
        nd->last.name = name;
        nd->last_type = type;

        name += hashlen_len(hash_len); //name指向下一级目录
        if (!*name)
            goto OK;
        /*
         * If it wasn't NUL, we know it was '/'. Skip that
         * slash, and continue until no more slashes.
         */
        do {
            name++;
        } while (unlikely(*name == '/'));
        if (unlikely(!*name)) { //假设open file文件名路径上没有任何symlink，则如果这个条件满足，说明整个路径都解析完了，还剩最后的filename留给do_last()解析，此函数将从下面的!nd->depth条件处返回。
OK:
            /* pathname body, done */
            if (!nd->depth) //如果open file完整路径上没有任何symlink，nd->depth等于0，
                return 0;
            name = nd->stack[nd->depth - 1].name;
            /* trailing symlink, done */
            if (!name)
                return 0;
            /* last component of nested symlink */
            err = walk_component(nd, WALK_FOLLOW); //symlink case
        } else {
            /* not the last component */
            err = walk_component(nd, WALK_FOLLOW | WALK_MORE); //常规目录case，即非symlink case
        }
        if (err < 0)
            return err;

        if (err) {
            const char *s = get_link(nd);

            if (IS_ERR(s))
                return PTR_ERR(s);
            err = 0;
            if (unlikely(!s)) {
                /* jumped */
                put_link(nd);
            } else {
                nd->stack[nd->depth - 1].name = name;
                name = s;
                continue;
            }
        }
        if (unlikely(!d_can_lookup(nd->path.dentry))) {
            if (nd->flags & LOOKUP_RCU) {
                if (unlazy_walk(nd))
                    return -ECHILD;
            }
            return -ENOTDIR;
        }
    }
}

walk_component()首先会调用lookup_fast()在dcache里查找，如果没有找到，则直接返回0，然后进入lookup_slow()的slow path。

无论是在dcache里找到了当前目录对应的dentry（fast path）或者没有找到（slow path），这两个path都会去设置path结构体里的dentry、mnt成员，将当前路径更新到path结构体。

path结构体更新后，最后调用step_info/path_to_nameidata将path结构体更新到nd.path，这样nd里的path就指向了当前目录了，至此完成一级目录的解析查找，返回link_path_walk()将基于nd.path作为父目录解析下一级目录

static int walk_component(struct nameidata *nd, int flags)
{
    struct path path;
    struct inode *inode;
    unsigned seq;
    int err;
    /*
     * "." and ".." are special - ".." especially so because it has
     * to be able to know about the current root directory and
     * parent relationships.
     */
    if (unlikely(nd->last_type != LAST_NORM)) {
        err = handle_dots(nd, nd->last_type);
        if (!(flags & WALK_MORE) && nd->depth)
            put_link(nd);
        return err;
    }
    err = lookup_fast(nd, &path, &inode, &seq);
    if (unlikely(err <= 0)) { //上述look_fast()如果在dcache里没有找到当前目录（路径）的dentry，则会执行如下的lookup_slow()
        if (err < 0)
            return err;
        path.dentry = lookup_slow(&nd->last, nd->path.dentry,
                      nd->flags);
        if (IS_ERR(path.dentry))
            return PTR_ERR(path.dentry);

        path.mnt = nd->path.mnt;
        err = follow_managed(&path, nd);
        if (unlikely(err < 0))
            return err;

        if (unlikely(d_is_negative(path.dentry))) {
            path_to_nameidata(&path, nd);
            return -ENOENT;
        }

        seq = 0;    /* we are already out of RCU mode */
        inode = d_backing_inode(path.dentry);
    }

    return step_into(nd, &path, flags, inode, seq);
}

__lookup_slow()首先调用d_alloc_parallel给当前路径分配一个新的dentry，然后调用inode->i_op->lookup()，注意这里的inode是当前路径的父路径dentry的d_inode成员。inode->i_op是具体的文件系统inode operations函数集了，当前以ext4 fs来举例的话，就是ext4 fs的inode operations函数集，即ext4_dir_inode_operations，其lookup函数是ext4_lookup()，这个具体文件系统lookup的过程将单独在一篇文章里描述。

具体文件系统的lookup完成后，此时的dentry已经有绑定的inode了，即已经设置了其d_inode成员了，然后调用d_lookup_done()将此dentry从lookup hash链表上移除（它是在d_alloc_parallel里被插入lookup hash），这个lookup hash链表的作用是避免其它线程也同时来查找当前目录造成重复alloc dentry的问题

static struct dentry *__lookup_slow(const struct qstr *name,
                    struct dentry *dir,
                    unsigned int flags)
{
    struct dentry *dentry, *old;
    struct inode *inode = dir->d_inode;
    DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

    /* Don't go there if it's already dead */
    if (unlikely(IS_DEADDIR(inode)))
        return ERR_PTR(-ENOENT);
again:
    dentry = d_alloc_parallel(dir, name, &wq);
    if (IS_ERR(dentry))
        return dentry;
    if (unlikely(!d_in_lookup(dentry))) { //这个不考虑并发的情况下这个条件不成立
        if (!(flags & LOOKUP_NO_REVAL)) {
            int error = d_revalidate(dentry, flags);
            if (unlikely(error <= 0)) {
                if (!error) {
                    d_invalidate(dentry);
                    dput(dentry);
                    goto again;
                }
                dput(dentry);
                dentry = ERR_PTR(error);
            }
        }
    } else {
        old = inode->i_op->lookup(inode, dentry, flags);
        d_lookup_done(dentry);
        if (unlikely(old)) { //如果dentry是一个目录的dentry，则有可能old是有效的；否则如果dentry是文件的dentry则old是null
            dput(dentry);
            dentry = old;
        }
    }
    return dentry;
}

d_alloc_parallel首先alloc一个dentry，然后调用__d_lookup_rcu在dcache里快速查找是否有当前目录对应的dentry，这个在不考虑并发的情况下，是找不到的。在并发条件下，可能不止一个线程去lookup当前目录，可能在其它cpu上的线程也在同时lookup这个目录。

然后是在lookup hash链表上查找是否有当前目录对应的dentry，同样，在并发的条件下，有可能其它线程也在alloc当前目录，因为此时会进入具体的文件系统lookup当前目录，会去读取目录文件lookup当前目录，所以会涉及到IO操作，而IO操作是比较耗时的，有可能有其它线程已经将当前目录dentry加入了lookup hash链表但还在具体文件系统那边lookup，还没有完成整个当前目录的lookup，所以如果此时当前目录已经在lookup hash链表的话，当前lookup需要等其它线程完成它在具体文件系统那边的lookup过程，它那边完成了整个的lookup过程后，会将当前目录对应的dentry从lookup hash链表中移除，所以在d_alloc_parallel()里在lookup hash链表里查找时，如果该链表上的一个dentry的hash值、parent、name和当前目录的一致（说明当前目录已经在lookup hash链表上了），则会调用d_wait_lookup()等待当前目录的dentry从lookup hash链表上移除。d_wait_lookup()之后，基本会直接返回这个从lookup hash链表上找到的dentry，在return此dentry之前会先将上面已经alloc的dentry put。

如果在lookup hash上没有找到当前目录的dentry，说明当前目录没有在此hash链表上，即在此时没有人也在lookup当前目录。此时会将新alloc的dentry的flags的DCACHE_PAR_LOOKUP flag置上，并把新alloc的dentry加入到lookup hash，最终return此dentry

struct dentry *d_alloc_parallel(struct dentry *parent,
                const struct qstr *name,
                wait_queue_head_t *wq)
{
    unsigned int hash = name->hash;
    struct hlist_bl_head *b = in_lookup_hash(parent, hash); //lookup hash链表
    struct hlist_bl_node *node;
    struct dentry *new = d_alloc(parent, name); //分配一个dentry
    struct dentry *dentry;
    unsigned seq, r_seq, d_seq;

    if (unlikely(!new))
        return ERR_PTR(-ENOMEM);

retry:
    rcu_read_lock();
    seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
    r_seq = read_seqbegin(&rename_lock);
    dentry = __d_lookup_rcu(parent, name, &d_seq); //再次到dcache里查找当前目录是否有对应的dentry，在不考虑并发的条件下，将查找不到
    if (unlikely(dentry)) {
        if (!lockref_get_not_dead(&dentry->d_lockref)) {
            rcu_read_unlock();
            goto retry;
        }
        if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
            rcu_read_unlock();
            dput(dentry);
            goto retry;
        }
        rcu_read_unlock();
        dput(new);
        return dentry;
    }
    if (unlikely(read_seqretry(&rename_lock, r_seq))) {
        rcu_read_unlock();
        goto retry;
    }

    if (unlikely(seq & 1)) {
        rcu_read_unlock();
        goto retry;
    }

    hlist_bl_lock(b);
    if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
        hlist_bl_unlock(b);
        rcu_read_unlock();
        goto retry;
    }
    /*
     * No changes for the parent since the beginning of d_lookup().
     * Since all removals from the chain happen with hlist_bl_lock(),
     * any potential in-lookup matches are going to stay here until
     * we unlock the chain.  All fields are stable in everything
     * we encounter.
     */
    hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {  //从lookup hash链表上查找是否有当前路径对应的dentry，如果有找到，put上面alloc的dentry然后就直接返回找到的这个dentry了，这里有点奇怪，为什么不先查找，如果没有查找到再alloc dentry..
        if (dentry->d_name.hash != hash)
            continue;
        if (dentry->d_parent != parent)
            continue;
        if (!d_same_name(dentry, parent, name))
            continue;
        hlist_bl_unlock(b);
        /* now we can try to grab a reference */
        if (!lockref_get_not_dead(&dentry->d_lockref)) {
            rcu_read_unlock();
            goto retry;
        }

        rcu_read_unlock();
        /*
         * somebody is likely to be still doing lookup for it;
         * wait for them to finish
         */
        spin_lock(&dentry->d_lock);
        d_wait_lookup(dentry);
        /*
         * it's not in-lookup anymore; in principle we should repeat
         * everything from dcache lookup, but it's likely to be what
         * d_lookup() would've found anyway.  If it is, just return it;
         * otherwise we really have to repeat the whole thing.
         */
        if (unlikely(dentry->d_name.hash != hash))
            goto mismatch;
        if (unlikely(dentry->d_parent != parent))
            goto mismatch;
        if (unlikely(d_unhashed(dentry)))
            goto mismatch;
        if (unlikely(!d_same_name(dentry, parent, name)))
            goto mismatch;
        /* OK, it *is* a hashed match; return it */
        spin_unlock(&dentry->d_lock);
        dput(new);
        return dentry;
    }
    rcu_read_unlock();
    /* we can't take ->d_lock here; it's OK, though. */
    new->d_flags |= DCACHE_PAR_LOOKUP;
    new->d_wait = wq;
    hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); //将此新alloc的dentry插入lookup hash链表
    hlist_bl_unlock(b);
    return new;
mismatch:
    spin_unlock(&dentry->d_lock);
    dput(dentry);
    goto retry;
}
EXPORT_SYMBOL(d_alloc_parallel);