Linux TCP/IP 协议栈之 Socket 的实现分析(一)

内核版本:2.6.37
参考[作者:kendo的文章(基于内核版本2.6.12)]

第一部份 Socket套接字的创建

socket 并不是 TCP/IP协议的一部份。
从广义上来讲,socket 是Unix/Linux 抽象的进程间通讯的一种方法。网络 socket 通讯仅仅是其若干协议中的一类。而tcp/ip 又是网络这类中的一种。
从tcp/ip 的角度看 socket ,它更多地体现了用户 API 与协议栈的一个中间层接口层。用户通过调用socket API 将报文递交给协议栈,或者从协议栈中接收报文。

一、系统总入口
Linux 内核为所有的与socket 有关的操作的API,提供了一个统一的系统调用入口,其代码在net/socket.c 中:

/*
 *    System call vectors.
 *
 *    Single entry point that multiplexes every socket operation:
 *    @call selects the operation (SYS_SOCKET ... SYS_RECVMMSG) and
 *    @args points to a user-space array holding that operation's
 *    arguments, each stored as one unsigned long.
 *
 *    Returns -EINVAL for an out-of-range @call (or for a table entry
 *    larger than the local buffer), -EFAULT when the arguments cannot
 *    be copied from user space, otherwise whatever the selected
 *    sys_*() handler returns.
 *
 *    Argument checking cleaned up. Saved 20% in size.
 *  This function doesn't need to set the kernel lock because
 *  it is set by the callees.
 */

SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
    unsigned long a[6];
    unsigned long a0, a1;
    int err;
    unsigned int len;

    /* Valid operation codes run from 1 (SYS_SOCKET) to SYS_RECVMMSG. */
    if (call < 1 || call > SYS_RECVMMSG)
        return -EINVAL;

    /* nargs[] gives the size in bytes of this operation's argument list. */
    len = nargs[call];
    if (len > sizeof(a))
        return -EINVAL;

    /* copy_from_user should be SMP safe. */
    if (copy_from_user(a, args, len))
        return -EFAULT;

    /* The audit hook receives the argument count, not the byte count. */
    audit_socketcall(nargs[call] / sizeof(unsigned long), a);

    a0 = a[0];
    a1 = a[1];

    /*
     * Dispatch to the real handler; the raw words are cast to user
     * pointers wherever the handler expects one.
     */
    switch (call) {
    case SYS_SOCKET:
        err = sys_socket(a0, a1, a[2]);
        break;
    case SYS_BIND:
        err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_CONNECT:
        err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_LISTEN:
        err = sys_listen(a0, a1);
        break;
    case SYS_ACCEPT:
        /* accept() is served by accept4() with a flags argument of 0. */
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                  (int __user *)a[2], 0);
        break;
    case SYS_GETSOCKNAME:
        err =
            sys_getsockname(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2]);
        break;
    case SYS_GETPEERNAME:
        err =
            sys_getpeername(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2]);
        break;
    case SYS_SOCKETPAIR:
        err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
        break;
    case SYS_SEND:
        err = sys_send(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_SENDTO:
        err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
                 (struct sockaddr __user *)a[4], a[5]);
        break;
    case SYS_RECV:
        err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_RECVFROM:
        err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                   (struct sockaddr __user *)a[4],
                   (int __user *)a[5]);
        break;
    case SYS_SHUTDOWN:
        err = sys_shutdown(a0, a1);
        break;
    case SYS_SETSOCKOPT:
        err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
        break;
    case SYS_GETSOCKOPT:
        err =
            sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                   (int __user *)a[4]);
        break;
    case SYS_SENDMSG:
        err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_RECVMSG:
        err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_RECVMMSG:
        err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
                   (struct timespec __user *)a[4]);
        break;
    case SYS_ACCEPT4:
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                  (int __user *)a[2], a[3]);
        break;
    default:
        /* Defensive: the range check above already rejects other values. */
        err = -EINVAL;
        break;
    }
    return err;
}

首先调用copy_from_user 将用户态参数拷贝至数组a 。但是问题在于,每个被调用的 API 的参数不尽相同,那么每次拷贝的字节大小如何断定?
来看其第三个参数nargs[call],其中 call 是操作码,后面有个大大的 switch...case就是判断它。对应的操作码定义在include/linux/net.h :

/*
 * Operation codes for socketcall(): the value passed as `call` selects
 * the matching case of the dispatch switch and is the index into the
 * nargs[] table. SYS_RECVMMSG (19) is the highest code and serves as
 * the upper bound of the range check in socketcall().
 */
#define SYS_SOCKET    1        /* sys_socket(2)        */
#define SYS_BIND    2        /* sys_bind(2)            */
#define SYS_CONNECT    3        /* sys_connect(2)        */
#define SYS_LISTEN    4        /* sys_listen(2)        */
#define SYS_ACCEPT    5        /* sys_accept(2)        */
#define SYS_GETSOCKNAME    6        /* sys_getsockname(2)        */
#define SYS_GETPEERNAME    7        /* sys_getpeername(2)        */
#define SYS_SOCKETPAIR    8        /* sys_socketpair(2)        */
#define SYS_SEND    9        /* sys_send(2)            */
#define SYS_RECV    10        /* sys_recv(2)            */
#define SYS_SENDTO    11        /* sys_sendto(2)        */
#define SYS_RECVFROM    12        /* sys_recvfrom(2)        */
#define SYS_SHUTDOWN    13        /* sys_shutdown(2)        */
#define SYS_SETSOCKOPT    14        /* sys_setsockopt(2)        */
#define SYS_GETSOCKOPT    15        /* sys_getsockopt(2)        */
#define SYS_SENDMSG    16        /* sys_sendmsg(2)        */
#define SYS_RECVMSG    17        /* sys_recvmsg(2)        */
#define SYS_ACCEPT4    18        /* sys_accept4(2)        */
#define SYS_RECVMMSG    19        /* sys_recvmmsg(2)        */

而数组nargs则根据操作码的不同,计算对应的参数的空间大小:

/*
 * Argument list sizes for sys_socketcall.
 *
 * nargs[call] is the number of bytes to copy from user space for
 * operation `call`; AL(x) expands to the size of x unsigned longs.
 * Index 0 is not a valid operation (AL(0)); indices 1..19 follow the
 * SYS_* operation codes, e.g. nargs[SYS_SOCKET] is AL(3) and
 * nargs[SYS_SENDTO] is AL(6).
 */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nargs[20] = {
    AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
    AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
    AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
    AL(4), AL(5)
};

/* AL() is only needed to build the table above. */
#undef AL

当拷贝完成参数后,就进入一个switch...case... 判断操作码,跳转至对应的系统接口。

二、 sys_socket 函数

当用户空间要创建一个socket 接口时,会调用 API 函数:

int socket(int domain, int type, int protocol);

函数,其三个参数分别表示协议族、协议类型(面向连接或无连接)以及协议
协议族:

/*
 * Supported address families.
 *
 * These are the family numbers; each PF_* protocol-family constant is
 * defined as the AF_* constant of the same name. AF_MAX is one more than
 * the highest number listed; 27 and 28 are not assigned in this list.
 */
#define AF_UNSPEC    0
#define AF_UNIX        1    /* Unix domain sockets         */
#define AF_LOCAL    1    /* POSIX name for AF_UNIX    */
#define AF_INET        2    /* Internet IP Protocol     */
#define AF_AX25        3    /* Amateur Radio AX.25         */
#define AF_IPX        4    /* Novell IPX             */
#define AF_APPLETALK    5    /* AppleTalk DDP         */
#define AF_NETROM    6    /* Amateur Radio NET/ROM     */
#define AF_BRIDGE    7    /* Multiprotocol bridge     */
#define AF_ATMPVC    8    /* ATM PVCs            */
#define AF_X25        9    /* Reserved for X.25 project     */
#define AF_INET6    10    /* IP version 6            */
#define AF_ROSE        11    /* Amateur Radio X.25 PLP    */
#define AF_DECnet    12    /* Reserved for DECnet project    */
#define AF_NETBEUI    13    /* Reserved for 802.2LLC project*/
#define AF_SECURITY    14    /* Security callback pseudo AF */
#define AF_KEY        15      /* PF_KEY key management API */
#define AF_NETLINK    16
#define AF_ROUTE    AF_NETLINK /* Alias to emulate 4.4BSD */
#define AF_PACKET    17    /* Packet family        */
#define AF_ASH        18    /* Ash                */
#define AF_ECONET    19    /* Acorn Econet            */
#define AF_ATMSVC    20    /* ATM SVCs            */
#define AF_RDS        21    /* RDS sockets             */
#define AF_SNA        22    /* Linux SNA Project (nutters!) */
#define AF_IRDA        23    /* IRDA sockets            */
#define AF_PPPOX    24    /* PPPoX sockets        */
#define AF_WANPIPE    25    /* Wanpipe API Sockets */
#define AF_LLC        26    /* Linux LLC            */
#define AF_CAN        29    /* Controller Area Network      */
#define AF_TIPC        30    /* TIPC sockets            */
#define AF_BLUETOOTH    31    /* Bluetooth sockets         */
#define AF_IUCV        32    /* IUCV sockets            */
#define AF_RXRPC    33    /* RxRPC sockets         */
#define AF_ISDN        34    /* mISDN sockets         */
#define AF_PHONET    35    /* Phonet sockets        */
#define AF_IEEE802154    36    /* IEEE802154 sockets        */
#define AF_CAIF        37    /* CAIF sockets            */
#define AF_MAX        38    /* For now.. */

/*
 * Protocol families, same as address families.
 *
 * Every PF_x below expands to the AF_x of the same name, so the two
 * spellings have identical values and can be used interchangeably.
 */
#define PF_UNSPEC    AF_UNSPEC
#define PF_UNIX        AF_UNIX
#define PF_LOCAL    AF_LOCAL
#define PF_INET        AF_INET
#define PF_AX25        AF_AX25
#define PF_IPX        AF_IPX
#define PF_APPLETALK    AF_APPLETALK
#define PF_NETROM    AF_NETROM
#define PF_BRIDGE    AF_BRIDGE
#define PF_ATMPVC    AF_ATMPVC
#define PF_X25        AF_X25
#define PF_INET6            AF_INET6
#define PF_ROSE        AF_ROSE
#define PF_DECnet    AF_DECnet
#define PF_NETBEUI    AF_NETBEUI
#define PF_SECURITY    AF_SECURITY
#define PF_KEY        AF_KEY
#define PF_NETLINK    AF_NETLINK
#define PF_ROUTE    AF_ROUTE
#define PF_PACKET    AF_PACKET
#define PF_ASH        AF_ASH
#define PF_ECONET    AF_ECONET
#define PF_ATMSVC    AF_ATMSVC
#define PF_RDS        AF_RDS
#define PF_SNA        AF_SNA
#define PF_IRDA        AF_IRDA
#define PF_PPPOX            AF_PPPOX
#define PF_WANPIPE    AF_WANPIPE
#define PF_LLC        AF_LLC
#define PF_CAN        AF_CAN
#define PF_TIPC        AF_TIPC
#define PF_BLUETOOTH    AF_BLUETOOTH
#define PF_IUCV        AF_IUCV
#define PF_RXRPC    AF_RXRPC
#define PF_ISDN        AF_ISDN
#define PF_PHONET    AF_PHONET
#define PF_IEEE802154    AF_IEEE802154
#define PF_CAIF        AF_CAIF
#define PF_MAX        AF_MAX

协议类型:

/*
 * Socket types, i.e. the base value of the `type` argument of socket().
 * The IPv4 code uses this value as the index into inetsw[] when it looks
 * up the protocol to attach. Values 7..9 are not defined in this enum.
 */
enum sock_type {
    SOCK_STREAM    = 1,
    SOCK_DGRAM    = 2,
    SOCK_RAW    = 3,
    SOCK_RDM    = 4,
    SOCK_SEQPACKET    = 5,
    SOCK_DCCP    = 6,
    SOCK_PACKET    = 10,
};

socket的创建对应操作码SYS_SOCKET,由sys_socket() 实现:

/*
 * socket(2): create a socket and return a file descriptor for it.
 *
 * @family and @protocol are handed to sock_create() unchanged. @type
 * carries the socket type in the bits covered by SOCK_TYPE_MASK and may
 * carry SOCK_CLOEXEC / SOCK_NONBLOCK in the remaining bits.
 *
 * Returns the result of sock_map_fd() on success, -EINVAL when @type has
 * flag bits other than SOCK_CLOEXEC | SOCK_NONBLOCK, or the negative
 * error returned by sock_create() / sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;

    /* Check the SOCK_* constants for consistency.  */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    /* Split @type into the flag bits and the bare socket type. */
    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    /*
     * sock_map_fd() below is given O_* flags. SOCK_CLOEXEC is asserted
     * equal to O_CLOEXEC above; SOCK_NONBLOCK is rewritten to
     * O_NONBLOCK here when the two constants differ.
     */
    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        goto out;

    /* Bind the new socket to a file descriptor. */
    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    if (retval < 0)
        goto out_release;

out:
    /* It may be already another descriptor 8) Not kernel problem. */
    return retval;

out_release:
    /* Mapping failed: drop the socket created above. */
    sock_release(sock);
    return retval;
}

这段代码做了两件事:

1>  分配 sock 与sk,协议簇的协议封装;

2>  sock 面向上层系统调用,主要是与文件系统交互。

  通过进程的current指针的files,结合创建socket时返回的文件描述符,可以找到内核中对应的struct file,再根据file的f_dentry可以找到对应的目录项,而目录项struct dentry中,有d_inode指针,指向与sock封装在一起的inode。

  sock又与sk指针互指,一一对应。

三、 协议簇的协议封装

int __sock_create(struct net *net, int family, int type, int protocol,
             struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;

    /*
     *      Check protocol is in range
     */
    if (family < 0 || family >= NPROTO)
        return -EAFNOSUPPORT;
    if (type < 0 || type >= SOCK_MAX)
        return -EINVAL;

    /* Compatibility.

       This uglymoron is moved from INET layer to here to avoid
       deadlock in module load.
     */
    if (family == PF_INET && type == SOCK_PACKET) {
        static int warned;
        if (!warned) {
            warned = 1;
            printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)
",
                   current->comm);
        }
        family = PF_PACKET;
    }

    err = security_socket_create(family, type, protocol, kern);
    if (err)
        return err;

    /*
     *    Allocate the socket and allow the family to set things up. if
     *    the protocol is 0, the family is instructed to select an appropriate
     *    default.
     */
    sock = sock_alloc();
    if (!sock) {
        if (net_ratelimit())
            printk(KERN_WARNING "socket: no more sockets
");
        return -ENFILE;    /* Not exactly a match, but its the
                   closest posix thing */
    }

    sock->type = type;

#ifdef CONFIG_MODULES
    /* Attempt to load a protocol module if the find failed.
     *
     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
     * requested real, full-featured networking support upon configuration.
     * Otherwise module support will break!
     */
    if (net_families[family] == NULL)
        request_module("net-pf-%d", family);
#endif

    rcu_read_lock();
    pf = rcu_dereference(net_families[family]);
    err = -EAFNOSUPPORT;
    if (!pf)
        goto out_release;

    /*
     * We will call the ->create function, that possibly is in a loadable
     * module, so we have to bump that loadable module refcnt first.
     */
    if (!try_module_get(pf->owner))
        goto out_release;

    /* Now protected by module ref count */
    rcu_read_unlock();

    err = pf->create(net, sock, protocol, kern);
    if (err < 0)
        goto out_module_put;

    /*
     * Now to bump the refcnt of the [loadable] module that owns this
     * socket at sock_release time we decrement its refcnt.
     */
    if (!try_module_get(sock->ops->owner))
        goto out_module_busy;

    /*
     * Now that we're done with the ->create function, the [loadable]
     * module can have its refcnt decremented
     */
    module_put(pf->owner);
    err = security_socket_post_create(sock, family, type, protocol, kern);
    if (err)
        goto out_sock_release;
    *res = sock;

    return 0;

out_module_busy:
    err = -EAFNOSUPPORT;
out_module_put:
    sock->ops = NULL;
    module_put(pf->owner);
out_sock_release:
    sock_release(sock);
    return err;

out_release:
    rcu_read_unlock();
    goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);

上面这个函数主要做了三件事:

1> sock_alloc()

在分析这个函数前,首先要了解:为了对 socket 抽象出文件的概念,内核中为socket定义了一个专门的文件系统类型sockfs。

/* Mount of the socket filesystem; assigned from kern_mount() in sock_init(). */
static struct vfsmount *sock_mnt __read_mostly;

/*
 * Filesystem type for sockets, registered under the name "sockfs".
 * Mounting goes through sockfs_mount(); the superblock is torn down
 * with kill_anon_super().
 */
static struct file_system_type sock_fs_type = {
    .name =        "sockfs",
    .mount =    sockfs_mount,
    .kill_sb =    kill_anon_super,
};

在模块初始化的时候,安装该文件系统:

/*
 * Boot-time initialisation of the socket layer, run as a core initcall.
 * Sets up the sock and skbuff caches and the socket inode cache, then
 * registers and mounts sockfs, storing the mount in sock_mnt.
 * Always returns 0.
 */
static int __init sock_init(void)
{
    /*
     *      Initialize sock SLAB cache.
     */

    sk_init();

    /*
     *      Initialize skbuff SLAB cache
     */
    skb_init();

    /*
     *      Initialize the protocols module.
     */

    init_inodecache();
    /* NOTE(review): neither return value below is checked in this excerpt. */
    register_filesystem(&sock_fs_type);
    sock_mnt = kern_mount(&sock_fs_type);

    /* The real protocol initialization is performed in later initcalls.
     */

#ifdef CONFIG_NETFILTER
    netfilter_init();
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
    skb_timestamping_init();
#endif

    return 0;
}

core_initcall(sock_init);    /* early initcall */

文件系统安装中的一个重要步骤kern_mount->kern_mount_data->vfs_kern_mount:

vfs_kern_mount函数中,先根据注册的文件系统类型,如果文件系统本身有mount成员函数则调用之,没有则调用它的get_sb成员函数指针,获取相应的超级块sb 。最后,设置文件系统的超级块成员指针,使之指向对应的值。

其中sockfs文件系统的mount函数调用mount_pseudo()实现超级块的初始化,以及根节点inode和目录项dentry的创建,sockfs_ops在这里关联上文件系统。

前面提到的new_inode()函数分配inode 时,调用的是: sock_mnt->mnt_sb->s_op->alloc_inode(sock_mnt->mnt_sb);

/*
 * Superblock operations of sockfs. Inodes are allocated by
 * sock_alloc_inode(), which returns the inode embedded in a
 * struct socket_alloc, and released by sock_destroy_inode().
 */
static const struct super_operations sockfs_ops = {
    .alloc_inode    = sock_alloc_inode,
    .destroy_inode    = sock_destroy_inode,
    .statfs        = simple_statfs,
};

这个alloc_inode函数指针指向的也就是sockfs_ops中的sock_alloc_inode()函数。

/*
 * Allocate one combined socket + inode object for sockfs.
 *
 * A struct socket_alloc is taken from sock_inode_cachep and a separate
 * struct socket_wq is kmalloc'ed for it. The embedded socket is put into
 * its initial state (SS_UNCONNECTED, no flags, no ops/sk/file).
 *
 * @sb is not used in the body.
 *
 * Returns a pointer to the embedded inode, or NULL when either
 * allocation fails (the first allocation is freed again if the second
 * one fails).
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
    struct socket_alloc *ei;

    ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
    if (!ei)
        return NULL;
    ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
    if (!ei->socket.wq) {
        /* Undo the cache allocation so nothing leaks. */
        kmem_cache_free(sock_inode_cachep, ei);
        return NULL;
    }
    init_waitqueue_head(&ei->socket.wq->wait);
    ei->socket.wq->fasync_list = NULL;

    ei->socket.state = SS_UNCONNECTED;
    ei->socket.flags = 0;
    ei->socket.ops = NULL;
    ei->socket.sk = NULL;
    ei->socket.file = NULL;

    /* Callers get the inode half; the socket half sits in the same object. */
    return &ei->vfs_inode;
}

函数先分配了一个用于封装socket和inode的ei ,然后在高速缓存中为之申请了一块空间。这样,inode和socket就同时都被分配了。接下来初始化socket的各个成员。

/*
 * Container that holds a socket and its sockfs inode in one allocation,
 * so that one can be derived from the other.
 */
struct socket_alloc {
    struct socket socket;
    struct inode vfs_inode;
};

显而易见,该结构实现了inode和socket的封装。已经通过new_inode从sockfs文件系统分配一个inode,可以通过宏SOCKET_I来获取与之对应的socket:

sock = SOCKET_I(inode);

分配inode、socket 以及两者如何关联,都已一一分析了。

2> pf = rcu_dereference(net_families[family]);

net_families[family]的定义:

/*
 * Registered protocol families, indexed by family number. Slots are
 * filled by sock_register() and read under RCU in __sock_create().
 */
static const struct net_proto_family *net_families[NPROTO] __read_mostly;

net_proto_family的定义:

/*
 * Description of one protocol family as handed to sock_register().
 */
struct net_proto_family {
    /* Family number; used as the index into net_families[]. */
    int        family;
    /* Called by __sock_create() to set up a freshly allocated socket. */
    int        (*create)(struct net *net, struct socket *sock,
                  int protocol, int kern);
    /* Module pinned with try_module_get() while ->create runs. */
    struct module    *owner;
};

net_families数组填充函数sock_register():

/**
 *    sock_register - add a socket protocol handler
 *    @ops: description of protocol
 *
 *    This function is called by a protocol handler that wants to
 *    advertise its address family, and have it linked into the
 *    socket interface. The value ops->family coresponds to the
 *    socket system call protocol family.
 */
int sock_register(const struct net_proto_family *ops)
{
    int err;

    if (ops->family >= NPROTO) {
        printk(KERN_CRIT "protocol %d >= NPROTO(%d)
", ops->family,
               NPROTO);
        return -ENOBUFS;
    }

    spin_lock(&net_family_lock);
    if (net_families[ops->family])
        err = -EEXIST;
    else {
        net_families[ops->family] = ops;
        err = 0;
    }
    spin_unlock(&net_family_lock);

    printk(KERN_INFO "NET: Registered protocol family %d
", ops->family);
    return err;
}
EXPORT_SYMBOL(sock_register);

从这里我们看出每个协议族都是通过sock_register函数注册到net_families数组中,通过代码搜索发现每个协议族都会调用这个函数去注册。

Af_ax25.c (net\ax25):    sock_register(&ax25_family_ops);
Af_bluetooth.c (net\bluetooth):    err = sock_register(&bt_sock_family_ops);
Af_can.c (net\can):    sock_register(&can_family_ops);
Af_decnet.c (net\decnet):    sock_register(&dn_family_ops);
Af_econet.c (net\econet):    sock_register(&econet_family_ops);
Af_ieee802154.c (net\ieee802154):    rc = sock_register(&ieee802154_family_ops);
Af_inet.c (net\ipv4):    (void)sock_register(&inet_family_ops);
Af_inet6.c (net\ipv6):    err = sock_register(&inet6_family_ops);
Af_ipx.c (net\ipx):    sock_register(&ipx_family_ops);
Af_irda.c (net\irda):        rc = sock_register(&irda_family_ops);
Af_iucv.c (net\iucv):    err = sock_register(&iucv_sock_family_ops);
Af_key.c (net\key):    err = sock_register(&pfkey_family_ops);
Af_llc.c (net\llc):    rc = sock_register(&llc_ui_family_ops);
Af_netlink.c (net\netlink):    sock_register(&netlink_family_ops);
Af_netrom.c (net\netrom):    if (sock_register(&nr_family_ops)) {
Af_packet.c (net\packet):    sock_register(&packet_family_ops);
Af_phonet.c (net\phonet):    err = sock_register(&phonet_proto_family);
Af_rds.c (net\rds):    ret = sock_register(&rds_family_ops);
Af_rose.c (net\rose):    sock_register(&rose_family_ops);
Af_rxrpc.c (net\rxrpc):    ret = sock_register(&rxrpc_family_ops);
Af_unix.c (net\unix):    sock_register(&unix_family_ops);
Af_x25.c (net\x25):    rc = sock_register(&x25_family_ops);
Caif_socket.c (net\caif):    int err = sock_register(&caif_family_ops);
Ddp.c (net\appletalk):    (void)sock_register(&atalk_family_ops);
Net.h (include\linux):extern int         sock_register(const struct net_proto_family *fam);
Pppox.c (drivers\net):    return sock_register(&pppox_proto_family);
Pvc.c (net\atm):    return sock_register(&pvc_family_ops);
Socket.c (drivers\isdn\misdn):    err = sock_register(&mISDN_sock_family_ops);
Socket.c (net): *    sock_register - add a socket protocol handler
Socket.c (net):int sock_register(const struct net_proto_family *ops)
Socket.c (net):EXPORT_SYMBOL(sock_register);
Socket.c (net\tipc):    res = sock_register(&tipc_family_ops);
Svc.c (net\atm):    return sock_register(&svc_family_ops);

本文主要分析的ipv4协议族,所以我们参考的文件af_inet.c(net/ipv4)。

3> err = pf->create(net, sock, protocol, kern);

在af_inet.c里面inet_init函数里面调用sock_register注册到协议族数组net_families里:

(void)sock_register(&inet_family_ops);

接着看inet_family_ops定义:

/*
 * Family descriptor for PF_INET: __sock_create() reaches inet_create()
 * through the .create pointer once this is registered with
 * sock_register().
 */
static const struct net_proto_family inet_family_ops = {
    .family = PF_INET,
    .create = inet_create,
    .owner    = THIS_MODULE,
};

这里的inet_create就是程序调用的函数:

/*
 *    Create an inet socket.
 *
 *    Installed as inet_family_ops.create. Looks up the inet_protosw
 *    entry matching sock->type and @protocol in inetsw[], allocates the
 *    struct sock with sk_alloc(), initialises it (sock_init_data() and
 *    the inet-specific fields) and runs the protocol's ->init hook if
 *    it has one.
 *
 *    Returns 0 on success or a negative errno: -ESOCKTNOSUPPORT,
 *    -EPROTONOSUPPORT, -EPERM, -EAFNOSUPPORT, -ENOBUFS, or the error
 *    reported by sk->sk_prot->init().
 */

static int inet_create(struct net *net, struct socket *sock, int protocol,
               int kern)
{
    struct sock *sk;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    char answer_no_check;
    int try_loading_module = 0;
    int err;

    /*
     * Build the ehash secret on first use, but only for sockets that
     * are neither raw nor datagram.
     */
    if (unlikely(!inet_ehash_secret))
        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
            build_ehash_secret();

    sock->state = SS_UNCONNECTED;

    /* Look for the requested type/protocol pair. */
lookup_protocol:
    /*
     * err stays -ESOCKTNOSUPPORT when the list for this type is empty,
     * ends as -EPROTONOSUPPORT when entries exist but none matches, and
     * is 0 when the loop is left through one of the breaks.
     */
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

        err = 0;
        /* Check the non-wild match. */
        if (protocol == answer->protocol) {
            if (protocol != IPPROTO_IP)
                break;
        } else {
            /* Check for the two wild cases. */
            if (IPPROTO_IP == protocol) {
                /* Caller asked for "any": adopt this entry's protocol. */
                protocol = answer->protocol;
                break;
            }
            if (IPPROTO_IP == answer->protocol)
                break;
        }
        err = -EPROTONOSUPPORT;
    }

    if (unlikely(err)) {
        /* At most two module-load attempts, each followed by a new lookup. */
        if (try_loading_module < 2) {
            rcu_read_unlock();
            /*
             * Be more specific, e.g. net-pf-2-proto-132-type-1
             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
             */
            if (++try_loading_module == 1)
                request_module("net-pf-%d-proto-%d-type-%d",
                           PF_INET, protocol, sock->type);
            /*
             * Fall back to generic, e.g. net-pf-2-proto-132
             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
             */
            else
                request_module("net-pf-%d-proto-%d",
                           PF_INET, protocol);
            goto lookup_protocol;
        } else
            goto out_rcu_unlock;
    }

    /* Raw sockets need CAP_NET_RAW unless the request comes from the kernel. */
    err = -EPERM;
    if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
        goto out_rcu_unlock;

    err = -EAFNOSUPPORT;
    if (!inet_netns_ok(net, protocol))
        goto out_rcu_unlock;

    /* Copy what is needed out of the entry before leaving the RCU section. */
    sock->ops = answer->ops;
    answer_prot = answer->prot;
    answer_no_check = answer->no_check;
    answer_flags = answer->flags;
    rcu_read_unlock();

    WARN_ON(answer_prot->slab == NULL);

    err = -ENOBUFS;
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
    if (sk == NULL)
        goto out;

    err = 0;
    sk->sk_no_check = answer_no_check;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = 1;

    /* Same object viewed as an inet_sock (see inet_sk()). */
    inet = inet_sk(sk);
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    inet->nodefrag = 0;

    if (SOCK_RAW == sock->type) {
        inet->inet_num = protocol;
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }

    if (ipv4_config.no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->inet_id = 0;

    sock_init_data(sock, sk);

    sk->sk_destruct       = inet_sock_destruct;
    sk->sk_protocol       = protocol;
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

    inet->uc_ttl    = -1;
    inet->mc_loop    = 1;
    inet->mc_ttl    = 1;
    inet->mc_all    = 1;
    inet->mc_index    = 0;
    inet->mc_list    = NULL;

    sk_refcnt_debug_inc(sk);

    /* inet_num is only set above for SOCK_RAW sockets. */
    if (inet->inet_num) {
        /* It assumes that any protocol which allows
         * the user to assign a number at socket
         * creation time automatically
         * shares.
         */
        inet->inet_sport = htons(inet->inet_num);
        /* Add to protocol hash chains. */
        sk->sk_prot->hash(sk);
    }

    /* Protocol-specific setup; on failure the sk is released here. */
    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk);
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    rcu_read_unlock();
    goto out;
}

在分析inet_create()函数前,就要分析inetsw[SOCK_MAX]这个数组。

/*
 * One list of registered inet_protosw entries per socket type, indexed
 * by the type value; filled by inet_register_protosw() and searched by
 * inet_create().
 */
static struct list_head inetsw[SOCK_MAX];

这个数组是在inet_init()->inet_register_protosw()里面填充的。

    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
        inet_register_protosw(q);

inetsw_array定义:

/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * Each entry ties a (socket type, protocol) pair to the struct proto
 * and the proto_ops used for sockets of that kind. There is exactly
 * one entry per socket type here: stream/TCP, datagram/UDP and raw/IP.
 */
static struct inet_protosw inetsw_array[] =
{
    {
        .type =       SOCK_STREAM,
        .protocol =   IPPROTO_TCP,
        .prot =       &tcp_prot,
        .ops =        &inet_stream_ops,
        .no_check =   0,
        .flags =      INET_PROTOSW_PERMANENT |
                  INET_PROTOSW_ICSK,
    },

    {
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_UDP,
        .prot =       &udp_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_PERMANENT,
       },


       {
           .type =       SOCK_RAW,
           .protocol =   IPPROTO_IP,    /* wild card */
           .prot =       &raw_prot,
           .ops =        &inet_sockraw_ops,
           .no_check =   UDP_CSUM_DEFAULT,
           .flags =      INET_PROTOSW_REUSE,
       }
};

/* Number of entries in inetsw_array[], used as the registration loop bound. */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

inet_register_protosw函数分析:

void inet_register_protosw(struct inet_protosw *p)
{
    struct list_head *lh;
    struct inet_protosw *answer;
    int protocol = p->protocol;
    struct list_head *last_perm;

    spin_lock_bh(&inetsw_lock);

    if (p->type >= SOCK_MAX)
        goto out_illegal;

    /* If we are trying to override a permanent protocol, bail. */
    answer = NULL;
    last_perm = &inetsw[p->type];
    list_for_each(lh, &inetsw[p->type]) {
        answer = list_entry(lh, struct inet_protosw, list);

        /* Check only the non-wild match. */
        if (INET_PROTOSW_PERMANENT & answer->flags) {
            if (protocol == answer->protocol)
                break;
            last_perm = lh;
        }

        answer = NULL;
    }
    if (answer)
        goto out_permanent;

    /* Add the new entry after the last permanent entry if any, so that
     * the new entry does not override a permanent entry when matched with
     * a wild-card protocol. But it is allowed to override any existing
     * non-permanent entry.  This means that when we remove this entry, the
     * system automatically returns to the old behavior.
     */
    list_add_rcu(&p->list, last_perm);
out:
    spin_unlock_bh(&inetsw_lock);

    return;

out_permanent:
    printk(KERN_ERR "Attempt to override permanent protocol %d.
",
           protocol);
    goto out;

out_illegal:
    printk(KERN_ERR
           "Ignoring attempt to register invalid socket type %d.
",
           p->type);
    goto out;
}
EXPORT_SYMBOL(inet_register_protosw);

这个函数完成的工作,就是把inetsw_array 数组中,相同的协议类型(type成员)下边的协议,加入到inetsw 对应的协议类型的链表中去。
因为事实上是一对一的关系,所以这个函数的执行过程要简单得多:
  因为链表中不存在其它成员,所以遍历结束后 answer 为空值,不存在覆盖和追加的情况,直接调用list_add_rcu(&p->list, last_perm);
  把协议节点(struct inet_protosw 类型的数组的某个元素)添加到链表中(链表首部本身是一个数组inetsw,数组索引是该协议对应的协议类型的值),成为该链表的第一个成员。

继续分析inet_create()函数:

  首先,根据sock的成员type和参数protocol,把之前在inetsw链表中注册的协议节点找出。

  然后,将创建的socket 的ops 函数指针集,指向协议类型对应的操作集。例如创建的是SOCK_STREAM,那么就指向了inet_stream_ops; answer_prot 指针指向了当前要创建的socket 的协议类型下边的协议,如上例,它就是IPPROTO_TCP 的tcp_prot结构。

  接着,一个重要的工作就是为socket分配一个sock,并初始化它。

  最后,初始化一个 inet 。

虽然create 的代码就到这儿了,不过要说清楚sk(sock)的分配,还得费上大力气。
每一个Socket 套接字,都有一个对应的 struct socket 结构来描述(内核中一般使用名称为sock),但是同时又有一个struct sock 结构(内核中一般使用名称为sk),两者之间是一一对应的关系。

在后面的sock_init_data 函数中,可以看到:

sk->sk_socket = sock; 
sock->sk = sk;

socket 结构和 sock 结构实际上是同一个事物的两个方面。不妨说,socket 结构是面向进程和系统调用界面的侧面,而 sock 结构则是面向底层驱动程序的侧面。

设计者把socket套接字中,与文件系统关系比较密切的那一部份放在socket结构中,而把与通信关系比较密切的那一部份,则单独成为一个数据结构,那就是sock 结构。

由于这两部份逻辑上本来就是一体的,所以要通过指针互相指向对方,形成一对一的关系。

调用sk_alloc()分配一个sk:

  在之前proto_register()函数创建的高速缓存中申请分配一个slab缓存项,并清零。然后设置协议族、并把sk中的sk_prot与对应的协议关联起来。

分配完成sk后,另一个重要的功能就是初始化它

  sk的成员相当复杂,其主要的初始化工作是在函数sock_init_data()中完成的:
  sock 结构中,有三个重要的双向队列,分别是 sk_receive_queue、sk_write_queue 和 sk_error_queue。从它们的名字就可以看出来其作用了。
队列并非采用通用的list_head来维护,而是使用sk_buff_head队列:

/*
 * Head of a doubly linked queue of struct sk_buff, with its own lock.
 */
struct sk_buff_head { 
            /* These two members must be first. */ 
        struct sk_buff        *next; 
        struct sk_buff        *prev; 
 
            /* presumably the number of buffers currently queued - TODO confirm */
            __u32                        qlen; 
        spinlock_t        lock; 
};

这样,队列中指向的每一个sk_buff,就是一个数据包,三个队列分别用于接收、发送和投递错误。
inet 初始化:
inet 是一个struct inet_sock 结构类型,来看它的定义:

struct inet_sock { 
    /* sk and pinet6 has to be the first two members of inet_sock */ 
    struct sock sk; 
    …… 
}

只留意它的第一个成员就足够了。
我们说sock 是面向用户态调用,而sk是面向内核驱动调用的,那sk是如何与协议栈交互的呢?
对于每一个类型的协议,为了与sk联系起来,都定义了一个struct XXX_sock 结构,XXX是协议名,例如:

struct tcp_sock { 
    /* inet_sock has to be the first member of tcp_sock */ 
    struct inet_sock inet; 
    int tcp_header_len; /* Bytes of tcp header to send */ 
    …… 
} 

很明显,它们的结构定义是“af_inet 一般属性+ 自己的私有属性” ,因为它们的第一个成员总是inet 。

现在回头来看一下起初在af_inet.c中注册协议(proto_register())时用到的obj_size成员,对于tcp而言:

struct proto tcp_prot = {
    .name            = "TCP",
    .owner            = THIS_MODULE,
    .close            = tcp_close,
    .connect        = tcp_v4_connect,
    .disconnect        = tcp_disconnect,
    .accept            = inet_csk_accept,
    .ioctl            = tcp_ioctl,
    .init            = tcp_v4_init_sock,
    .destroy        = tcp_v4_destroy_sock,
    .shutdown        = tcp_shutdown,
    .setsockopt        = tcp_setsockopt,
    .getsockopt        = tcp_getsockopt,
    .recvmsg        = tcp_recvmsg,
    .sendmsg        = tcp_sendmsg,
        ...
    .obj_size        = sizeof(struct tcp_sock),
        ...     
};

其它协议类似。

以obj_size 来确定每个 slab 缓存项分配的大小,所以,我们就可说,每次申请分配的,实际上是一个struct XXX_sock 结构大小的结构。因为都是定义于上层结构的第一个成员,可以使用强制类型转换来使用这块分配的内存空间。例如: 

struct inet_sock {
    /* sk and pinet6 has to be the first two members of inet_sock */
    struct sock        sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
    struct ipv6_pinfo    *pinet6;
#endif
    /* Socket demultiplex comparisons on incoming packets. */
    __be32            inet_daddr;
    __be32            inet_rcv_saddr;
    __be16            inet_dport;
    __u16            inet_num;
    __be32            inet_saddr;
    __s16            uc_ttl;
    __u16            cmsg_flags;
    __be16            inet_sport;
    __u16            inet_id;
    ...  
};

inet = inet_sk(sk); 
/*
 * View a struct sock as the struct inet_sock that contains it.
 * This is a plain pointer cast; it is only valid because `sk` is the
 * first member of struct inet_sock.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk) 
{ 
  return (struct inet_sock *)sk; //inet_sock->sk
}
struct tcp_sock {
    /* inet_connection_sock has to be the first member of tcp_sock */
    struct inet_connection_sock    inet_conn;
    u16    tcp_header_len;    /* Bytes of tcp header to send        */
    ...
};

struct tcp_sock *tp = tcp_sk(sk); 
/*
 * View a struct sock as the struct tcp_sock that contains it.
 * This is a plain pointer cast; it relies on the sock sitting at
 * offset 0 of tcp_sock through the chain of first members noted in the
 * trailing comment below.
 */
static inline struct tcp_sock *tcp_sk(const struct sock *sk) 
{ 
  return (struct tcp_sock *)sk; //tcp_sock->inet_conn->icsk_inet->sk
}

inet_create()运行完,一个 socket 套接字基本上就创建完毕了,剩下的就是与文件系统挂钩。

四、与文件系统交互

回到sys_socket()函数中来,它在调用完sock_create()后,紧接着调用sock_map_fd()函数:

/*
 * Give @sock a file descriptor.
 *
 * sock_alloc_file() produces the descriptor number and the struct file;
 * when that succeeds the file is installed in the descriptor table with
 * fd_install(). @flags is passed through to sock_alloc_file().
 *
 * Returns the descriptor (>= 0) or the negative value returned by
 * sock_alloc_file().
 */
int sock_map_fd(struct socket *sock, int flags)
{
    struct file *newfile;
    int fd = sock_alloc_file(sock, &newfile, flags);

    /* Only publish the file once the allocation has succeeded. */
    if (likely(fd >= 0))
        fd_install(fd, newfile);

    return fd;
}
EXPORT_SYMBOL(sock_map_fd);

这个函数的核心思想,在一开始,就已经分析过了。
从进程的角度来讲,一个 socket 套接字就是一个特殊的,已打开的文件。
前面分配好一个socket后,这里要做的就是将它与文件系统拉上亲戚关系。
首先获取一个空闲的文件描述符和file结构。然后在文件系统中分配一个目录项(d_alloc),使其指向已经分配的inode节点(d_add),然后把其目录项挂在sockfs文件系统的根目录之下。
并且把目录项的指针d_op设置成指向 sockfs_dentry_operations,这个数据结构通过函数指针提供与文件路径有关的操作:

/*
 * Dentry operations for sockfs; only ->d_dname is provided, implemented
 * by sockfs_dname().
 */
static const struct dentry_operations sockfs_dentry_operations = {
    .d_dname  = sockfs_dname,
};

最后一步,就是将file结构中的f_op和socket对应的inode结构中的i_fop都指向socket_file_ops,它是一个函数指针集,指向了socket面向文件系统的用户态调用的一些接口函数:

/*
 *    Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *    in the operation structures but are done directly via the socketcall() multiplexor.
 *
 *    This table holds the generic file operations for socket files.
 *    Seeking is refused (no_llseek) and open goes to sock_no_open.
 */

static const struct file_operations socket_file_ops = {
    .owner =    THIS_MODULE,
    .llseek =    no_llseek,
    .aio_read =    sock_aio_read,
    .aio_write =    sock_aio_write,
    .poll =        sock_poll,
    .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = compat_sock_ioctl,
#endif
    .mmap =        sock_mmap,
    .open =        sock_no_open,    /* special open code to disallow open via /proc */
    .release =    sock_close,
    .fasync =    sock_fasync,
    .sendpage =    sock_sendpage,
    .splice_write = generic_splice_sendpage,
    .splice_read =    sock_splice_read,
};

到这里,整个socket 套接字的创建工作,就宣告完成了。

原文地址:https://www.cnblogs.com/cslunatic/p/3698653.html