sk_buff

它里面包含了各层协议的头部,比如 Ethernet、IP、TCP、UDP 等等。熟悉它是进一步了解 Linux 网络协议栈的基础。
此结构定义在以下头文件中:

<include/linux/skbuff.h>
/*
 * Head of a doubly linked list of sk_buff structures.
 * It carries the two link pointers plus the element count and a lock
 * protecting the list.
 */
struct sk_buff_head {
      /* These two members must be first. */
      struct sk_buff *next;
      struct sk_buff *prev;
      __u32       qlen; // number of element nodes in the list
      spinlock_t      lock; // lock that prevents concurrent access to the list
};

sk_buff_head

管理函数

int skb_headroom(const struct sk_buff *skb)
int skb_tailroom(const struct sk_buff *skb)

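skb_headroom 返回 data 与 head 之间的空闲空间,通常比较小(例如只有48字节),向头部添加数据前要先检查,小心越界
skb_tailroom 返回 tail 与 end 之间的空闲空间,通常比较大(例如1500字节左右),一般足够自由调整;但具体大小取决于分配缓冲区时的 size,追加数据前同样应先检查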
skb

//同时增加len和tail。用于向数据报尾部追加数据。返回原来tail所在位置
unsigned char *skb_put(struct sk_buff *skb, unsigned int len) 

skb_put

//将data指针上移并增加len长度。这个函数用来向头部添加一些数据。当然前提是有足够的headroom
unsigned char *skb_push(struct sk_buff *skb, unsigned int len) 

skb_push

//将data指针下移,并减小len的值。这个函数一般用来除去某个头部
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) 

skb_pull

//将data指针和tail指针同时下移。这个操作在存储空间的头部预留len长度的空隙
void skb_reserve(struct sk_buff *skb, unsigned int len) 

skb_reserve

//将网络报文的长度缩减到len。这个操作丢弃了网络报文尾部的填充值
void skb_trim(struct sk_buff *skb, unsigned int len) 

skb_trim
下图是穿过协议栈从tcp层向下到链路层的过程
tcp

分配内存

alloc_skb 分配缓冲区和一个sk_buff结构
dev_alloc_skb 设备驱动程序使用的缓冲区分配函数

释放内存

kfree_skb 将skb->users计数器减1,只有当没有其他引用者(即调用前skb->users计数器为1)时才真正释放
dev_kfree_skb

缓冲区克隆

skb_clone

sk_buff

/** 
 *   struct sk_buff - socket buffer
 *   @next: Next buffer in list
 *   @prev: Previous buffer in list
 *   @tstamp: Time we arrived
 *   @sk: Socket we are owned by
 *   @dev: Device we arrived on/are leaving by
 *   @cb: Control buffer. Free for use by every layer. Put private vars here
 *   @_skb_refdst: destination entry (with norefcount bit)
 *   @sp: the security path, used for xfrm
 *   @len: Length of actual data
 *   @data_len: Data length
 *   @mac_len: Length of link layer header
 *   @hdr_len: writable header length of cloned skb
 *   @csum: Checksum (must include start/offset pair)
 *   @csum_start: Offset from skb->head where checksumming should start
 *   @csum_offset: Offset from csum_start where checksum should be stored
 *   @priority: Packet queueing priority
 *   @local_df: allow local fragmentation
 *   @cloned: Head may be cloned (check refcnt to be sure)
 *   @ip_summed: Driver fed us an IP checksum
 *   @nohdr: Payload reference only, must not modify header
 *   @nfctinfo: Relationship of this skb to the connection
 *   @pkt_type: Packet class
 *   @fclone: skbuff clone status
 *   @ipvs_property: skbuff is owned by ipvs
 *   @peeked: this packet has been seen already, so stats have been
 *        done for it, don't do them again
 *   @nf_trace: netfilter packet trace flag
 *   @protocol: Packet protocol from driver
 *   @nfct: Associated connection, if any
 *   @nfct_reasm: netfilter conntrack re-assembly pointer
 *   @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *   @skb_iif: ifindex of device we arrived on
 *   @tc_index: Traffic control index
 *   @tc_verd: traffic control verdict
 *   @rxhash: the packet hash computed on receive
 *   @queue_mapping: Queue mapping for multiqueue devices
 *   @ndisc_nodetype: router type (from link layer)
 *   @ooo_okay: allow the mapping of a socket to a queue to be changed
 *   @l4_rxhash: indicate rxhash is a canonical 4-tuple hash over transport
 *        ports.
 *   @wifi_acked_valid: wifi_acked was set
 *   @wifi_acked: whether frame was acked on wifi or not
 *   @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *   @dma_cookie: a cookie to one of several possible DMA operations
 *        done by skb DMA functions
 *   @secmark: security marking
 *   @mark: Generic packet mark
 *   @dropcount: total number of sk_receive_queue overflows
 *   @vlan_tci: vlan tag control information
 *   @inner_transport_header: Inner transport layer header (encapsulation)
 *   @inner_network_header: Inner network layer header (encapsulation)
 *   @transport_header: Transport layer header
 *   @network_header: Network layer header
 *   @mac_header: Link layer header
 *   @tail: Tail pointer
 *   @end: End pointer
 *   @head: Head of buffer
 *   @data: Data head pointer
 *   @truesize: Buffer size
 *   @users: User count - see {datagram,tcp}.c
 *   @destructor: Destruct function
 */

struct sk_buff {
     /* These two members must be first. */
     struct sk_buff      *next;
     struct sk_buff      *prev;

     ktime_t             tstamp; // time at which the packet was received

     struct sock         *sk; // socket that owns this skb; used by layer 4
     struct net_device   *dev; // network device: the output device when sending, the input device when receiving

     /*
      * This is the control buffer. It is free to use for every
      * layer. Please put your private variables there. If you
      * want to keep them across layers you have to do a skb_clone()
      * first. This is owned by whoever has the skb queued ATM.
      */
     char           cb[48] __aligned(8); // holds the control information of each layer

     unsigned long       _skb_refdst; // used by the routing subsystem; holds routing (destination entry) information
#ifdef CONFIG_XFRM
     struct    sec_path  *sp;
#endif
     unsigned int        len, // length of the data currently in the skb; changes as the skb moves from one layer to another
                    data_len; // length of the fragment data
     __u16               mac_len, // length of the MAC (link layer) header
                    hdr_len; // header length of a cloned skb
     union {
          __wsum         csum;
          struct {
               __u16     csum_start;
               __u16     csum_offset;
          };
     };
     __u32               priority; // packet priority, used for QoS
     kmemcheck_bitfield_begin(flags1);
     __u8           local_df:1, // whether local fragmentation is allowed
                    cloned:1, // whether the head may be cloned
                    ip_summed:2, // whether the hardware/driver has already done the checksum
                    nohdr:1, // payload reference only: the header must not be modified
                    nfctinfo:3; // relationship of this skb to the connection
     __u8           pkt_type:3, // packet class (type)
                    fclone:2, // clone status; set when fast clone is used
                    ipvs_property:1, // skb is owned by ipvs
                    peeked:1,
                    nf_trace:1;
     kmemcheck_bitfield_end(flags1);
     __be16              protocol; // layer 3 protocol

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
     struct nf_conntrack *nfct;
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
     struct sk_buff      *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
     struct nf_bridge_info    *nf_bridge;
#endif

     int            skb_iif; // ifindex of the receiving device

     __u32               rxhash;

     __u16               vlan_tci;

// fields related to traffic control
#ifdef CONFIG_NET_SCHED
     __u16               tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
     __u16               tc_verd;  /* traffic control verdict */
#endif
#endif

     __u16               queue_mapping; // queue mapping for multiqueue devices (which queue the skb maps to)
     kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
     __u8           ndisc_nodetype:2;
#endif
     __u8           pfmemalloc:1;
     __u8           ooo_okay:1;
     __u8           l4_rxhash:1;
     __u8           wifi_acked_valid:1;
     __u8           wifi_acked:1;
     __u8           no_fcs:1;
     __u8           head_frag:1;
     /* Encapsulation protocol and NIC drivers should use
      * this flag to indicate to each other if the skb contains
      * encapsulated packet or not and maybe use the inner packet
      * headers if needed
      */
     __u8           encapsulation:1;
     /* 7/9 bit hole (depending on ndisc_nodetype presence) */
     kmemcheck_bitfield_end(flags2);

#ifdef CONFIG_NET_DMA
     dma_cookie_t        dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
     __u32               secmark;
#endif
     union {
          __u32          mark; // generic mark of the skb
          __u32          dropcount;
          __u32          reserved_tailroom;
     };

     sk_buff_data_t      inner_transport_header;
     sk_buff_data_t      inner_network_header;
     sk_buff_data_t      transport_header; // transport layer header
     sk_buff_data_t      network_header; // network layer header
     sk_buff_data_t      mac_header; // link layer header
     /* These elements must be at the end, see alloc_skb() for details.  */
     sk_buff_data_t      tail; // tail pointer used when manipulating the skb data
     sk_buff_data_t      end;
     unsigned char       *head,
                    *data;
     unsigned int        truesize; // size of the whole skb
     atomic_t       users; // reference count of the skb

     void           (*destructor)(struct sk_buff *skb); // destructor of the skb (sock_rfree and sock_wfree)
};

skb_put

/*
 * Extend the used data area of @skb by @len bytes at the tail.
 *
 * Both skb->tail and skb->len grow by @len. The return value is the tail
 * position as it was before the call, i.e. where the caller may write the
 * appended bytes. No check against skb->end is visible in this helper.
 * NOTE(review): SKB_LINEAR_ASSERT presumably rejects non-linear skbs -
 * confirm against its definition.
 */
static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len)
{
    unsigned char *old_tail;

    old_tail = skb_tail_pointer(skb);
    SKB_LINEAR_ASSERT(skb);

    skb->len  += len;
    skb->tail += len;

    return old_tail;
}

skb_push

/*
 * Prepend @len bytes to the data area of @skb.
 *
 * skb->data moves back by @len and skb->len grows by @len. Returns the
 * new skb->data. This helper performs no headroom check itself.
 */
static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len)
{
    unsigned char *new_data = skb->data - len;

    skb->data = new_data;
    skb->len += len;

    return new_data;
}

skb_pull

/*
 * Strip @len bytes from the front of the data area of @skb.
 *
 * skb->len shrinks by @len first; it is a BUG if the result drops below
 * skb->data_len. skb->data then advances by @len and its new value is
 * returned.
 */
static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len)
{
    skb->len -= len;
    BUG_ON(skb->len < skb->data_len);

    skb->data += len;
    return skb->data;
}

alloc_skb

/*
 * __alloc_skb - allocate an sk_buff descriptor together with its data buffer.
 * @size:     requested size of the data area; rounded up with SKB_DATA_ALIGN
 * @gfp_mask: allocation flags (__GFP_DMA is masked off for the descriptor)
 * @fclone:   non-zero: take the descriptor from skbuff_fclone_cache and set
 *            up the companion descriptor that follows it; zero: use
 *            skbuff_head_cache
 * @node:     node argument handed to both node-aware allocators
 *
 * Returns the initialised skb, or NULL when either the descriptor or the
 * data buffer cannot be allocated (the descriptor is returned to its cache
 * in the latter case).
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                int fclone, int node)
{
    struct kmem_cache *cache;
    struct skb_shared_info *shinfo;
    struct sk_buff *skb;
    u8 *data;

// fclone decides whether to allocate from the fclone cache or from the head cache
    cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;

// first allocate the skb itself, i.e. the packet descriptor
    skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
    if (!skb)
        goto out;
// align size first (presumably to the L1 cache line size - confirm against SKB_DATA_ALIGN)
    size = SKB_DATA_ALIGN(size);
// then allocate the data area: size bytes plus sizeof(struct skb_shared_info)
    data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
            gfp_mask, node);
    if (!data)
        goto nodata;

// initialise the descriptor: zero every field located before 'tail'
    memset(skb, 0, offsetof(struct sk_buff, tail));
// truesize covers what was allocated: the data area plus the sk_buff structure
    skb->truesize = size + sizeof(struct sk_buff);
// the new skb starts with one user
    atomic_set(&skb->users, 1);
// initially head and data point to the same address (the start of the buffer)
    skb->head = data;
    skb->data = data;
// set the tail pointer
    skb_reset_tail_pointer(skb);
// tail was just reset (presumably to data - confirm), so end lies size bytes beyond it
    skb->end = skb->tail + size;
    kmemcheck_annotate_bitfield(skb, flags1);
    kmemcheck_annotate_bitfield(skb, flags2);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
    skb->mac_header = ~0U;
#endif

// initialise shinfo; not explained here (the author analysed this structure in an earlier post on fragmentation)
    shinfo = skb_shinfo(skb);
    atomic_set(&shinfo->dataref, 1);
    shinfo->nr_frags  = 0;
    shinfo->gso_size = 0;
    shinfo->gso_segs = 0;
    shinfo->gso_type = 0;
    shinfo->ip6_frag_id = 0;
    shinfo->tx_flags.flags = 0;
    skb_frag_list_init(skb);
    memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));

// fclone set: an extra block of memory was allocated, so the fclone fields must be set up
    if (fclone) {
// the extra descriptor sits directly after the current skb
        struct sk_buff *child = skb + 1;
        atomic_t *fclone_ref = (atomic_t *) (child + 1);

        kmemcheck_annotate_bitfield(child, flags1);
        kmemcheck_annotate_bitfield(child, flags2);
// set the markers; note that the current skb and the extra skb get different fclone values
        skb->fclone = SKB_FCLONE_ORIG;
        atomic_set(fclone_ref, 1);

        child->fclone = SKB_FCLONE_UNAVAILABLE;
    }
out:
    return skb;
nodata:
    kmem_cache_free(cache, skb);
    skb = NULL;
    goto out;
}

skb_linearize

int skb_linearize(struct sk_buff *skb, gfp_t gfp) 

将分片中的数据(包括frags数组和frag_list链表里面的数据包)整合到线性数据区,合并成一个连续的报文

skb_clone


/*
 * skb_clone - obtain a second sk_buff descriptor for @skb.
 * @skb:      buffer to clone
 * @gfp_mask: allocation flags, used only when a descriptor must be allocated
 *
 * If @skb is marked SKB_FCLONE_ORIG and the descriptor directly behind it
 * is still SKB_FCLONE_UNAVAILABLE, that companion descriptor is claimed and
 * the counter stored after it is incremented. Otherwise a descriptor is
 * taken from skbuff_head_cache. The chosen descriptor is handed to
 * __skb_clone(), whose result is returned; NULL is returned when the
 * allocation fails.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
    /* Candidate: the memory immediately following skb. */
    struct sk_buff *clone = skb + 1;

    if (skb->fclone == SKB_FCLONE_ORIG &&
        clone->fclone == SKB_FCLONE_UNAVAILABLE) {
        /* Companion descriptor is free: use it, no allocation needed. */
        atomic_t *fclone_ref = (atomic_t *) (clone + 1);

        clone->fclone = SKB_FCLONE_CLONE;
        atomic_inc(fclone_ref);

        return __skb_clone(clone, skb);
    }

    /* No usable companion: take a fresh descriptor from the head cache. */
    clone = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
    if (!clone)
        return NULL;

    kmemcheck_annotate_bitfield(clone, flags1);
    kmemcheck_annotate_bitfield(clone, flags2);
    /* Descriptors that were not fast-clone allocated carry this marker. */
    clone->fclone = SKB_FCLONE_UNAVAILABLE;

    return __skb_clone(clone, skb);
}

只是复制sk_buff结构,并不复制skb的数据缓冲区。Clone后的sk_buff结构与原始的sk_buff指向同一数据缓冲区。原始的和clone后的skb描述符的cloned值都会被置1,clone的skb描述符的users值置1,同时数据缓冲区的引用计数dataref增加1
skb_clone()操作的skb结构的数据缓冲区是不能被修改的

pskb_copy

/*
 * pskb_copy - copy the descriptor and the linear data of @skb.
 * @skb:      buffer to copy
 * @gfp_mask: allocation flags for the new buffer
 *
 * A new buffer covering head..end of @skb is allocated and the linear data
 * is copied into it at the same offset from head. Entries of the frags
 * array are copied by value and get_page() is called on each page, so the
 * fragment pages themselves are not duplicated. Returns the new skb, or
 * NULL if the allocation fails.
 */
struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
{
    struct sk_buff *copy;

    /* Allocate a buffer as large as the original head..end area. */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
    copy = alloc_skb(skb->end, gfp_mask);
#else
    copy = alloc_skb(skb->end - skb->head, gfp_mask);
#endif
    if (!copy)
        return copy;

    /* Same headroom as the original, then room for the linear part. */
    skb_reserve(copy, skb->data - skb->head);
    skb_put(copy, skb_headlen(skb));

    /* Duplicate the linear data segment. */
    skb_copy_from_linear_data(skb, copy->data, copy->len);

    /* Account for the fragment data that is referenced, not copied. */
    copy->truesize += skb->data_len;
    copy->data_len  = skb->data_len;
    copy->len       = skb->len;

    /* Only the fragment descriptors are copied; each page gains a reference. */
    if (skb_shinfo(skb)->nr_frags) {
        int idx = 0;

        while (idx < skb_shinfo(skb)->nr_frags) {
            skb_shinfo(copy)->frags[idx] = skb_shinfo(skb)->frags[idx];
            get_page(skb_shinfo(copy)->frags[idx].page);
            idx++;
        }
        skb_shinfo(copy)->nr_frags = idx;
    }

    copy_skb_header(copy, skb);
    return copy;
}

与skb_clone()不同,pskb_copy()与skb_copy()属于重量级拷贝:除了sk_buff结构之外,还拷贝skb->data指向的数据。其中pskb_copy()只拷贝线性数据区,分片数据只复制页指针;skb_copy()则拷贝全部数据

skb_copy

/*
 * skb_copy - make a complete private copy of @skb, data included.
 * @skb:      buffer to copy
 * @gfp_mask: allocation flags for the new buffer
 *
 * The new buffer is sized for the original head..end area plus data_len.
 * The original headroom and all skb->len bytes of data are copied with
 * skb_copy_bits(); a failure of that copy is treated as a BUG.
 * Returns the new skb, or NULL if the allocation fails.
 */
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
    struct sk_buff *copy;
    int headerlen;

    headerlen = skb->data - skb->head;

    /* Allocate the destination buffer. */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
    copy = alloc_skb(skb->end + skb->data_len, gfp_mask);
#else
    copy = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
#endif
    if (!copy)
        return NULL;

    /* Reproduce the headroom, then make room for every data byte. */
    skb_reserve(copy, headerlen);
    skb_put(copy, skb->len);

    /* Copy headroom and data in one go, starting at the original head. */
    if (skb_copy_bits(skb, -headerlen, copy->head, headerlen + skb->len) != 0)
        BUG();

    copy_skb_header(copy, skb);
    return copy;
}

kfree_skb

/*
 * kfree_skb - drop one reference to @skb and free it when no user remains.
 * @skb: buffer to release; NULL is accepted and ignored
 *
 * When the user count reads exactly 1 the atomic decrement is skipped: a
 * read barrier is issued and the skb goes straight to __kfree_skb().
 * Otherwise the count is decremented, and the function returns early when
 * atomic_dec_and_test() reports false.
 */
void kfree_skb(struct sk_buff *skb)
{
    if (unlikely(!skb))
        return;
    if (likely(atomic_read(&skb->users) == 1))
        smp_rmb();
// decrement the count, then test the result
    else if (likely(!atomic_dec_and_test(&skb->users)))
        return;
    trace_kfree_skb(skb, __builtin_return_address(0));
    __kfree_skb(skb);
}

其他函数

#ifdef NET_SKBUFF_DATA_USES_OFFSET
/*
 * Header accessors for the variant in which transport_header,
 * network_header and mac_header are stored as offsets from skb->head.
 */

/* Address of the transport layer header. */
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
    return &skb->head[skb->transport_header];
}

/* Point the transport header at the current skb->data. */
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
    skb->transport_header = skb->data - skb->head;
}

/* Point the transport header @offset bytes away from skb->data. */
static inline void skb_set_transport_header(struct sk_buff *skb,
         const int offset)
{
    skb->transport_header = skb->data - skb->head;
    skb->transport_header += offset;
}

/* Address of the network layer header. */
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
    return &skb->head[skb->network_header];
}

/* Point the network header at the current skb->data. */
static inline void skb_reset_network_header(struct sk_buff *skb)
{
    skb->network_header = skb->data - skb->head;
}

/* Point the network header @offset bytes away from skb->data. */
static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
{
    skb->network_header = skb->data - skb->head;
    skb->network_header += offset;
}

/* Address of the link layer header. */
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
    return &skb->head[skb->mac_header];
}

/* Non-zero unless mac_header still holds the ~0U marker. */
static inline int skb_mac_header_was_set(const struct sk_buff *skb)
{
    return !(skb->mac_header == ~0U);
}

/* Point the link layer header at the current skb->data. */
static inline void skb_reset_mac_header(struct sk_buff *skb)
{
    skb->mac_header = skb->data - skb->head;
}

/* Point the link layer header @offset bytes away from skb->data. */
static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
{
    skb->mac_header = skb->data - skb->head;
    skb->mac_header += offset;
}
#else /* NET_SKBUFF_DATA_USES_OFFSET */
/* 不使用相对偏移的情况:各头部字段直接保存指针 */
/*
 * Header accessors for the variant in which transport_header,
 * network_header and mac_header are stored directly as pointers.
 */

/* Address of the transport layer header. */
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
    return skb->transport_header;
}

/* Point the transport header at the current skb->data. */
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
    skb->transport_header = skb->data;
}

/* Point the transport header @offset bytes away from skb->data. */
static inline void skb_set_transport_header(struct sk_buff *skb,
         const int offset)
{
    skb_reset_transport_header(skb);
    skb->transport_header += offset;
}

/* Address of the network layer header. */
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
    return skb->network_header;
}

/* Point the network header at the current skb->data. */
static inline void skb_reset_network_header(struct sk_buff *skb)
{
    skb->network_header = skb->data;
}

/* Point the network header @offset bytes away from skb->data. */
static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
{
    skb_reset_network_header(skb);
    skb->network_header += offset;
}

/* Address of the link layer header. */
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
    return skb->mac_header;
}

/* Non-zero once mac_header holds a non-NULL pointer. */
static inline int skb_mac_header_was_set(const struct sk_buff *skb)
{
    return skb->mac_header ? 1 : 0;
}

/* Point the link layer header at the current skb->data. */
static inline void skb_reset_mac_header(struct sk_buff *skb)
{
    skb->mac_header = skb->data;
}

/* Point the link layer header @offset bytes away from skb->data. */
static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
{
    skb_reset_mac_header(skb);
    skb->mac_header += offset;
}
#endif /* NET_SKBUFF_DATA_USES_OFFSET */
//获得sk_buff结构中TCP头的指针
static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
    return (struct tcphdr *)skb_transport_header(skb);
}

//获得TCP头的长度
static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
{
    return tcp_hdr(skb)->doff * 4;
}

//获取tcp option的长度
static inline unsigned int tcp_optlen(const struct sk_buff *skb)
{
    return (tcp_hdr(skb)->doff - 5) * 4;
}

//获得ip头
static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
{
    return (struct iphdr *)skb_network_header(skb);
}

//获得ipip头
static inline struct iphdr *ipip_hdr(const struct sk_buff *skb)
{
    return (struct iphdr *)skb_transport_header(skb);
}

//获取802.3MAC头指针
static inline struct ebt_802_3_hdr *ebt_802_3_hdr(const struct sk_buff *skb)
{
    return (struct ebt_802_3_hdr *)skb_mac_header(skb);
}

//获取以太网MAC头指针
static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
{
    return (struct ethhdr *)skb_mac_header(skb);
}

skb_copy_expand

/*
 * skb_copy_expand - copy an skb into a new buffer with the requested
 *                   headroom and tailroom.
 * @skb:         buffer to copy
 * @newheadroom: bytes to leave free in front of the data in the copy
 * @newtailroom: bytes to leave free behind the data in the copy
 * @gfp_mask:    allocation flags for the new buffer
 *
 * Returns the new skb, or NULL if the allocation fails. A failure of
 * skb_copy_bits() is treated as a BUG.
 *
 * The body follows the 2.6 kernel: besides the data it also carries over
 * as much of the existing headroom as fits into @newheadroom. The 2.4
 * kernel copied the data only, with
 *     skb_copy_bits(skb, 0, n->data, skb->len)
 * That call is not repeated here because the copy below already places
 * the data at n->data; doing both would copy the payload twice.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
                                int newheadroom,
                                int newtailroom,
                                int gfp_mask)
{
    struct sk_buff *n;
    int head_copy_len;  /* bytes of old headroom carried over */
    int head_copy_off;  /* offset of those bytes from n->head */

    /*
     *        Allocate the copy buffer
     */
    n = alloc_skb(newheadroom + skb->len + newtailroom,
                  gfp_mask);
    if (n == NULL)
        return NULL;

    skb_reserve(n, newheadroom);

    /* Set the tail pointer and length */
    skb_put(n, skb->len);

    /*
     * Keep at most newheadroom bytes of the old headroom. If the new
     * headroom is larger, the kept bytes are shifted so that they still
     * end exactly at n->data.
     */
    head_copy_len = skb_headroom(skb);
    head_copy_off = 0;
    if (newheadroom <= head_copy_len)
        head_copy_len = newheadroom;
    else
        head_copy_off = newheadroom - head_copy_len;

    /* Copy the linear header and data. */
    if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
                      skb->len + head_copy_len))
        BUG();

    copy_skb_header(n, skb);
    return n;
}

改变skb的数据域

先判断skb的tailroom,如果空间够大,则我们可以把需要添加的数据放在skb的tailroom里。如果tailroom不够大,则需要调用skb_copy_expand函数来扩充tailroom或者headroom

if(skb_tailroom(skb) < 16)  
{  
    nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + 16, GFP_ATOMIC);  
    if(!nskb)  
    {  
        printk("low memory..../n");          
        dev_kfree_skb(skb);  
        return -1;  
    }  

    else  
    {  
        kfree_skb(skb);
        skb = nskb;  
    }  

    memcpy(skb_put(skb, 16), data, 16); //拷贝数据到skb tail  
}
原文地址:https://www.cnblogs.com/zhangxuechao/p/11709834.html