kernel socket

/** defined in include/linux/skbuff.h
0326  *  struct sk_buff - socket buffer 内核中的socket buffer
0327  *  @next: Next buffer in list    |
0328  *  @prev: Previous buffer in list  | 链
0329  *  @tstamp: Time we arrived      | 包的到达时间
0330  *  @sk: Socket we are owned by
0331  *  @dev: Device we arrived on/are leaving by
0332  *  @cb: Control buffer. Free for use by every layer. Put private vars here
0333  *  @_skb_refdst: destination entry (with norefcount bit)
0334  *  @sp: the security path, used for xfrm
0335  *  @len: Length of actual data
0336  *  @data_len: Data length
0337  *  @mac_len: Length of link layer header
0338  *  @hdr_len: writable header length of cloned skb
0339  *  @csum: Checksum (must include start/offset pair)
0340  *  @csum_start: Offset from skb->head where checksumming should start
0341  *  @csum_offset: Offset from csum_start where checksum should be stored
0342  *  @priority: Packet queueing priority
0343  *  @local_df: allow local fragmentation
0344  *  @cloned: Head may be cloned (check refcnt to be sure)
0345  *  @ip_summed: Driver fed us an IP checksum
0346  *  @nohdr: Payload reference only, must not modify header
0347  *  @nfctinfo: Relationship of this skb to the connection
0348  *  @pkt_type: Packet class
0349  *  @fclone: skbuff clone status
0350  *  @ipvs_property: skbuff is owned by ipvs
0351  *  @peeked: this packet has been seen already, so stats have been
0352  *      done for it, don't do them again
0353  *  @nf_trace: netfilter packet trace flag
0354  *  @protocol: Packet protocol from driver
0355  *  @destructor: Destruct function
0356  *  @nfct: Associated connection, if any
0357  *  @nfct_reasm: netfilter conntrack re-assembly pointer
0358  *  @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
0359  *  @skb_iif: ifindex of device we arrived on
0360  *  @tc_index: Traffic control index
0361  *  @tc_verd: traffic control verdict
0362  *  @rxhash: the packet hash computed on receive
0363  *  @queue_mapping: Queue mapping for multiqueue devices
0364  *  @ndisc_nodetype: router type (from link layer)
0365  *  @ooo_okay: allow the mapping of a socket to a queue to be changed
0366  *  @l4_rxhash: indicate rxhash is a canonical 4-tuple hash over transport
0367  *      ports.
0368  *  @wifi_acked_valid: wifi_acked was set
0369  *  @wifi_acked: whether frame was acked on wifi or not
0370  *  @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
0371  *  @dma_cookie: a cookie to one of several possible DMA operations
0372  *      done by skb DMA functions
0373  *  @secmark: security marking
0374  *  @mark: Generic packet mark
0375  *  @dropcount: total number of sk_receive_queue overflows
0376  *  @vlan_tci: vlan tag control information
0377  *  @transport_header: Transport layer header
0378  *  @network_header: Network layer header
0379  *  @mac_header: Link layer header
0380  *  @tail: Tail pointer
0381  *  @end: End pointer
0382  *  @head: Head of buffer
0383  *  @data: Data head pointer
0384  *  @truesize: Buffer size
0385  *  @users: User count - see {datagram,tcp}.c
0386  */
0387 
0388 struct sk_buff {
0389     /* These two members must be first. */
0390     struct sk_buff      *next;
0391     struct sk_buff      *prev;
0392 
0393     ktime_t         tstamp;
0394 
0395     struct sock     *sk;
0396     struct net_device   *dev;
0397 
0398     /*
0399      * This is the control buffer. It is free to use for every
0400      * layer. Please put your private variables there. If you
0401      * want to keep them across layers you have to do a skb_clone()
0402      * first. This is owned by whoever has the skb queued ATM.
0403      */
0404     char            cb[48] __aligned(8);
0405 
0406     unsigned long       _skb_refdst;
0407 #ifdef CONFIG_XFRM
0408     struct  sec_path    *sp;
0409 #endif
0410     unsigned int        len,
0411                 data_len;
0412     __u16           mac_len,
0413                 hdr_len;
0414     union {
0415         __wsum      csum;
0416         struct {
0417             __u16   csum_start;
0418             __u16   csum_offset;
0419         };
0420     };
0421     __u32           priority;
0422     kmemcheck_bitfield_begin(flags1);
0423     __u8            local_df:1,
0424                 cloned:1,
0425                 ip_summed:2,
0426                 nohdr:1,
0427                 nfctinfo:3;
0428     __u8            pkt_type:3,
0429                 fclone:2,
0430                 ipvs_property:1,
0431                 peeked:1,
0432                 nf_trace:1;
0433     kmemcheck_bitfield_end(flags1);
0434     __be16          protocol;
0435 
0436     void            (*destructor)(struct sk_buff *skb);
0437 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
0438     struct nf_conntrack *nfct;
0439 #endif
0440 #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
0441     struct sk_buff      *nfct_reasm;
0442 #endif
0443 #ifdef CONFIG_BRIDGE_NETFILTER
0444     struct nf_bridge_info   *nf_bridge;
0445 #endif
0446 
0447     int         skb_iif;
0448 
0449     __u32           rxhash;
0450 
0451     __u16           vlan_tci;
0452 
0453 #ifdef CONFIG_NET_SCHED
0454     __u16           tc_index;   /* traffic control index */
0455 #ifdef CONFIG_NET_CLS_ACT
0456     __u16           tc_verd;    /* traffic control verdict */
0457 #endif
0458 #endif
0459 
0460     __u16           queue_mapping;
0461     kmemcheck_bitfield_begin(flags2);
0462 #ifdef CONFIG_IPV6_NDISC_NODETYPE
0463     __u8            ndisc_nodetype:2;
0464 #endif
0465     __u8            ooo_okay:1;
0466     __u8            l4_rxhash:1;
0467     __u8            wifi_acked_valid:1;
0468     __u8            wifi_acked:1;
0469     __u8            no_fcs:1;
0470     __u8            head_frag:1;
0471     /* 8/10 bit hole (depending on ndisc_nodetype presence) */
0472     kmemcheck_bitfield_end(flags2);
0473 
0474 #ifdef CONFIG_NET_DMA
0475     dma_cookie_t        dma_cookie;
0476 #endif
0477 #ifdef CONFIG_NETWORK_SECMARK
0478     __u32           secmark;
0479 #endif
0480     union {
0481         __u32       mark;
0482         __u32       dropcount;
0483         __u32       avail_size;
0484     };
0485 
0486     sk_buff_data_t      transport_header;
0487     sk_buff_data_t      network_header;
0488     sk_buff_data_t      mac_header;
0489     /* These elements must be at the end, see alloc_skb() for details.  */
0490     sk_buff_data_t      tail;
0491     sk_buff_data_t      end;
0492     unsigned char       *head,
0493                 *data;
0494     unsigned int        truesize;
0495     atomic_t        users;
0496 };


1040 struct net_device {
1041 
1042     /*
1043      * This is the first field of the "visible" part of this structure
1044      * (i.e. as seen by users in the "Space.c" file).  It is the name
1045      * of the interface.
1046      */
1047     char            name[IFNAMSIZ];
1048 
1049     struct pm_qos_request   pm_qos_req;
1050 
1051     /* device name hash chain */
1052     struct hlist_node   name_hlist;
1053     /* snmp alias */
1054     char            *ifalias;
1055 
1056     /*
1057      *  I/O specific fields
1058      *  FIXME: Merge these and struct ifmap into one
1059      */
1060     unsigned long       mem_end;    /* shared mem end   */
1061     unsigned long       mem_start;  /* shared mem start */
1062     unsigned long       base_addr;  /* device I/O address   */
1063     unsigned int        irq;        /* device IRQ number    */
1064 
1065     /*
1066      *  Some hardware also needs these fields, but they are not
1067      *  part of the usual set specified in Space.c.
1068      */
1069 
1070     unsigned long       state;
1071 
1072     struct list_head    dev_list;
1073     struct list_head    napi_list;
1074     struct list_head    unreg_list;
1075 
1076     /* currently active device features */
1077     netdev_features_t   features;
1078     /* user-changeable features */
1079     netdev_features_t   hw_features;
1080     /* user-requested features */
1081     netdev_features_t   wanted_features;
1082     /* mask of features inheritable by VLAN devices */
1083     netdev_features_t   vlan_features;
1084 
1085     /* Interface index. Unique device identifier    */
1086     int         ifindex;
1087     int         iflink;
1088 
1089     struct net_device_stats stats;
1090     atomic_long_t       rx_dropped; /* dropped packets by core network
1091                          * Do not use this in drivers.
1092                          */
1093 
1094 #ifdef CONFIG_WIRELESS_EXT
1095     /* List of functions to handle Wireless Extensions (instead of ioctl).
1096      * See <net/iw_handler.h> for details. Jean II */
1097     const struct iw_handler_def *   wireless_handlers;
1098     /* Instance data managed by the core of Wireless Extensions. */
1099     struct iw_public_data * wireless_data;
1100 #endif
1101     /* Management operations */
1102     const struct net_device_ops *netdev_ops;
1103     const struct ethtool_ops *ethtool_ops;
1104 
1105     /* Hardware header description */
1106     const struct header_ops *header_ops;
1107 
1108     unsigned int        flags;  /* interface flags (a la BSD)   */
1109     unsigned int        priv_flags; /* Like 'flags' but invisible to userspace.
1110                          * See if.h for definitions. */
1111     unsigned short      gflags;
1112     unsigned short      padded; /* How much padding added by alloc_netdev() */
1113 
1114     unsigned char       operstate; /* RFC2863 operstate */
1115     unsigned char       link_mode; /* mapping policy to operstate */
1116 
1117     unsigned char       if_port;    /* Selectable AUI, TP,..*/
1118     unsigned char       dma;        /* DMA channel      */
1119 
1120     unsigned int        mtu;    /* interface MTU value      */
1121     unsigned short      type;   /* interface hardware type  */
1122     unsigned short      hard_header_len;    /* hardware hdr length  */
1123 
1124     /* extra head- and tailroom the hardware may need, but not in all cases
1125      * can this be guaranteed, especially tailroom. Some cases also use
1126      * LL_MAX_HEADER instead to allocate the skb.
1127      */
1128     unsigned short      needed_headroom;
1129     unsigned short      needed_tailroom;
1130 
1131     /* Interface address info. */
1132     unsigned char       perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
1133     unsigned char       addr_assign_type; /* hw address assignment type */
1134     unsigned char       addr_len;   /* hardware address length  */
1135     unsigned char       neigh_priv_len;
1136     unsigned short          dev_id;     /* for shared network cards */
1137 
1138     spinlock_t      addr_list_lock;
1139     struct netdev_hw_addr_list  uc; /* Unicast mac addresses */
1140     struct netdev_hw_addr_list  mc; /* Multicast mac addresses */
1141     bool            uc_promisc;
1142     unsigned int        promiscuity;
1143     unsigned int        allmulti;
1144 
1145 
1146     /* Protocol specific pointers */
1147 
1148 #if IS_ENABLED(CONFIG_VLAN_8021Q)
1149     struct vlan_info __rcu  *vlan_info; /* VLAN info */
1150 #endif
1151 #if IS_ENABLED(CONFIG_NET_DSA)
1152     struct dsa_switch_tree  *dsa_ptr;   /* dsa specific data */
1153 #endif
1154     void            *atalk_ptr; /* AppleTalk link   */
1155     struct in_device __rcu  *ip_ptr;    /* IPv4 specific data   */
1156     struct dn_dev __rcu     *dn_ptr;        /* DECnet specific data */
1157     struct inet6_dev __rcu  *ip6_ptr;       /* IPv6 specific data */
1158     void            *ax25_ptr;  /* AX.25 specific data */
1159     struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
1160                            assign before registering */
1161 
1162 /*
1163  * Cache lines mostly used on receive path (including eth_type_trans())
1164  */
1165     unsigned long       last_rx;    /* Time of last Rx
1166                          * This should not be set in
1167                          * drivers, unless really needed,
1168                          * because network stack (bonding)
1169                          * use it if/when necessary, to
1170                          * avoid dirtying this cache line.
1171                          */
1172 
1173     struct net_device   *master; /* Pointer to master device of a group,
1174                       * which this device is member of.
1175                       */
1176 
1177     /* Interface address info used in eth_type_trans() */
1178     unsigned char       *dev_addr;  /* hw address, (before bcast
1179                            because most packets are
1180                            unicast) */
1181 
1182     struct netdev_hw_addr_list  dev_addrs; /* list of device
1183                               hw addresses */
1184 
1185     unsigned char       broadcast[MAX_ADDR_LEN];    /* hw bcast add */
1186 
1187 #ifdef CONFIG_SYSFS
1188     struct kset     *queues_kset;
1189 #endif
1190 
1191 #ifdef CONFIG_RPS
1192     struct netdev_rx_queue  *_rx;
1193 
1194     /* Number of RX queues allocated at register_netdev() time */
1195     unsigned int        num_rx_queues;
1196 
1197     /* Number of RX queues currently active in device */
1198     unsigned int        real_num_rx_queues;
1199 
1200 #ifdef CONFIG_RFS_ACCEL
1201     /* CPU reverse-mapping for RX completion interrupts, indexed
1202      * by RX queue number.  Assigned by driver.  This must only be
1203      * set if the ndo_rx_flow_steer operation is defined. */
1204     struct cpu_rmap     *rx_cpu_rmap;
1205 #endif
1206 #endif
1207 
1208     rx_handler_func_t __rcu *rx_handler;
1209     void __rcu      *rx_handler_data;
1210 
1211     struct netdev_queue __rcu *ingress_queue;
1212 
1213 /*
1214  * Cache lines mostly used on transmit path
1215  */
1216     struct netdev_queue *_tx ____cacheline_aligned_in_smp;
1217 
1218     /* Number of TX queues allocated at alloc_netdev_mq() time  */
1219     unsigned int        num_tx_queues;
1220 
1221     /* Number of TX queues currently active in device  */
1222     unsigned int        real_num_tx_queues;
1223 
1224     /* root qdisc from userspace point of view */
1225     struct Qdisc        *qdisc;
1226 
1227     unsigned long       tx_queue_len;   /* Max frames per queue allowed */
1228     spinlock_t      tx_global_lock;
1229 
1230 #ifdef CONFIG_XPS
1231     struct xps_dev_maps __rcu *xps_maps;
1232 #endif
1233 
1234     /* These may be needed for future network-power-down code. */
1235 
1236     /*
1237      * trans_start here is expensive for high speed devices on SMP,
1238      * please use netdev_queue->trans_start instead.
1239      */
1240     unsigned long       trans_start;    /* Time (in jiffies) of last Tx */
1241 
1242     int         watchdog_timeo; /* used by dev_watchdog() */
1243     struct timer_list   watchdog_timer;
1244 
1245     /* Number of references to this device */
1246     int __percpu        *pcpu_refcnt;
1247 
1248     /* delayed register/unregister */
1249     struct list_head    todo_list;
1250     /* device index hash chain */
1251     struct hlist_node   index_hlist;
1252 
1253     struct list_head    link_watch_list;
1254 
1255     /* register/unregister state machine */
1256     enum { NETREG_UNINITIALIZED=0,
1257            NETREG_REGISTERED,   /* completed register_netdevice */
1258            NETREG_UNREGISTERING,    /* called unregister_netdevice */
1259            NETREG_UNREGISTERED, /* completed unregister todo */
1260            NETREG_RELEASED,     /* called free_netdev */
1261            NETREG_DUMMY,        /* dummy device for NAPI poll */
1262     } reg_state:8;
1263 
1264     bool dismantle; /* device is going do be freed */
1265 
1266     enum {
1267         RTNL_LINK_INITIALIZED,
1268         RTNL_LINK_INITIALIZING,
1269     } rtnl_link_state:16;
1270 
1271     /* Called from unregister, can be used to call free_netdev */
1272     void (*destructor)(struct net_device *dev);
1273 
1274 #ifdef CONFIG_NETPOLL
1275     struct netpoll_info *npinfo;
1276 #endif
1277 
1278 #ifdef CONFIG_NET_NS
1279     /* Network namespace this network device is inside */
1280     struct net      *nd_net;
1281 #endif
1282 
1283     /* mid-layer private */
1284     union {
1285         void                *ml_priv;
1286         struct pcpu_lstats __percpu *lstats; /* loopback stats */
1287         struct pcpu_tstats __percpu *tstats; /* tunnel stats */
1288         struct pcpu_dstats __percpu *dstats; /* dummy stats */
1289     };
1290     /* GARP */
1291     struct garp_port __rcu  *garp_port;
1292 
1293     /* class/net/name entry */
1294     struct device       dev;
1295     /* space for optional device, statistics, and wireless sysfs groups */
1296     const struct attribute_group *sysfs_groups[4];
1297 
1298     /* rtnetlink link ops */
1299     const struct rtnl_link_ops *rtnl_link_ops;
1300 
1301     /* for setting kernel sock attribute on TCP connection setup */
1302 #define GSO_MAX_SIZE        65536
1303     unsigned int        gso_max_size;
1304 #define GSO_MAX_SEGS        65535
1305     u16         gso_max_segs;
1306 
1307 #ifdef CONFIG_DCB
1308     /* Data Center Bridging netlink ops */
1309     const struct dcbnl_rtnl_ops *dcbnl_ops;
1310 #endif
1311     u8 num_tc;
1312     struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
1313     u8 prio_tc_map[TC_BITMASK + 1];
1314 
1315 #if IS_ENABLED(CONFIG_FCOE)
1316     /* max exchange id for FCoE LRO by ddp */
1317     unsigned int        fcoe_ddp_xid;
1318 #endif
1319 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1320     struct netprio_map __rcu *priomap;
1321 #endif
1322     /* phy device may attach itself for hardware timestamping */
1323     struct phy_device *phydev;
1324 
1325     /* group the device belongs to */
1326     int group;
1327 };

skb表示数据和头部信息
分配: alloc_skb() or dev_alloc_skb()
drivers use dev_alloc_skb()
释放: kfree_skb() and dev_kfree_skb()

三层
transport_header
network_header
mac_header

skb_transport_header(skb)
skb_network_header(skb)
skb_mac_header(skb)

// 路由
unsigned long _skb_refdst;
在较早的内核版本中, 这个字段是:
struct dst_entry *dst;
// 每个skb只有一个dst, dst有两个重要的函数
int (*input)(struct sk_buff*);
int (*output)(struct sk_buff*);

而net_device表示网络接口卡
//接口的mtu值
unsigned mtu; /* interface MTU value */
可以用命令改变: ifconfig wlan0 mtu 1400
// mac地址 6个字节
unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */
// 发送数据的函数 (旧版本内核; 在上面列出的版本中已移到 netdev_ops 里的 ndo_start_xmit)
int (*hard_start_xmit)(struct sk_buff *skb, struct net_device *dev);
// 是否处于混杂模式 (promiscuous mode); 这是一个计数器, 大于0即处于混杂模式, 用计数的方式来支持多个sniffer client
int promiscuity;

fib 首先查找local FIB table 然后查找main FIB table
-> cache lookup -->fib_lookup(){ ip_fib_local_table --> ip_fib_main_table}

LPM longest prefix match
netmask=0 default gateway (如果存在多个,第一个将会返回)

接收包处理
当工作在中断驱动模型中, nic驱动通过调用 request_irq() 为对应的中断号(IRQ)注册一个中断handler
the same interrupt handler will be called when a frame is received
当有一个包接收时 调用handler

在handler中调用dev_alloc_skb() 分配 sk_buff

iptables -A INPUT -p udp --dport 9999 -j DROP
默认表是filter表
这是 NF_IP_LOCAL_IN 规则
-->ip_rcv()->ip_rcv_finish()->ip_local_deliver() -x-> ip_local_deliver_finish()
--> nf_hook_slow()-->verdict==NF_DROP then kfree_skb(skb)












原文地址:https://www.cnblogs.com/kwingmei/p/3486085.html