TCP/IP源码学习(52)——TCP的连接过程的实现(1)

http://blog.chinaunix.net/uid-23629988-id-3178006.html

作者：gfree.wind@gmail.com
博客：blog.focus-linux.net linuxfocus.blog.chinaunix.net

本文的copyleft归gfree.wind@gmail.com所有，使用GPL发布，可以自由拷贝，转载。但转载请保持文档的完整性，注明原作者及原链接，严禁用于任何商业用途。

======================================================================================================

在以前的文章中，学习了UDP数据包的接收和发送。今天开始研究一下TCP数据包的接收。与UDP数据包类似，当IP数据包到达ip_local_deliver_finish函数时，会根据四层协议号从inet_protos数组中得到TCP协议对应的tcp_protocol。

    /*
     * Protocol descriptor for TCP. The article looks this entry up in
     * inet_protos; its .handler, tcp_v4_rcv, is the receive entry point.
     */
    static const struct net_protocol tcp_protocol = {
        /* receive and error callbacks */
        .handler        = tcp_v4_rcv,
        .err_handler    = tcp_v4_err,

        /* GSO callbacks */
        .gso_send_check = tcp_v4_gso_send_check,
        .gso_segment    = tcp_tso_segment,

        /* GRO callbacks */
        .gro_receive    = tcp4_gro_receive,
        .gro_complete   = tcp4_gro_complete,

        /* flags */
        .no_policy      = 1,
        .netns_ok       = 1,
    };

那么TCP数据包的接收函数入口即为tcp_v4_rcv

    /*
     * Receive entry point for IPv4 TCP segments (the .handler of tcp_protocol).
     *
     * The part shown here validates the TCP header, fills in the skb's TCP
     * control block, looks up the owning socket and then either processes the
     * segment immediately or queues it on the socket backlog. The tail of the
     * function, including every label targeted by the gotos below, is elided
     * from this excerpt.
     */
    int tcp_v4_rcv(struct sk_buff *skb)
    {
        const struct iphdr *iph;
        const struct tcphdr *th;
        struct sock *sk;
        int ret;
        struct net *net = dev_net(skb->dev);

     
     /* Only packets addressed to this host are accepted */

        if (skb->pkt_type != PACKET_HOST)
            goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

     
     /* The packet must hold at least a basic TCP header */

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
            goto discard_it;

        th = tcp_hdr(skb);


      /*
       * Validate the TCP header: the data offset must cover at least the
       * basic header, and the whole header (doff * 4 bytes) must be pullable.
       */

        if (th->doff < sizeof(struct tcphdr) / 4)
            goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
            goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */
        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
            goto bad_packet;

     /*
      * Record seq, end_seq, ack_seq, etc. in the skb's TCP control block
      * (TCP_SKB_CB(skb)). The header pointers are fetched again first.
      */

        th = tcp_hdr(skb);
        iph = ip_hdr(skb);
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                     skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when     = 0;
        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
        TCP_SKB_CB(skb)->sacked     = 0;

     
     /* 
     Look up the socket for this segment. Per the article, the lookup is keyed
     by source IP, destination IP, source port, destination port and the
     receiving interface, and involves two hash tables: one holding established
     TCP sessions and one holding listening ones. Both tables are left for a
     later article.
     */

        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
        if (!sk)
            goto no_tcp_socket;

    process:
        /* TIME_WAIT handling is left for a later article */
        if (sk->sk_state == TCP_TIME_WAIT)
            goto do_time_wait;

        /* Drop segments whose TTL is below the socket's configured minimum */
        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
            NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
            goto discard_and_relse;
        }

     /* IPsec policy check */

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
            goto discard_and_relse;
        nf_reset(skb);

     
     /* Socket filter: a non-zero result jumps to discard_and_relse */

        if (sk_filter(sk, skb))
            goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock_nested(sk);
        ret = 0;

     /* 
     If the socket is not currently owned by a user context, the skb is
     processed right here (via tcp_prequeue() or tcp_v4_do_rcv()); otherwise it
     is appended to the socket's backlog with sk_add_backlog().
     Per the article, this differs from UDP because TCP keeps internal state:
     handling a second segment in the middle of processing another one could
     change that state and make the interrupted processing fail. Deferring to
     the backlog keeps the handling of one segment from being interrupted.
     */

        if (!sock_owned_by_user(sk)) {
    #ifdef CONFIG_NET_DMA
            struct tcp_sock *tp = tcp_sk(sk);
            if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
            if (tp->ucopy.dma_chan)
                ret = tcp_v4_do_rcv(sk, skb);
            else
    #endif
            {
                if (!tcp_prequeue(sk, skb))
                    ret = tcp_v4_do_rcv(sk, skb);
            }
        } else if (unlikely(sk_add_backlog(sk, skb))) {
            /* Adding to the backlog failed: count the drop and discard */
            bh_unlock_sock(sk);
            NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
            goto discard_and_relse;
        }

        ...... ......

进入tcp_v4_do_rcv

    /*
     * Process one received segment for socket sk.
     *
     * ESTABLISHED sockets take the fast path through tcp_rcv_established().
     * LISTEN sockets first go through tcp_v4_hnd_req(), which selects the
     * socket that should handle the skb. All remaining cases end up in
     * tcp_rcv_state_process(). Every path visible here returns 0 unless it
     * jumps to the reset, discard or csum_err labels, which are elided from
     * this excerpt.
     */
    int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
    {
        struct sock *rsk;
    #ifdef CONFIG_TCP_MD5SIG
        /*
         * We really want to reject the packet as early as possible
         * if:
         * o We're expecting an MD5'd packet and this is no MD5 tcp option
         * o There is an MD5 option and we're not expecting one
         */
        if (tcp_v4_inbound_md5_hash(sk, skb))
            goto discard;
    #endif

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
            /* Connection already established; left for a later article */
            sock_rps_save_rxhash(sk, skb);
            if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
                rsk = sk;
                goto reset;
            }
            return 0;
        }

     

        /* Segment shorter than its own TCP header, or bad checksum */
        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
            goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
            /* 
            Handle a TCP request segment, i.e. one asking to connect to a local
            TCP port, and get back the socket that should process this skb.
            Per the article, for the first SYN the returned nsk is sk itself.
            */
            struct sock *nsk = tcp_v4_hnd_req(sk, skb);
            if (!nsk)
                goto discard;


         /* As noted above, for the first SYN nsk == sk, so execution falls through */

            if (nsk != sk) {
                sock_rps_save_rxhash(nsk, skb);
                if (tcp_child_process(sk, nsk, skb)) {
                    rsk = nsk;
                    goto reset;
                }
                return 0;
            }
        } else
            sock_rps_save_rxhash(sk, skb);


        /* A non-zero result from the state machine leads to the reset label */
        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
            rsk = sk;
            goto reset;
        }
        return 0;

        ...... ...... 
    }

进入tcp_rcv_state_process

    /*
     * State-dependent processing of a received segment. Only the TCP_CLOSE
     * and TCP_LISTEN cases are shown; the remainder of the switch and the
     * discard label are elided from this excerpt.
     *
     * In the LISTEN case shown here the function returns 1 for a segment with
     * ACK set or when conn_request() reports a negative result, and returns 0
     * after a SYN has been handed to conn_request() successfully.
     */
    int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                 const struct tcphdr *th, unsigned int len)
    {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        int queued = 0;
        int res;

        tp->rx_opt.saw_tstamp = 0;

        switch (sk->sk_state) {
        case TCP_CLOSE:
            goto discard;

        case TCP_LISTEN:
            /* Focus of this article: the first SYN segment arrives here */
             
            /* Invalid segment: in LISTEN state only SYN segments are handled */
            if (th->ack)
                return 1;

            if (th->rst)
                goto discard;

            if (th->syn) {
                /* The first SYN: hand it to the address-family conn_request hook */
                if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
                    return 1;

                /* Now we have several options: In theory there is
                 * nothing else in the frame. KA9Q has an option to
                 * send data with the syn, BSD accepts data with the
                 * syn up to the [to be] advertised window and
                 * Solaris 2.1 gives you a protocol error. For now
                 * we just ignore it, that fits the spec precisely
                 * and avoids incompatibilities. It would be nice in
                 * future to drop through and process the data.
                 *
                 * Now that TTCP is starting to be used we ought to
                 * queue this data.
                 * But, this leaves one open to an easy denial of
                 * service attack, and SYN cookies can't defend
                 * against this problem. So, we drop the data
                 * in the interest of security over speed unless
                 * it's still in use.
                 */
                kfree_skb(skb);
                return 0;
            }
            goto discard;

     ......  ......
     ......  ......

    }

对于IPv4的TCP数据包，conn_request为tcp_v4_conn_request

    int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
    {
        struct tcp_extend_values tmp_ext;
        struct tcp_options_received tmp_opt;
        const u8 *hash_location;
        struct request_sock *req;
        struct inet_request_sock *ireq;
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = NULL;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        int want_cookie = 0;

        /* Never answer to SYNs send to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
            goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        //检查syn queue是否已满，即request queue是否已满
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
            /* 是否使用sync cookie */
            want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
            if (!want_cookie)
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        //检查accept queue是否已满
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
            goto drop;

     
     //申请一个新的request_sock

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
            goto drop;

    #ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
    #endif

     //解析TCP的option

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss = tp->rx_opt.user_mss;
        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);


        if (tmp_opt.cookie_plus > 0 &&
         tmp_opt.saw_tstamp &&
         !tp->rx_opt.cookie_out_never &&
         (sysctl_tcp_cookie_size > 0 ||
         (tp->cookie_values != NULL &&
         tp->cookie_values->cookie_desired > 0))) {
            /* 
            不太确定这部分代码的用途，看上去跟sync cookie相关
            貌似是为了检查sync-cookie。
            */
            u8 *c;
            u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
            int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

            if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
                goto drop_and_release;

            /* Secret recipe starts with IP addresses */
            *mess++ ^= (__force u32)daddr;
            *mess++ ^= (__force u32)saddr;

            /* plus variable length Initiator Cookie */
            c = (u8 *)mess;
            while (l-- > 0)
                *c++ ^= *hash_location++;

            want_cookie = 0;    /* not our kind of cookie */
            tmp_ext.cookie_out_never = 0; /* false */
            tmp_ext.cookie_plus = tmp_opt.cookie_plus;
        } else if (!tp->rx_opt.cookie_in_always) {
            /* redundant indications, but ensure initialization. */
            tmp_ext.cookie_out_never = 1; /* true */
            tmp_ext.cookie_plus = 0;
        } else {
            goto drop_and_release;
        }
        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

        if (want_cookie && !tmp_opt.saw_tstamp)
            tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
            goto drop_and_free;

        if (!want_cookie || tmp_opt.tstamp_ok)
            TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
            /* 生成sync cookie使用的Initial sequence numnber */
            isn = cookie_v4_init_sequence(sk, skb, &req->mss);
            req->cookie_ts = tmp_opt.tstamp_ok;
        } else if (!isn) {
            struct inet_peer *peer = NULL;
            struct flowi4 fl4;

            /* VJ's idea. We save last timestamp seen
             * from the destination in peer table, when entering
             * state TIME-WAIT, and check against it before
             * accepting new connection request.
             *
             * If "isn" is not zero, this request hit alive
             * timewait bucket, so that all the necessary checks
             * are made in the function processing timewait state.
             */
            /* 还是不懂这块的检查是为了什么。。。*/
            if (tmp_opt.saw_tstamp &&
             tcp_death_row.sysctl_tw_recycle &&
             (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
             fl4.daddr == saddr &&
             (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
                inet_peer_refcheck(peer);
                if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
                 (s32)(peer->tcp_ts - req->ts_recent) >
                                TCP_PAWS_WINDOW) {
                    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                    goto drop_and_release;
                }
            }
            /* Kill the following clause, if you dislike this way. */
            else if (!sysctl_tcp_syncookies &&
                 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                 (sysctl_max_syn_backlog >> 2)) &&
                 (!peer || !peer->tcp_ts_stamp) &&
                 (!dst || !dst_metric(dst, RTAX_RTT))) {
                /* Without syncookies last quarter of
                 * backlog is filled with destinations,
                 * proven to be alive.
                 * It means that we continue to communicate
                 * to destinations, already remembered
                 * to the moment of synflood.
                 */
                LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u
",
                     &saddr, ntohs(tcp_hdr(skb)->source));
                goto drop_and_release;
            }


         /* 生成Initial Sequence Number */

            isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;
        tcp_rsk(req)->snt_synack = tcp_time_stamp;

     /* 回复syn+ack包 */

        if (tcp_v4_send_synack(sk, dst, req,
                 (struct request_values *)&tmp_ext) ||
         want_cookie)
            goto drop_and_free;

     /* 将该request_sock添加到父socket的icsk_accept_queue中的listen_opt上 */

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

    drop_and_release:
        dst_release(dst);
    drop_and_free:
        reqsk_free(req);
    drop:
        return 0;
    }

今天仅仅学习了一下TCP处理第一个SYN包的过程，就发现了很多不明白的地方，还需要继续努力啊。争取早日把TCP的这些细节搞懂。