Analysis of the TCP-Layer Implementation of the close System Call

When close() is called on a socket and the socket's reference count has dropped to zero, the kernel proceeds into the transport protocol's close implementation, which for TCP is tcp_close(). This article covers only the TCP part; for the earlier stages of the path, see this blog's article on the close() system call.
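
To see the most common of these paths from userspace, here is a minimal sketch (my own illustration, not taken from the post; the loopback address and port are placeholders for any listening server). Closing a connected socket with SO_LINGER enabled and a zero linger time takes the sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime branch of tcp_close() below, so the connection is aborted with a RST instead of going through the FIN handshake:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port   = htons(8080),  /* placeholder: any listening server */
    };

    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("connect");
        return 1;
    }

    /* l_onoff = 1 with l_linger = 0 sets SOCK_LINGER and a zero
     * sk_lingertime on the kernel socket, so close() resets the
     * connection and skips FIN_WAIT/TIME_WAIT entirely. */
    struct linger lg = { .l_onoff = 1, .l_linger = 0 };
    setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));

    close(fd);
    return 0;
}

A packet capture of such a close shows a RST in place of the usual FIN/ACK exchange, and the TCPAbortOnData counter in /proc/net/netstat increases, matching the NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA) call in the code below.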

void tcp_close(struct sock *sk, long timeout)
{
    struct sk_buff *skb;
    int data_was_unread = 0;
    int state;

    lock_sock(sk);
    sk->sk_shutdown = SHUTDOWN_MASK;

    /* Handle the LISTEN state */
    if (sk->sk_state == TCP_LISTEN) {
        /* Move to the CLOSE state */
        tcp_set_state(sk, TCP_CLOSE);

        /* Special case. */
        /* Clean up the accept queue of completed connections */
        inet_csk_listen_stop(sk);

        goto adjudge_to_death;
    }

    /*  We need to flush the recv. buffs.  We do this only on the
     *  descriptor close, not protocol-sourced closes, because the
     *  reader process may not have drained the data yet!
     */
    /* Free the skbs in the receive queue that the user process never read */
    while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
        u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;

        /* A FIN consumes one sequence number; don't count it as data */
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
            len--;
        data_was_unread += len;
        __kfree_skb(skb);
    }

    sk_mem_reclaim(sk);

    /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
    /* Already in the CLOSE state */
    if (sk->sk_state == TCP_CLOSE)
        goto adjudge_to_death;

    /* As outlined in RFC 2525, section 2.17, we send a RST here because
     * data was lost. To witness the awful effects of the old behavior of
     * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
     * GET in an FTP client, suspend the process, wait for the client to
     * advertise a zero window, then kill -9 the FTP client, wheee...
     * Note: timeout is always zero in such a case.
     */
    /* Repair mode: just disconnect */
    if (unlikely(tcp_sk(sk)->repair)) {
        sk->sk_prot->disconnect(sk, 0);
    }
    /* The user process left data unread */
    else if (data_was_unread) {
        /* Unread data was tossed, zap the connection. */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);

        /* Move to CLOSE */
        tcp_set_state(sk, TCP_CLOSE);

        /* Send a RST */
        tcp_send_active_reset(sk, sk->sk_allocation);
    }
    /* SO_LINGER with a zero lingertime: disconnect immediately */
    else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
        /* Check zero linger _after_ checking for unread data. */
        sk->sk_prot->disconnect(sk, 0);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
    }
    /* Normal case: perform the close state transition */
    else if (tcp_close_state(sk)) {
        /* We FIN if the application ate all the data before
         * zapping the connection.
         */

        /* RED-PEN. Formally speaking, we have broken TCP state
         * machine. State transitions:
         *
         * TCP_ESTABLISHED -> TCP_FIN_WAIT1
         * TCP_SYN_RECV    -> TCP_FIN_WAIT1 (forget it, it's impossible)
         * TCP_CLOSE_WAIT -> TCP_LAST_ACK
         *
         * are legal only when FIN has been sent (i.e. in window),
         * rather than queued out of window. Purists blame.
         *
         * F.e. "RFC state" is ESTABLISHED,
         * if Linux state is FIN-WAIT-1, but FIN is still not sent.
         *
         * The visible declinations are that sometimes
         * we enter time-wait state, when it is not required really
         * (harmless), do not send active resets, when they are
         * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
         * they look as CLOSING or LAST_ACK for Linux)
         * Probably, I missed some more holelets.
         *                         --ANK
         * XXX (TFO) - To start off we don't support SYN+ACK+FIN
         * in a single packet! (May consider it later but will
         * probably need API support or TCP_CORK SYN-ACK until
         * data is written and socket is closed.)
         */
        /* Send a FIN */
        tcp_send_fin(sk);
    }

    /* Wait for the close to complete while the state is FIN_WAIT_1,
     * CLOSING or LAST_ACK, or until the sk_lingertime timeout expires */
    sk_stream_wait_close(sk, timeout);
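    /* Note (annotation, not in the kernel source): the timeout above is
     * sk->sk_lingertime, passed in by inet_release() when SO_LINGER is
     * enabled; with a zero timeout, sk_stream_wait_close() returns
     * immediately and close() does not block on the FIN handshake. */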

adjudge_to_death:
    state = sk->sk_state;
    sock_hold(sk);

    /* Mark the socket DEAD and detach it from its owning process */
    sock_orphan(sk);

    /* It is the last release_sock in its life. It will remove backlog. */
    /* Process the control block's backlog and release the socket lock */
    release_sock(sk);

    /* Now socket is owned by kernel and we acquire BH lock
       to finish close. No need to check for user refs.
     */
    local_bh_disable();
    bh_lock_sock(sk);
    WARN_ON(sock_owned_by_user(sk));

    /* Increment the orphan count */
    percpu_counter_inc(sk->sk_prot->orphan_count);
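    /* Note (annotation, not in the kernel source): this orphan count is
     * what tcp_check_oom() below compares against the tcp_max_orphans
     * sysctl when deciding whether to abort the connection. */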

    /* Have we already been destroyed by a softirq or backlog? */
    /* e.g. a RST processed from the backlog in release_sock() above
     * may already have moved the socket to CLOSE */
    if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
        goto out;

    /*    This is a (useful) BSD violating of the RFC. There is a
     *    problem with TCP as specified in that the other end could
     *    keep a socket open forever with no application left this end.
     *    We use a 1 minute timeout (about the same as BSD) then kill
     *    our end. If they send after that then tough - BUT: long enough
     *    that we won't make the old 4*rto = almost no time - whoops
     *    reset mistake.
     *
     *    Nope, it was not mistake. It is really desired behaviour
     *    f.e. on http servers, when such sockets are useless, but
     *    consume significant resources. Let's do it with special
     *    linger2    option.                    --ANK
     */

    if (sk->sk_state == TCP_FIN_WAIT2) {
        struct tcp_sock *tp = tcp_sk(sk);
        /* linger2 < 0: no FIN_WAIT_2 wait at all */
        if (tp->linger2 < 0) {

            /* Move to CLOSE */
            tcp_set_state(sk, TCP_CLOSE);
            /* Send a RST */
            tcp_send_active_reset(sk, GFP_ATOMIC);
            __NET_INC_STATS(sock_net(sk),
                    LINUX_MIB_TCPABORTONLINGER);
        } else {

            /* Get the FIN_WAIT_2 timeout */
            const int tmo = tcp_fin_time(sk);
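            /* Note (annotation, not in the kernel source): tcp_fin_time()
             * uses tp->linger2 (settable via the TCP_LINGER2 socket
             * option) when set, otherwise the tcp_fin_timeout sysctl,
             * and enforces a floor of roughly 3.5 * RTO. */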

            /* The FIN_WAIT_2 timeout exceeds the TIME_WAIT interval:
             * arm the FIN_WAIT_2 (keepalive) timer for the difference */
            if (tmo > TCP_TIMEWAIT_LEN) {
                inet_csk_reset_keepalive_timer(sk,
                        tmo - TCP_TIMEWAIT_LEN);
            }
            /* Otherwise enter TIME_WAIT now, in the FIN_WAIT2 substate */
            else {
                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                goto out;
            }
        }
    }

    /* Not yet in CLOSE */
    if (sk->sk_state != TCP_CLOSE) {
        sk_mem_reclaim(sk);

        /* Too many orphan sockets, or too much memory used by sockets */
        if (tcp_check_oom(sk, 0)) {
            /* Move to CLOSE */
            tcp_set_state(sk, TCP_CLOSE);
            /* Send a RST */
            tcp_send_active_reset(sk, GFP_ATOMIC);
            __NET_INC_STATS(sock_net(sk),
                    LINUX_MIB_TCPABORTONMEMORY);
        }
    }

    /* In CLOSE */
    if (sk->sk_state == TCP_CLOSE) {
        struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
        /* We could get here with a non-NULL req if the socket is
         * aborted (e.g., closed with unread data) before 3WHS
         * finishes.
         */
        if (req)
            reqsk_fastopen_remove(sk, req, false);

        /* Destroy the control block */
        inet_csk_destroy_sock(sk);
    }
    /* Otherwise, socket is reprieved until protocol close. */

out:
    bh_unlock_sock(sk);
    local_bh_enable();
    sock_put(sk);
}
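
For reference, tcp_close_state(), called in the normal close path above, picks the successor state out of a small lookup table and tells the caller whether a FIN must be transmitted. The sketch below is paraphrased from net/ipv4/tcp.c of kernels contemporary with the listing above; treat it as illustrative and consult your own tree for the authoritative version.

#define TCP_STATE_MASK  0xF
#define TCP_ACTION_FIN  (1 << 7)

static const unsigned char new_state[16] = {
    /* current state:     new state:      action:  */
    [0 /* (Invalid) */] = TCP_CLOSE,
    [TCP_ESTABLISHED]   = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
    [TCP_SYN_SENT]      = TCP_CLOSE,
    [TCP_SYN_RECV]      = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
    [TCP_FIN_WAIT1]     = TCP_FIN_WAIT1,
    [TCP_FIN_WAIT2]     = TCP_FIN_WAIT2,
    [TCP_TIME_WAIT]     = TCP_CLOSE,
    [TCP_CLOSE]         = TCP_CLOSE,
    [TCP_CLOSE_WAIT]    = TCP_LAST_ACK | TCP_ACTION_FIN,
    [TCP_LAST_ACK]      = TCP_LAST_ACK,
    [TCP_LISTEN]        = TCP_CLOSE,
    [TCP_CLOSING]       = TCP_CLOSING,
    [TCP_NEW_SYN_RECV]  = TCP_CLOSE,    /* should not happen */
};

static int tcp_close_state(struct sock *sk)
{
    /* Look up the successor of the current state... */
    int next = (int)new_state[sk->sk_state];
    int ns = next & TCP_STATE_MASK;

    tcp_set_state(sk, ns);

    /* ...and report whether a FIN needs to be sent */
    return next & TCP_ACTION_FIN;
}

Only the TCP_ESTABLISHED, TCP_SYN_RECV and TCP_CLOSE_WAIT entries carry TCP_ACTION_FIN, which is why tcp_send_fin() runs exactly for the ESTABLISHED -> FIN_WAIT_1 and CLOSE_WAIT -> LAST_ACK transitions described in the state-machine comment inside tcp_close().
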
Original article: https://www.cnblogs.com/wanpengcoder/p/11751647.html