1、到底哪個是半連接隊列
/**
 * struct listen_sock - listen state
 *
 * @max_qlen_log: log_2 of maximal queued SYNs/REQUESTs
 * @qlen: number of request_socks currently counted in the queue
 * @qlen_young: incremented together with @qlen when a request is queued
 * @hash_rnd: seed handed to inet_synq_hash() when picking a bucket
 * @nr_table_entries: number of buckets in @syn_table
 * @syn_table: hash table of queued request_socks, chained through dl_next
 */
struct listen_sock {
u8 max_qlen_log; /* the queue is reported full once qlen reaches 2^max_qlen_log; this is 10 (2^10 = 1024) when tcp_max_syn_backlog is 1024 */
/* 3 bytes hole, try to use */
int qlen; /* current number of queued requests (SYN queue, not the accept queue) */
int qlen_young;
int clock_hand;
u32 hash_rnd;
u32 nr_table_entries; /* number of syn_table buckets; TCP passes TCP_SYNQ_HSIZE (512) */
struct request_sock *syn_table[0]; /* zero-length array: the buckets are allocated right behind the struct */
};
里面有幾個關鍵的成員變量:max_qlen_log、qlen和syn_table。注意syn_table是一個零長度數組,它本身不占空間,真正的表項是在分配listen_sock時緊跟在結構體后面一并申請的。
跟蹤listen系統調用:
inet_listen
inet_csk_listen_start
reqsk_queue_alloc
在reqsk_queue_alloc中:
const int lopt_size = sizeof(struct listen_sock) +
nr_table_entries * sizeof(struct request_sock *);
struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
我們發現這里進行了分配內存,分配了nr_table_entries個struct request_sock *。
對于nr_table_entries,我們可以往回追蹤:
err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
跟蹤SYN數據包的處理,在tcp_v4_conn_request中,最后調用了inet_csk_reqsk_queue_hash_add函數:
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
inet_csk_reqsk_queue_added(sk, timeout);
}
reqsk_queue_hash_req將新建的request_sock添加到reqsk_queue中:
/*
 * Link @req into bucket @hash of the listener's syn_table.
 *
 * @queue:   request queue whose listen_opt owns the syn_table
 * @hash:    bucket index into listen_opt->syn_table
 * @req:     request being queued
 * @timeout: added to the current jiffies value to form req->expires
 */
static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
u32 hash, struct request_sock *req,
unsigned long timeout)
{
struct listen_sock *lopt = queue->listen_opt;
req->expires = jiffies + timeout;
req->retrans = 0;
/* No child socket yet; reqsk_queue_add() sets req->sk later. */
req->sk = NULL;
/* Chain the request in front of the bucket's current head. */
req->dl_next = lopt->syn_table[hash];
/* Only the store that publishes the new bucket head is done under the write lock. */
write_lock(&queue->syn_wait_lock);
lopt->syn_table[hash] = req;
write_unlock(&queue->syn_wait_lock);
}
inet_csk_reqsk_queue_added增加連接請求隊列的計數,必要時(隊列原來為空)設置定時器:
/*
 * Account for a newly queued request on listening socket @sk.
 *
 * When the request queue was empty before this request, the keepalive
 * timer is reset with @timeout.
 */
static inline void inet_csk_reqsk_queue_added(struct sock *sk,
					      const unsigned long timeout)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const int len_before = reqsk_queue_added(queue);

	if (len_before != 0)
		return;

	/* First pending request: reset the timer. */
	inet_csk_reset_keepalive_timer(sk, timeout);
}
/*
 * Bump the request counters of @queue for one newly queued request.
 *
 * Returns the value qlen had before the increment, so a return of 0
 * means the queue was empty.
 */
static inline int reqsk_queue_added(struct request_sock_queue *queue)
{
	struct listen_sock *listen_opt = queue->listen_opt;
	const int len_before = listen_opt->qlen;

	listen_opt->qlen_young += 1;
	listen_opt->qlen = len_before + 1;

	return len_before;
}
其他的幾個數據結構:
/*
 * Connection-oriented socket state. Embeds struct inet_sock as its first
 * member. For a listening socket, icsk_accept_queue is the
 * request_sock_queue that the reqsk_queue_* helpers operate on.
 */
struct inet_connection_sock {
/* inet_sock has to be the first member! */
struct inet_sock icsk_inet;
/* Request queue: holds listen_opt (syn_table) and the accept list. */
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
__u32 icsk_rto;
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
/* Address-family operations; tcp_check_req() calls syn_recv_sock through this. */
const struct inet_connection_sock_af_ops *icsk_af_ops;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state;
__u8 icsk_retransmits;
__u8 icsk_pending;
__u8 icsk_backoff;
__u8 icsk_syn_retries;
__u8 icsk_probes_out;
__u16 icsk_ext_hdr_len;
struct {
__u8 pending; /* ACK is pending */
__u8 quick; /* Scheduled number of quick acks */
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock */
__u32 ato; /* Predicted tick of soft clock */
unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
} icsk_ack;
struct {
int enabled;
/* Range of MTUs to search */
int search_high;
int search_low;
/* Information on the current probe. */
int probe_size;
} icsk_mtup;
u32 icsk_ca_priv[16];
/* Size of icsk_ca_priv in bytes. */
#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
};
/*
 * Per-listener request queue. It carries both the list of requests that
 * are ready for accept() and, through listen_opt, the syn_table of
 * requests that are still pending.
 */
struct request_sock_queue {
/*
 * Head and tail of the accept list. reqsk_queue_add() appends a
 * request_sock here after tcp_check_req() has unlinked it from the
 * syn_table, i.e. once the three-way handshake has completed.
 */
struct request_sock *rskq_accept_head;
struct request_sock *rskq_accept_tail;
/* Taken for writing when a new head is stored into a syn_table bucket. */
rwlock_t syn_wait_lock;
u8 rskq_defer_accept;
/* 3 bytes hole, try to pack */
struct listen_sock *listen_opt;
};
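因此,半連接隊列在這里可以認為就保存在icsk_accept_queue(連接請求隊列)中:尚未完成三次握手的request_sock掛在listen_opt->syn_table哈希表上,完成三次握手后才被移到rskq_accept_head/rskq_accept_tail鏈表中。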
2、半連接隊列的長度
跟蹤inet_csk_reqsk_queue_is_full,發現它返回queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log:當qlen達到2^max_qlen_log時結果非零,即判定隊列已滿。看來關鍵在于max_qlen_log。
發現reqsk_queue_alloc中:
for (lopt->max_qlen_log = 6; /*64*/
(1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
lopt->max_qlen_log++);
我們可以通過/proc/sys/net/ipv4/tcp_max_syn_backlog設置max_syn_backlog,這個就是我們可以設置的半連接隊列的長度。
默認是1024,那么max_qlen_log就是10了;假如我們設置成64,那么max_qlen_log就是6了;設置成128,就是7了;其他的依次類推。
3、連接請求的數據流向
在前面的分析中,SYN數據包的處理中,接收到SYN數據包,將會建立一個request_sock結構,添加到syn_table哈希表相應的鏈表中。
接收到ACK數據包后,跟蹤tcp_v4_do_rcv,發現會調用tcp_v4_hnd_req。
在tcp_v4_hnd_req中:
/* Check whether the request_sock is already in the syn_table.
If it is, call tcp_check_req(). */
/* For the ACK that completes the three-way handshake, a request_sock is found in the syn_table and tcp_check_req() is called. */
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
iph->saddr, iph->daddr);
/*Normal: Call syn_recv_sock function(tcp_v4_syn_recv_sock)*/
if (req)
return tcp_check_req(sk, skb, req, prev);
在tcp_check_req中:
/*ipv4_specific.syn_recv_sock = tcp_v4_syn_recv_sock*/
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
req, NULL);
if (child == NULL)
goto listen_overflow;
/* Move the request_sock from the syn_table to the accept queue.
Note: syn_table isn't an hlist_head structure. */
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
return child;
tcp_v4_syn_recv_sock會根據request_sock新建一個sock結構,并且進行一定的初始化,返回新建的sock結構。
將request_sock從syn_table中移到accept_queue中。
/*
 * Append @req, together with its newly created @child socket, to the
 * accept list of listening socket @sk.
 */
static inline void inet_csk_reqsk_queue_add(struct sock *sk,
					    struct request_sock *req,
					    struct sock *child)
{
	struct request_sock_queue *accept_queue = &inet_csk(sk)->icsk_accept_queue;

	reqsk_queue_add(accept_queue, req, sk, child);
}
/*
 * Append @req to the tail of the accept list of @queue.
 *
 * @queue:  request queue of the listening socket
 * @req:    request that has just been unlinked from the syn_table
 * @parent: listening socket; its accept backlog counter is incremented
 * @child:  socket created for this connection, stored in req->sk
 */
static inline void reqsk_queue_add(struct request_sock_queue *queue,
				   struct request_sock *req,
				   struct sock *parent,
				   struct sock *child)
{
	req->sk = child;
	/* One more connection is now waiting on the parent's accept backlog. */
	sk_acceptq_added(parent);

	if (queue->rskq_accept_head == NULL)
		queue->rskq_accept_head = req;
	else
		queue->rskq_accept_tail->dl_next = req;

	queue->rskq_accept_tail = req;
	req->dl_next = NULL;
}
4、accept系統調用的處理
三次握手之后,request_sock已經在rskq_accept隊列中了,等待accept系統調用取走。
/*
 * Decrement the accept backlog counter of listening socket @sk.
 * Called when a connection is taken off the accept list.
 */
static inline void sk_acceptq_removed(struct sock *sk)
{
	sk->sk_ack_backlog--;
}
/*
 * Increment the accept backlog counter of listening socket @sk.
 * Called when a connection is appended to the accept list.
 */
static inline void sk_acceptq_added(struct sock *sk)
{
	++sk->sk_ack_backlog;
}
這個時候,我們關注一個struct sock中的兩個變量:
unsigned short sk_ack_backlog; /* number of connections that have completed the three-way handshake and are still waiting to be taken by accept() */
unsigned short sk_max_ack_backlog; /* limit that sk_ack_backlog is checked against; assigned from the backlog argument in inet_listen() */
其中,sk_ack_backlog是已經完成了三次握手,但是還沒有被accept系統調用處理的連接請求數量;sk_max_ack_backlog就是我們熟悉的listen系統調用的backlog參數。
跟蹤accept系統調用:
inet_csk_accept:
newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
/*
 * Take the next request off the accept list of @queue and hand back
 * its child socket.
 *
 * The accept backlog counter of @parent is decremented and the
 * request_sock itself is freed; only the child socket is returned.
 */
static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue,
						 struct sock *parent)
{
	struct request_sock *accepted = reqsk_queue_remove(queue);
	struct sock *child;

	child = accepted->sk;
	BUG_TRAP(child != NULL);

	sk_acceptq_removed(parent);
	/* The request_sock is no longer needed once the child is handed over. */
	__reqsk_free(accepted);

	return child;
}
注意這里free掉了在三次握手中建立的request_sock結構。
5、防止溢出的兩個鏈表檢查
在tcp_v4_conn_request中,對SYN包的處理過程中:
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
} else
#endif
goto drop;
}
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
/*If Accept Queue is full, Drop the packet*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
這里面有兩個隊列的檢查:request_sock隊列和accept隊列。
request_sock隊列:
/*
 * Report whether the request queue of listening socket @sk is full.
 * Returns non-zero when full, 0 otherwise.
 */
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
	const struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;

	return reqsk_queue_is_full(queue);
}
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}
accept隊列:
/*
 * Report whether the accept backlog of @sk exceeds its configured limit.
 * Returns 1 when sk_ack_backlog is strictly greater than
 * sk_max_ack_backlog, 0 otherwise.
 */
static inline int sk_acceptq_is_full(struct sock *sk)
{
	return sk->sk_max_ack_backlog < sk->sk_ack_backlog;
}
其中關系到4個變量,其中兩個是sock的成員變量,兩個是request_sock_queue中listen_opt的變量。
max_qlen_log的初始化:
在reqsk_queue_alloc中:
for (lopt->max_qlen_log = 6; /*64*/
(1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
lopt->max_qlen_log++);
sk_max_ack_backlog的初始化:
在inet_listen中:
sk->sk_max_ack_backlog = backlog;
注:sk_max_ack_backlog就是我們熟悉的listen系統調用的backlog參數。
qlen的增加:
tcp_v4_conn_request
inet_csk_reqsk_queue_hash_add
inet_csk_reqsk_queue_added
reqsk_queue_added
注:跟蹤SYN數據包的處理,在tcp_v4_conn_request中,最后調用了inet_csk_reqsk_queue_hash_add函數:
inet_csk_reqsk_queue_added(sk, timeout);
inet_csk_reqsk_queue_added增加連接請求隊列的計數,必要時(隊列原來為空)設置定時器。
reqsk_queue_added:
lopt->qlen++;
qlen的減少:
tcp_v4_hnd_req
tcp_check_req
inet_csk_reqsk_queue_removed
reqsk_queue_removed
注:
在inet_csk_listen_stop中:
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
* or to send active reset (abort).
* Certainly, it is pretty dangerous while synflood, but it is
* bad justification for our negligence 8)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
reqsk_queue_destroy(&icsk->icsk_accept_queue);
sk_ack_backlog的增加:
tcp_check_req
inet_csk_reqsk_queue_add
reqsk_queue_add
sk_acceptq_added
sk_ack_backlog的減少:
inet_csk_accept
reqsk_queue_get_child
sk_acceptq_removed