在报文接收路径上,处理完报文的接收之后,使用函数__tcp_ack_snd_check检查是否需要发送ACK确认报文。如果不符合立即发送的条件,内核将延迟发送ACK确认报文。
1)接收到一个以上的全尺寸报文;
2)a)接收窗口增长足够大,可接收新报文。否则,只有在应用层调用tcp_recvmsg函数取走数据后,发送ACK确认报文,避免窗口满;
b)或者套接口处于QUICK ACK模式;
c)或者接收到了Out-Of-Order数据,需要告知对端;
同时满足条件1)和2)a),或者满足条件2)b)、2)c)中的任意一个,立即回复ACK报文。否则,进入延迟ACK逻辑。
/*
 * Decide whether the ACK for freshly received data is sent immediately or
 * handed to the delayed-ACK path.
 *
 * @sk:           socket whose ACK state is examined
 * @ofo_possible: when non-zero, a non-empty out-of-order queue forces an
 *                immediate ACK
 *
 * The checks are evaluated in the order below and stop at the first one
 * that holds.
 */
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* More than one full frame received, and the right edge of the window
	 * advances far enough (tcp_recvmsg() will send the ACK otherwise).
	 */
	if ((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
	    __tcp_select_window(sk) >= tp->rcv_wnd)
		goto send_now;

	/* Quick-ACK mode: we ACK each frame. */
	if (tcp_in_quickack_mode(sk))
		goto send_now;

	/* We have out-of-order data. */
	if (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))
		goto send_now;

	/* None of the conditions held: send a delayed ACK instead. */
	tcp_send_delayed_ack(sk);
	return;

send_now:
	tcp_send_ack(sk);
}
一、延迟ACK的启动
延迟ACK函数tcp_send_delayed_ack实现如下。如果当前的延迟ACK超时时间(ATO)大于定义的最小时长TCP_DELACK_MIN(40毫秒),对ATO进行调整以保证其值在适当范围内。内核假定最大的ATO值为500毫秒(HZ/2),如果TCP套接口为交互模式的应用所创建,例如telnet/SSH等(即ACK处于pingpong模式),或者ACK的pending设置了ICSK_ACK_PUSHED标志,需要缩短最大ATO的值到宏TCP_DELACK_MAX定义的值(200毫秒)。如果SRTT(Smoothed Round Trip Time)有值的话,检查是否可适度降低ATO值:SRTT右移3位后换算为jiffies值,并取其与最小延迟ACK时间TCP_DELACK_MIN两者中的较大值,如果该值小于之前计算的最大ATO,则使用它作为新的最大ATO。最后,ATO取其当前值与最大ATO两者中的较小值。
#define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* Minimal time to delay before sending an ACK: HZ/25 jiffies (40 ms). */
#define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* Maximal time to delay before sending an ACK: HZ/5 jiffies (200 ms). */
/*
 * Schedule a delayed ACK for @sk (or send the ACK at once when an already
 * armed timer is blocked or about to fire).
 *
 * This first part of the function picks the delay `ato`, in jiffies.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int ato = icsk->icsk_ack.ato;
/* Only an ATO above the minimum needs to be bounded. */
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
/* Default upper bound: half a second. */
int max_ato = HZ / 2;
/* In pingpong mode, or when ICSK_ACK_PUSHED is pending, use the tighter TCP_DELACK_MAX bound. */
if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
/* Slow path, intersegment interval is "high". */
/* If some rtt estimate is known, use it to bound delayed ack.
* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements directly.
*/
if (tp->srtt_us) {
/* srtt_us >> 3 converted to jiffies, but never below TCP_DELACK_MIN. */
int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), TCP_DELACK_MIN);
if (rtt < max_ato)
max_ato = rtt;
}
/* Clamp the ATO to the bound computed above. */
ato = min(ato, max_ato);
}
新的延迟ACK超时定时器的时长为timeout,但是如果在此之前已经启动了一个定时器,并且此定时器已到期但是被阻塞了未能执行ACK发送;或者最多再经过1/4的ATO时间,之前的定时器就要到期,马上发送ACK确认报文。否则,如果当前要启动的定时器的超时时间在之前定时器的超时时间之后,新定时器使用与之前定时器相同的超时时间。
最后,由于有ACK要发送并且要启动延时ACK定时器,设置上两个标志位ICSK_ACK_SCHED和ICSK_ACK_TIMER,函数sk_reset_timer启动定时器,超时时间设置在timeout时刻。
/* NOTE(review): `timeout` is not declared in this excerpt; presumably an unsigned long jiffies value declared at the top of the function — confirm against the full source. */
timeout = jiffies + ato; /* Stay within the limit we were given */
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
/* If delack timer was blocked or is about to expire, send ACK now. */
/* "About to expire" means the armed timeout is no later than now + ato/4. */
if (icsk->icsk_ack.blocked || time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
/* Keep the already armed expiry unless the new one is strictly earlier. */
if (!time_before(timeout, icsk->icsk_ack.timeout))
timeout = icsk->icsk_ack.timeout;
}
/* Record that an ACK is scheduled and the timer is armed, then (re)start the timer. */
icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
二、延迟ACK超时定时器
如以上介绍,函数tcp_send_delayed_ack在选择好ATO超时时间后将启动延迟ACK定时器。另外极端情况下在发送ACK时,有可能遇到内存不足(skb分配失败)的情况,此时将ATO设置为最小值TCP_ATO_MIN,并启动延迟ACK定时器,超时时间设置为TCP_DELACK_MAX。
/*
 * Send an ACK on @sk. Only the buffer allocation and its failure path are
 * shown in this excerpt.
 */
void tcp_send_ack(struct sock *sk)
{
struct sk_buff *buff;
/* We are not putting this on the write queue, so tcp_transmit_skb() will set the ownership to this sock. */
buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (unlikely(!buff)) {
/* Allocation failed: schedule the ACK, reset the ATO to TCP_ATO_MIN and arm the ICSK_TIME_DACK timer with a TCP_DELACK_MAX delay. */
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX);
return;
}
/* NOTE(review): the excerpt ends here; the code that builds and transmits `buff` is not shown. */
}
对于TCP客户端来说,在接收到服务端发来的三次握手中的第二个报文(SYN+ACK)之后,在满足以下任一条件时可不用立即回复ACK报文:a)客户端套接口有数据正等待发送;b)设置了延迟ACCEPT功能;c)ACK处于pingpong模式。但是,内核启动了延迟ACK定时器,时长设置为TCP_DELACK_MAX(200毫秒),以保证ACK确认报文最终能够发送出去。
/*
 * Excerpt of the handler for a segment received on @sk: only the branch that
 * defers the ACK for a segment with the ACK flag set is shown.
 * NOTE(review): `icsk` is not declared in this excerpt and the function body
 * is not closed here — confirm against the full source.
 */
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
if (th->ack) {
/* Defer the ACK when a write is pending, defer-accept is set, or the socket is in pingpong mode. */
if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) {
/* Save one ACK. Data will be ready after several ticks, if write_pending is set.
*
* It may be deleted, but with this feature tcpdumps look so _wonderfully_ clever, that I was not able
* to stand against the temptation 8) --ANK
*/
/* Schedule the ACK and arm the ICSK_TIME_DACK timer with a TCP_DELACK_MAX delay. */
inet_csk_schedule_ack(sk);
tcp_enter_quickack_mode(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
/* Drop the segment and return 0. */
tcp_drop(sk, skb);
return 0;
}
}
超时处理函数tcp_delack_timer_handler如下。如果套接口处于CLOSE或LISTEN状态,或者延迟ACK定时器标志位ICSK_ACK_TIMER并没有设置,退出执行。如果套接口的延迟ACK超时时间在当前时间之后,说明超时时间还没到,重新启动定时器后退出。在进行处理前,清除定时器启动标志ICSK_ACK_TIMER。如果存在已调度待发送的ACK报文,更新ATO时间,发送ACK确认报文:如果套接口的ACK不处在pingpong模式,将ATO增大一倍(但不超过重传超时RTO);否则,套接口ACK策略处在pingpong模式下,关闭pingpong模式,并将超时时间ATO设置为最小值TCP_ATO_MIN。
/*
 * Delayed-ACK timer expiry handler for @sk.
 * NOTE(review): the `out` label and the end of the function are not part of
 * this excerpt.
 */
void tcp_delack_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
/* Nothing to do for CLOSE/LISTEN sockets or when no delayed-ACK timer is flagged. */
if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
/* The recorded expiry is still in the future: re-arm the timer for it and leave. */
if (time_after(icsk->icsk_ack.timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
goto out;
}
/* The timer has fired: clear its flag before handling the ACK. */
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (inet_csk_ack_scheduled(sk)) {
if (!icsk->icsk_ack.pingpong) {
/* Double the ATO, capped at icsk_rto. */
icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); /* Delayed ACK missed: inflate ATO. */
} else {
/* Delayed ACK missed: leave pingpong mode and deflate ATO. */
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_mstamp_refresh(tcp_sk(sk));
tcp_send_ack(sk);
}
在发送ACK确认报文时,使用函数inet_csk_clear_xmit_timer取消延迟ACK定时器。
/*
 * Excerpt of the transmit routine: only the ACK bookkeeping hook is shown.
 * NOTE(review): `tcb` is not declared in this excerpt and no value is
 * returned here — the rest of the function is elided.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{
/* When the outgoing segment carries the ACK flag, account for the ACK being sent. */
if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
}
/*
 * Bookkeeping after an ACK has been sent on @sk.
 *
 * @sk:   socket the ACK was sent on
 * @pkts: packet count passed on to tcp_dec_quickack_mode()
 */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
/* Consume quick-ACK quota for the packets just sent. */
tcp_dec_quickack_mode(sk, pkts);
/* The ACK is out, so the ICSK_TIME_DACK timer is cleared. */
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
三、ACK超时时间更新
在处理完对端数据报文后,函数tcp_event_data_recv执行ATO的调整。在第一次接收到对端数据时,延迟ACK功能还未启动,ACK超时时间ATO为0,因为接收到数据,应当回复ACK确认报文,所以增加QUICK ACK的配额,并且将ATO初始化为最小值TCP_ATO_MIN。
如果并非首次接收到数据报文,本次的接收时间与上次时间的间隔大于重传超时时间RTO,意味着对端重启窗口失败,增加QUICK ACK发送配额,以便快速发送ACK确认报文。如果报文间隔很短,不超过最小ACK超时时间TCP_ATO_MIN的一半,可将ATO时间降低为原ATO时间的一半与TCP_ATO_MIN一半的和。但是如果报文间隔大于TCP_ATO_MIN的一半,并且小于当前ATO值,将ATO值更新为原ATO时间的一半与当前报文间隔的和,更新后的ATO值最大不超过重传超时RTO。
/*
 * Update the delayed-ACK state of @sk after data in @skb has been received.
 * NOTE(review): `tp`, `icsk` and `now` are not declared in this excerpt —
 * confirm their types against the full source.
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
inet_csk_schedule_ack(sk);
tcp_measure_rcv_mss(sk, skb);
tcp_rcv_rtt_measure(tp);
now = tcp_jiffies32;
if (!icsk->icsk_ack.ato) {
/* The _first_ data packet received, initialize delayed ACK engine. */
tcp_incr_quickack(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
/* m: time elapsed since the previously recorded receive time. */
int m = now - icsk->icsk_ack.lrcvtime;
if (m <= TCP_ATO_MIN / 2) {
/* Very short gap: new ATO is half the old ATO plus half of TCP_ATO_MIN. */
icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; /* The fastest case is the first. */
} else if (m < icsk->icsk_ack.ato) {
/* Gap below the current ATO: new ATO is half the old ATO plus the gap, capped at icsk_rto. */
icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
if (icsk->icsk_ack.ato > icsk->icsk_rto)
icsk->icsk_ack.ato = icsk->icsk_rto;
} else if (m > icsk->icsk_rto) {
/* Too long gap. Apparently sender failed to restart window, so that we send ACKs quickly. */
tcp_incr_quickack(sk);
sk_mem_reclaim(sk);
}
}
/* Remember this receive time for the next gap computation. */
icsk->icsk_ack.lrcvtime = now;
/* Only segments of at least 128 bytes are passed to tcp_grow_window(). */
if (skb->len >= 128)
tcp_grow_window(sk, skb);
}
在数据发送路径上,如果发送时间与接收时间的差值小于ATO的时长,即在ATO超时处理函数tcp_delack_timer_handler被调用之前,内核已经回复了ACK报文。设置ACK模式为pingpong。在定时器到期之后,超时处理函数将ATO设置为最小值TCP_ATO_MIN。
/*
 * Congestion state accounting after a packet has been sent.
 *
 * Stamps tp->lsndtime with the current tcp_jiffies32 value. If this send
 * happens less than one ATO after the last recorded receive time, it is
 * treated as a reply to that data and the socket enters pingpong mode.
 */
static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 stamp = tcp_jiffies32;
	u32 since_last_rcv;

	tp->lsndtime = stamp;

	/* 32-bit unsigned difference between now and the last receive time. */
	since_last_rcv = stamp - icsk->icsk_ack.lrcvtime;

	/* If it is a reply for ato after last received packet, enter pingpong mode. */
	if (since_last_rcv < icsk->icsk_ack.ato)
		icsk->icsk_ack.pingpong = 1;
}
QUICKACK模式启用的时候,固定将延迟ACK的超时时间初始化为最小值TCP_ATO_MIN。
/*
 * Put @sk into quick-ACK mode: raise the quick-ACK quota via
 * tcp_incr_quickack(), leave pingpong mode and reset the ATO to TCP_ATO_MIN.
 */
static void tcp_enter_quickack_mode(struct sock *sk)
{
	struct inet_connection_sock *csk = inet_csk(sk);

	tcp_incr_quickack(sk);

	/* Quick-ACK and pingpong are not combined: clear pingpong, start from the minimal ATO. */
	csk->icsk_ack.pingpong = 0;
	csk->icsk_ack.ato = TCP_ATO_MIN;
}
当QUICK ACK额度耗尽,关闭QUICKACK模式时,将ATO时间设置为最小值TCP_ATO_MIN。
/*
 * Consume quick-ACK quota on @sk.
 *
 * @sk:   socket whose quick-ACK counter is updated
 * @pkts: number of packets to subtract from the remaining quota
 *
 * Does nothing when the quota is already zero. When @pkts uses up the whole
 * quota, the counter is zeroed and the ATO is reset to TCP_ATO_MIN.
 */
static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* No quota left: nothing to decrement. */
	if (!icsk->icsk_ack.quick)
		return;

	/* Quota remains after this batch: just subtract. */
	if (pkts < icsk->icsk_ack.quick) {
		icsk->icsk_ack.quick -= pkts;
		return;
	}

	/* Leaving quickack mode we deflate ATO. */
	icsk->icsk_ack.quick = 0;
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}
感谢redwingz博主分享优等文章