From 522e7548a9bd40305df41c0beae69448b7620d6b Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 22:54:52 -0800 Subject: [TCP] FRTO: Incorrectly clears TCPCB_EVER_RETRANS bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FRTO was slightly too brave... Should only clear TCPCB_SACKED_RETRANS bit. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a14191687a..b21e232d5d3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1266,7 +1266,7 @@ void tcp_enter_frto(struct sock *sk) tp->undo_retrans = 0; sk_stream_for_retrans_queue(skb, sk) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS; + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; } tcp_sync_left_out(tp); -- cgit From 9ead9a1d385ae2c52a6dcf2828d84ce66be04fc2 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 22:56:19 -0800 Subject: [TCP] FRTO: Separated response from FRTO detection algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FRTO spurious RTO detection algorithm (RFC4138) does not include response to a detected spurious RTO but can use different response algorithms. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b21e232d5d3..c5be3d0465f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2467,6 +2467,15 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, return flag; } +/* A very conservative spurious RTO response algorithm: reduce cwnd and + * continue in congestion avoidance. + */ +static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tcp_moderate_cwnd(tp); +} + static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); @@ -2488,12 +2497,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) */ tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; } else { - /* Also the second ACK after RTO advances the window. - * The RTO was likely spurious. Reduce cwnd and continue - * in congestion avoidance - */ - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tcp_moderate_cwnd(tp); + tcp_conservative_spur_to_response(tp); } /* F-RTO affects on two new ACKs following RTO. -- cgit From bdaae17da81db79b9aa4dfbf43305cfeef64f6a8 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 22:59:58 -0800 Subject: [TCP] FRTO: Moved tcp_use_frto from tcp.h to tcp_input.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In addition, removed inline. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5be3d0465f..fe96e176d85 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1236,6 +1236,19 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } +int tcp_use_frto(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + /* F-RTO must be activated in sysctl and there must be some + * unsent new data, and the advertised window should allow + * sending it. + */ + return (sysctl_tcp_frto && sk->sk_send_head && + !after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + tp->snd_una + tp->snd_wnd)); +} + /* RTO occurred, but do not yet enter loss state. Instead, transmit two new * segments to see from the next ACKs whether any data was really missing. * If the RTO was spurious, new ACKs should arrive. -- cgit From 30935cf4f915c3178ce63331d6ff4c82163e26af Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:01:36 -0800 Subject: [TCP] FRTO: Comment cleanup & improvement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moved comments out from the body of process_frto() to the head (the preferred way; see Documentation/CodingStyle). Bonus: it's much easier to read in this compacted form. The FRTO algorithm and implementation are described in greater detail. For the interested reader, more information is available in RFC4138. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fe96e176d85..561e5d40498 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1236,22 +1236,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } +/* F-RTO can only be used if these conditions are satisfied: + * - there must be some unsent new data + * - the advertised window should allow sending it + */ int tcp_use_frto(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - /* F-RTO must be activated in sysctl and there must be some - * unsent new data, and the advertised window should allow - * sending it. - */ return (sysctl_tcp_frto && sk->sk_send_head && !after(TCP_SKB_CB(sk->sk_send_head)->end_seq, tp->snd_una + tp->snd_wnd)); } -/* RTO occurred, but do not yet enter loss state. Instead, transmit two new - * segments to see from the next ACKs whether any data was really missing. - * If the RTO was spurious, new ACKs should arrive. +/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO + * recovery a bit and use heuristics in tcp_process_frto() to detect if + * the RTO was spurious. */ void tcp_enter_frto(struct sock *sk) { @@ -2489,6 +2489,30 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) tcp_moderate_cwnd(tp); } +/* F-RTO spurious RTO detection algorithm (RFC4138) + * + * F-RTO affects during two new ACKs following RTO. State (ACK number) is kept + * in frto_counter. When ACK advances window (but not to or beyond highest + * sequence sent before RTO): + * On First ACK, send two new segments out. + * On Second ACK, RTO was likely spurious. Do spurious response (response + * algorithm is not part of the F-RTO detection algorithm + * given in RFC4138 but can be selected separately). 
+ * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss + * and TCP falls back to conventional RTO recovery. + * + * Rationale: if the RTO was spurious, new ACKs should arrive from the + * original window even after we transmit two new data segments. + * + * F-RTO is implemented (mainly) in four functions: + * - tcp_use_frto() is used to determine if TCP is can use F-RTO + * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is + * called when tcp_use_frto() showed green light + * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm + * - tcp_enter_frto_loss() is called if there is not enough evidence + * to prove that the RTO is indeed spurious. It transfers the control + * from F-RTO to the conventional RTO recovery + */ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); @@ -2497,25 +2521,16 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) if (tp->snd_una == prior_snd_una || !before(tp->snd_una, tp->frto_highmark)) { - /* RTO was caused by loss, start retransmitting in - * go-back-N slow start - */ tcp_enter_frto_loss(sk); return; } if (tp->frto_counter == 1) { - /* First ACK after RTO advances the window: allow two new - * segments out. - */ tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; - } else { + } else /* frto_counter == 2 */ { tcp_conservative_spur_to_response(tp); } - /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavior is back to normal. - */ tp->frto_counter = (tp->frto_counter + 1) % 3; } -- cgit From 7487c48c4fd15d1e2542be1183b783562cfe10bc Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:02:30 -0800 Subject: [TCP] FRTO: Consecutive RTOs keep prior_ssthresh and ssthresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case a latency spike causes more than one RTO, the latter should not cause the already reduced ssthresh to propagate into the prior_ssthresh, since FRTO declares all such RTOs spurious at once or none of them. In the treatment of ssthresh, we mimic what tcp_enter_loss() does. The previous state (in frto_counter) must be available until we have checked it in tcp_enter_frto(), as must the ACK information flag in process_frto(). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 561e5d40498..194e880af51 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1252,6 +1252,10 @@ int tcp_use_frto(const struct sock *sk) /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO * recovery a bit and use heuristics in tcp_process_frto() to detect if * the RTO was spurious. + * + * Do like tcp_enter_loss() would; when RTO expires the second time it + * does: + * "Reduce ssthresh if it has not yet been made inside this window." 
*/ void tcp_enter_frto(struct sock *sk) { @@ -1259,11 +1263,10 @@ void tcp_enter_frto(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - tp->frto_counter = 1; - - if (icsk->icsk_ca_state <= TCP_CA_Disorder || + if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || tp->snd_una == tp->high_seq || - (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && + !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_FRTO); @@ -1285,6 +1288,7 @@ void tcp_enter_frto(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Open); tp->frto_highmark = tp->snd_nxt; + tp->frto_counter = 1; } /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, @@ -2513,12 +2517,16 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ -static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) +static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) { struct tcp_sock *tp = tcp_sk(sk); tcp_sync_left_out(tp); + /* Duplicate the behavior from Loss state (fastretrans_alert) */ + if (flag&FLAG_DATA_ACKED) + inet_csk(sk)->icsk_retransmits = 0; + if (tp->snd_una == prior_snd_una || !before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk); @@ -2607,7 +2615,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) - tcp_process_frto(sk, prior_snd_una); + tcp_process_frto(sk, prior_snd_una, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ -- cgit From 7b0eb22b1d3b049306813a4aaa52966650f7491c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:03:35 -0800 Subject: [TCP] FRTO: Use Disorder state during operation instead of Open MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Retransmission counter assumptions are to be changed, and forcing reasons to do this exist: using the sysctl in the check would be racy as soon as FRTO starts to ignore some ACKs (done in the following patches), since userspace may disable it at any moment, giving a nice oops if the timing is right. frto_counter would be inaccessible from userspace, but with SACK-enhanced FRTO, retrans_out can include segments other than the head, possibly leaving it non-zero after a spurious RTO; boom again. Luckily, the solution seems rather simple: never go directly to the Open state, but use Disorder instead. This does not really change much, since TCP could anyway change its state to Disorder during FRTO via the path tcp_fastretrans_alert -> tcp_try_to_open (e.g., when a SACK block makes the ACK dubious). Besides, Disorder seems to be the state where TCP should be when it is not recovering (in the Recovery or Loss state) while still having some retransmissions in flight (see tcp_try_to_open), which is exactly what happens with FRTO. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller
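A minimal user-space sketch of that Disorder rationale (frto_operating_state is a hypothetical helper, not a kernel function; it only mirrors the tcp_try_to_open reasoning cited above):

#include <stdio.h>

enum ca_state { CA_OPEN, CA_DISORDER, CA_RECOVERY, CA_LOSS };

/* Sketch only: when a connection is not recovering but still has
 * retransmissions in flight, Disorder is the state tcp_try_to_open()
 * would pick anyway, so F-RTO parks the connection there instead of
 * Open. */
static enum ca_state frto_operating_state(unsigned int retrans_out)
{
	return retrans_out ? CA_DISORDER : CA_OPEN;
}

int main(void)
{
	printf("%d\n", frto_operating_state(1));	/* 1 == CA_DISORDER */
	return 0;
}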
--- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 194e880af51..e806839acdd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1286,7 +1286,8 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); - tcp_set_ca_state(sk, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Disorder); + tp->high_seq = tp->snd_nxt; tp->frto_highmark = tp->snd_nxt; tp->frto_counter = 1; } @@ -2014,8 +2015,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - if (!sysctl_tcp_frto) - BUG_TRAP(tp->retrans_out == 0); + BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { -- cgit From 6408d206c7484615ecae54bf6474a02c94e9e862 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:04:11 -0800 Subject: [TCP] FRTO: Ignore some uninteresting ACKs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This handles an RFC4138 shortcoming (in step 2); it should also have a case c) that ignores ACKs which are neither duplicates nor advance the window (opposite-direction data, window update). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e806839acdd..e990d562f5e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2495,9 +2495,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) /* F-RTO spurious RTO detection algorithm (RFC4138) * - * F-RTO affects during two new ACKs following RTO. State (ACK number) is kept - * in frto_counter. When ACK advances window (but not to or beyond highest - * sequence sent before RTO): + * F-RTO affects during two new ACKs following RTO (well, almost, see inline + * comments). State (ACK number) is kept in frto_counter. When ACK advances + * window (but not to or beyond highest sequence sent before RTO): + * On First ACK, send two new segments out. + * On Second ACK, RTO was likely spurious. Do spurious response (response * algorithm is not part of the F-RTO detection algorithm @@ -2527,6 +2527,13 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (flag&FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; + /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't + * duplicate nor advances window, e.g., opposite dir data, winupdate + */ + if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && + !(flag&FLAG_FORWARD_PROGRESS)) + return; + if (tp->snd_una == prior_snd_una || !before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk); return; } -- cgit From 95c4922bf9330eb2c71b752359dd89c4e166f3c5 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:05:18 -0800 Subject: [TCP] FRTO: fixes fallback to conventional recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FRTO detection did not care how the ACK pattern affects the cwnd calculation of the conventional recovery. This caused an incorrect setting of cwnd when the fallback became necessary. 
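As a rough illustration of the fallback rule this change introduces, here is a user-space sketch (frto_allowed_segments is a hypothetical helper; the returned values mirror the patch below):

#include <stdio.h>

/* Sketch of the segment allowance that tcp_enter_frto_loss() receives:
 * the new cwnd becomes packets-in-flight plus this many segments. */
static int frto_allowed_segments(int frto_counter, int data_acked,
				 int at_or_beyond_highmark)
{
	if (at_or_beyond_highmark)	/* snd_una reached frto_highmark */
		return frto_counter + 1;
	if (!data_acked)		/* duplicate ACK right after RTO */
		return frto_counter == 1 ? 0 : 3;
	return -1;			/* no fallback on this ACK */
}

int main(void)
{
	/* Duplicate ACK as the first ACK after RTO: no extra segments,
	 * matching what conservative conventional recovery would send. */
	printf("%d\n", frto_allowed_segments(1, 0, 0));	/* prints 0 */
	return 0;
}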
The knowledge tcp_process_frto() has about the incoming ACK is now passed on to tcp_enter_frto_loss() in allowed_segments parameter that gives the number of segments that must be added to packets-in-flight while calculating the new cwnd. Instead of snd_una we use FLAG_DATA_ACKED in duplicate ACK detection because RFC4138 states (in Section 2.2): If the first acknowledgment after the RTO retransmission does not acknowledge all of the data that was retransmitted in step 1, the TCP sender reverts to the conventional RTO recovery. Otherwise, a malicious receiver acknowledging partial segments could cause the sender to declare the timeout spurious in a case where data was lost. If the next ACK after RTO is duplicate, we do not retransmit anything, which is equal to what conservative conventional recovery does in such case. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e990d562f5e..cc935c8a6aa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1296,7 +1296,7 @@ void tcp_enter_frto(struct sock *sk) * which indicates that we should follow the traditional RTO recovery, * i.e. mark everything lost and do go-back-N retransmission. */ -static void tcp_enter_frto_loss(struct sock *sk) +static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1326,7 +1326,7 @@ static void tcp_enter_frto_loss(struct sock *sk) } tcp_sync_left_out(tp); - tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1; + tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->undo_marker = 0; @@ -2527,6 +2527,11 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (flag&FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; + if (!before(tp->snd_una, tp->frto_highmark)) { + tcp_enter_frto_loss(sk, tp->frto_counter + 1); + return; + } + /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't * duplicate nor advances window, e.g., opposite dir data, winupdate */ @@ -2534,9 +2539,8 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) !(flag&FLAG_FORWARD_PROGRESS)) return; - if (tp->snd_una == prior_snd_una || - !before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk); + if (!(flag&FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3)); return; } -- cgit From aa8b6a7ad147dfbaaf10368ff15df9418b670d8b Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:06:03 -0800 Subject: [TCP] FRTO: Response should reset also snd_cwnd_cnt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since purpose is to reduce CWND, we prevent immediate growth. This is not a major issue nor is "the correct way" specified anywhere. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cc935c8a6aa..924b2e6d7d1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2490,6 +2490,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_cnt = 0; tcp_moderate_cwnd(tp); } -- cgit From 52c63f1e86ebb18ef4b710b5b647e552a041e5ca Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:06:52 -0800 Subject: [TCP]: Don't enter to fast recovery while using FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because TCP is not in Loss state during FRTO recovery, fast recovery could be triggered by accident. Non-SACK FRTO is more robust than the not-yet-included SACK-enhanced version (which can receive a high number of duplicate ACKs with SACK blocks during FRTO), at least with unidirectional transfers, but under extraordinary patterns fast recovery can be incorrectly triggered, e.g., data loss + ACK losses => a cumulative ACK with enough SACK blocks to meet the sacked_out >= dupthresh condition. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 924b2e6d7d1..7213740477e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1547,6 +1547,10 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) { __u32 packets_out; + /* Do not perform any recovery during FRTO algorithm */ + if (tp->frto_counter) + return 0; + /* Trick#1: The loss is proven. */ if (tp->lost_out) return 1; -- cgit From 94d0ea7786714d78d7cb73144bb850254dd0bb78 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:07:27 -0800 Subject: [TCP] FRTO: frto_counter modulo-op converted to two assignments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7213740477e..9dc5754141e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2551,11 +2551,11 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (tp->frto_counter == 1) { tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + tp->frto_counter = 2; } else /* frto_counter == 2 */ { tcp_conservative_spur_to_response(tp); + tp->frto_counter = 0; } - - tp->frto_counter = (tp->frto_counter + 1) % 3; } /* This routine deals with incoming acks, but not outgoing ones. */ -- cgit From 7c9a4a5b67926dd186d427bc5b9fce6ccbde154c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:08:34 -0800 Subject: [TCP]: Prevent unrelated cwnd adjustment while using FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FRTO controls cwnd while it still processes the ACK input or has just reverted back to conventional RTO recovery; the normal rules apply once FRTO has handed control back to standard congestion control. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller
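A user-space sketch of the gating this patch adds to tcp_ack() (ack_cwnd_step and cong_avoid are hypothetical stand-ins; the real change is in the diff below):

#include <stdio.h>

static void cong_avoid(void)
{
	puts("advance cwnd");
}

/* Sketch only: tcp_process_frto() now returns nonzero while F-RTO
 * still controls cwnd, and tcp_ack() skips the normal congestion
 * avoidance step for such ACKs. */
static void ack_cwnd_step(int data_acked, int frto_cwnd)
{
	if (data_acked && !frto_cwnd)
		cong_avoid();
}

int main(void)
{
	ack_cwnd_step(1, 1);	/* F-RTO in control: cwnd untouched */
	ack_cwnd_step(1, 0);	/* normal case: cwnd advanced */
	return 0;
}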
--- net/ipv4/tcp_input.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9dc5754141e..723cee63791 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2522,7 +2522,7 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ -static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) +static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2534,7 +2534,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, tp->frto_counter + 1); - return; + return 1; } /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't @@ -2542,20 +2542,22 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) */ if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && !(flag&FLAG_FORWARD_PROGRESS)) - return; + return 1; if (!(flag&FLAG_DATA_ACKED)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3)); - return; + return 1; } if (tp->frto_counter == 1) { tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; + return 1; } else /* frto_counter == 2 */ { tcp_conservative_spur_to_response(tp); tp->frto_counter = 0; } + return 0; } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -2569,6 +2571,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 prior_in_flight; s32 seq_rtt; int prior_packets; + int frto_cwnd = 0; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -2631,15 +2634,16 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) - tcp_process_frto(sk, prior_snd_una, flag); + frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && + tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); } -- cgit From 46d0de4ed92650b95f27acae09914996bbe624e7 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:10:39 -0800 Subject: [TCP] FRTO: Entry is allowed only during (New)Reno like recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This interpretation comes from RFC4138: "If the sender implements some loss recovery algorithm other than Reno or NewReno [FHG04], the F-RTO algorithm SHOULD NOT be entered when earlier fast recovery is underway." I think the RFC means to say (especially in the light of Appendix B) that ...recovery is underway (not just fast recovery) or was underway when it was interrupted by an earlier (F-)RTO that hasn't yet been resolved (snd_una has not advanced enough). Thus, my interpretation is that whenever TCP has ever retransmitted anything other than the head, the basic version cannot be used, because the ordering assumptions on which FRTO is based then do not hold.
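A user-space sketch of that head-only rule (basic_frto_allowed and its arrays are hypothetical stand-ins for the skb flags; the real queue walk appears in the diff below):

#include <stdio.h>
#include <stddef.h>

/* Sketch only: ever_retrans[i] stands for TCPCB_RETRANS on segment i,
 * sacked[i] for TCPCB_SACKED_ACKED; index 0 is the head. */
static int basic_frto_allowed(const unsigned char *ever_retrans,
			      const unsigned char *sacked, size_t n)
{
	size_t i;

	for (i = 1; i < n; i++) {	/* the head itself may be rexmitted */
		if (ever_retrans[i])
			return 0;	/* ordering assumptions broken */
		if (!sacked[i])
			break;		/* first non-SACKed segment: done */
	}
	return 1;
}

int main(void)
{
	unsigned char retrans[] = { 1, 0, 1, 0 };
	unsigned char sacked[]  = { 0, 1, 1, 0 };

	printf("%d\n", basic_frto_allowed(retrans, sacked, 4)); /* prints 0 */
	return 0;
}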
With NewReno, only the head segment is retransmitted at a time. Therefore, walk up to the first segment that has not been SACKed; if that segment has not been retransmitted, nor anything before it, we know for sure that nothing after the non-SACKed segment has been either. This assumption is valid because TCPCB_EVER_RETRANS does not leave holes but each non-SACKed segment is rexmitted in order. The check for retrans_out > 1 avoids a more expensive walk through the skb list, as we know the result beforehand: F-RTO will not be allowed. A SACKed skb can turn into a non-SACKed one only in the extremely rare case of SACK reneging; in this case we might fail to detect retransmissions if there were any for segments other than the head. To get rid of that problem, the whole rexmit queue would have to be walked (always), or FRTO would have to be prevented when SACK reneging happens. Of course, the RTO should still trigger after reneging, which makes this issue even less likely to show up. And as long as the response is as conservative as it is now, nothing bad happens even then. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 723cee63791..a283fc12186 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1239,14 +1239,31 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* F-RTO can only be used if these conditions are satisfied: * - there must be some unsent new data * - the advertised window should allow sending it + * - TCP has never retransmitted anything other than head */ -int tcp_use_frto(const struct sock *sk) +int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!sysctl_tcp_frto || !sk->sk_send_head || + after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + tp->snd_una + tp->snd_wnd)) + return 0; - return (sysctl_tcp_frto && sk->sk_send_head && - !after(TCP_SKB_CB(sk->sk_send_head)->end_seq, - tp->snd_una + tp->snd_wnd)); + /* Avoid expensive walking of rexmit queue if possible */ + if (tp->retrans_out > 1) + return 0; + + skb = skb_peek(&sk->sk_write_queue)->next; /* Skips head */ + sk_stream_for_retrans_queue_from(skb, sk) { + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + return 0; + /* Short-circuit when first non-SACKed skb has been checked */ + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + break; + } + return 1; } /* RTO occurred, but do not yet enter -- cgit From d1a54c6a0a3f9c2c4ef71982d89b8571bd9eaa51 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:11:57 -0800 Subject: [TCP] FRTO: Reverse RETRANS bit clearing logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously RETRANS bits were cleared on the entry to FRTO. We postpone that into tcp_enter_frto_loss, which is really the place where the clearing should be done anyway. This allows simplification of the logic from a clearing loop to the head skb clearing only. Besides, the other changes made in the previous patches to tcp_use_frto made it impossible for the non-SACKed FRTO to be entered if anything other than the head has been rexmitted. With SACK-enhanced FRTO (and Appendix B), however, there can be a number of retransmissions in flight when RTO expires (the same thing could also happen with non-SACK FRTO before this patchset). 
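In rough terms, the entry-time side of this change looks like the following user-space sketch (hypothetical types; only the head segment's SACKED_RETRANS bit is cleared, with retrans_out kept consistent, as the diff below does):

#include <stdio.h>

#define TCPCB_SACKED_RETRANS 0x02	/* illustrative value only */

struct seg {
	unsigned char sacked;
	unsigned int pcount;
};

/* Sketch only: clear SACKED_RETRANS on the head and decrement
 * retrans_out to match; the remaining bits are left for
 * tcp_enter_frto_loss(). */
static void frto_clear_head_retrans(struct seg *head,
				    unsigned int *retrans_out)
{
	if (head->sacked & TCPCB_SACKED_RETRANS) {
		head->sacked &= ~TCPCB_SACKED_RETRANS;
		*retrans_out -= head->pcount;
	}
}

int main(void)
{
	struct seg head = { TCPCB_SACKED_RETRANS, 2 };
	unsigned int retrans_out = 3;

	frto_clear_head_retrans(&head, &retrans_out);
	printf("retrans_out=%u\n", retrans_out);	/* prints 1 */
	return 0;
}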
To not introduce any jumpiness into the packet counting during FRTO, instead of clearing RETRANS bits from skbs during entry, do it later on. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a283fc12186..3ef7e9e0796 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1268,7 +1268,11 @@ int tcp_use_frto(struct sock *sk) /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO * recovery a bit and use heuristics in tcp_process_frto() to detect if - * the RTO was spurious. + * the RTO was spurious. Only clear SACKED_RETRANS of the head here to + * keep retrans_out counting accurate (with SACK F-RTO, other than head + * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS + * bits are handled if the Loss state is really to be entered (in + * tcp_enter_frto_loss). * * Do like tcp_enter_loss() would; when RTO expires the second time it * does: @@ -1289,17 +1293,13 @@ void tcp_enter_frto(struct sock *sk) tcp_ca_event(sk, CA_EVENT_FRTO); } - /* Have to clear retransmission markers here to keep the bookkeeping - * in shape, even though we are not yet in Loss state. - * If something was really lost, it is eventually caught up - * in tcp_enter_frto_loss. - */ - tp->retrans_out = 0; tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; - sk_stream_for_retrans_queue(skb, sk) { + skb = skb_peek(&sk->sk_write_queue); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); } tcp_sync_left_out(tp); @@ -1313,7 +1313,7 @@ void tcp_enter_frto(struct sock *sk) * which indicates that we should follow the traditional RTO recovery, * i.e. mark everything lost and do go-back-N retransmission. */ -static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments) +static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1322,10 +1322,21 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments) tp->sacked_out = 0; tp->lost_out = 0; tp->fackets_out = 0; + tp->retrans_out = 0; sk_stream_for_retrans_queue(skb, sk) { cnt += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + /* + * Count the retransmission made on RTO correctly (only when + * waiting for the first ACK and did not get it)... + */ + if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { + tp->retrans_out += tcp_skb_pcount(skb); + /* ...enter this if branch just for the first segment */ + flag |= FLAG_DATA_ACKED; + } else { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + } if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { /* Do not mark those segments lost that were @@ -2550,7 +2561,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) inet_csk(sk)->icsk_retransmits = 0; if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1); + tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); return 1; } @@ -2562,7 +2573,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; if (!(flag&FLAG_DATA_ACKED)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3)); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
0 : 3), flag); return 1; } -- cgit From 66e93e45c09affa407750cc06398492e8b897848 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:13:47 -0800 Subject: [TCP] FRTO: Fake cwnd for ssthresh callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCP without FRTO would be in Loss state with small cwnd. FRTO, however, leaves cwnd (typically) to a larger value which causes ssthresh to become too large in case RTO is triggered again compared to what conventional recovery would do. Because consecutive RTOs result in only a single ssthresh reduction, RTO+cumulative ACK+RTO pattern is required to trigger this event. A large comment is included for congestion control module writers trying to figure out what CA_EVENT_FRTO handler should do because there exists a remote possibility of incompatibility between FRTO and module defined ssthresh functions. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ef7e9e0796..055721d8495 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1289,7 +1289,31 @@ void tcp_enter_frto(struct sock *sk) ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + /* Our state is too optimistic in ssthresh() call because cwnd + * is not reduced until tcp_enter_frto_loss() when previous FRTO + * recovery has not yet completed. Pattern would be this: RTO, + * Cumulative ACK, RTO (2xRTO for the same segment does not end + * up here twice). + * RFC4138 should be more specific on what to do, even though + * RTO is quite unlikely to occur after the first Cumulative ACK + * due to back-off and complexity of triggering events ... + */ + if (tp->frto_counter) { + u32 stored_cwnd; + stored_cwnd = tp->snd_cwnd; + tp->snd_cwnd = 2; + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = stored_cwnd; + } else { + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + } + /* ... in theory, cong.control module could do "any tricks" in + * ssthresh(), which means that ca_state, lost bits and lost_out + * counter would have to be faked before the call occurs. We + * consider that too expensive, unlikely and hacky, so modules + * using these in ssthresh() must deal these incompatibility + * issues if they receives CA_EVENT_FRTO and frto_counter != 0 + */ tcp_ca_event(sk, CA_EVENT_FRTO); } -- cgit From 288035f915686a9a9e85e0358c5392bb5d7ae58d Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:14:42 -0800 Subject: [TCP]: Prevent reordering adjustments during FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be honest, I'm not too sure how the reord stuff works in the first place but this seems necessary. When FRTO has been active, the one and only retransmission could be unnecessary but the state and sending order might not be what the sacktag code expects it to be (to work correctly). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 055721d8495..df516d4eca9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1224,7 +1224,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + (tp->frto_highmark && after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 -- cgit From 4dc2665e3634d720a62bd27128fc8781fcdad2dc Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:16:11 -0800 Subject: [TCP]: SACK enhanced FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the SACK-enhanced FRTO given in RFC4138 using the variant given in Appendix B. RFC4138, Appendix B: "This means that in order to declare timeout spurious, the TCP sender must receive an acknowledgment for non-retransmitted segment between SND.UNA and RecoveryPoint in algorithm step 3. RecoveryPoint is defined in conservative SACK-recovery algorithm [RFC3517]" The basic version of the FRTO algorithm can still be used when SACK is enabled. To enable the SACK-enhanced version, the tcp_frto sysctl is set to 2. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 11 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index df516d4eca9..bb3f234668b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -110,6 +111,8 @@ int sysctl_tcp_abc __read_mostly; #define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) #define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) +#define IsSackFrto() (sysctl_tcp_frto == 0x2) + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) /* Adapt the MSS value used to make delayed ack decision to the @@ -1159,6 +1162,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* clear lost hint */ tp->retransmit_skb_hint = NULL; } + /* SACK enhanced F-RTO detection. + * Set flag if and only if non-rexmitted + * segments below frto_highmark are + * SACKed (RFC4138; Appendix B). 
+ * Clearing correct due to in-order walk + */ + if (after(end_seq, tp->frto_highmark)) { + flag &= ~FLAG_ONLY_ORIG_SACKED; + } else { + if (!(sacked & TCPCB_RETRANS)) + flag |= FLAG_ONLY_ORIG_SACKED; + } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; @@ -1240,7 +1255,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* F-RTO can only be used if these conditions are satisfied: * - there must be some unsent new data * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head + * - TCP has never retransmitted anything other than head (SACK enhanced + * variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { @@ -1252,6 +1268,9 @@ int tcp_use_frto(struct sock *sk) tp->snd_una + tp->snd_wnd)) return 0; + if (IsSackFrto()) + return 1; + /* Avoid expensive walking of rexmit queue if possible */ if (tp->retrans_out > 1) return 0; @@ -1328,9 +1347,18 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); + /* Earlier loss recovery underway (see RFC4138; Appendix B). + * The last condition is necessary at least in tp->frto_counter case. + */ + if (IsSackFrto() && (tp->frto_counter || + ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && + after(tp->high_seq, tp->snd_una)) { + tp->frto_highmark = tp->high_seq; + } else { + tp->frto_highmark = tp->snd_nxt; + } tcp_set_ca_state(sk, TCP_CA_Disorder); tp->high_seq = tp->snd_nxt; - tp->frto_highmark = tp->snd_nxt; tp->frto_counter = 1; } @@ -2566,6 +2594,10 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. * + * SACK version: + * on first step, wait until first cumulative ACK arrives, then move to + * the second step. In second step, the next ACK decides. + * * F-RTO is implemented (mainly) in four functions: * - tcp_use_frto() is used to determine if TCP is can use F-RTO * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is @@ -2590,16 +2622,38 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't - * duplicate nor advances window, e.g., opposite dir data, winupdate - */ - if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && - !(flag&FLAG_FORWARD_PROGRESS)) - return 1; + if (!IsSackFrto() || IsReno(tp)) { + /* RFC4138 shortcoming in step 2; should also have case c): + * ACK isn't duplicate nor advances window, e.g., opposite dir + * data, winupdate + */ + if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && + !(flag&FLAG_FORWARD_PROGRESS)) + return 1; - if (!(flag&FLAG_DATA_ACKED)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), flag); - return 1; + if (!(flag&FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), + flag); + return 1; + } + } else { + if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + /* Prevent sending of new data. 
*/ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)); + return 1; + } + + if ((tp->frto_counter == 2) && + (!(flag&FLAG_FORWARD_PROGRESS) || + ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { + /* RFC4138 shortcoming (see comment above) */ + if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP)) + return 1; + + tcp_enter_frto_loss(sk, 3, flag); + return 1; + } } if (tp->frto_counter == 1) { -- cgit From c5e7af0df5d7234afd8596560d9f570cfc6c18bf Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 23 Feb 2007 16:22:06 -0800 Subject: [TCP]: Correct reordering detection change (no FRTO case) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reordering detection must work also when FRTO has not been used at all which was the original intention of mine, just the expression of the idea was flawed. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bb3f234668b..f6ba07f0d81 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1240,7 +1240,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && - (tp->frto_highmark && after(tp->snd_una, tp->frto_highmark))) + (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 -- cgit From 3cfe3baaf07c9e40a75f9a70662de56df1c246a8 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 27 Feb 2007 10:09:49 -0800 Subject: [TCP]: Add two new spurious RTO responses to FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New sysctl tcp_frto_response is added to select amongst these responses: - Rate halving based; reuses CA_CWR state (default) - Very conservative; used to be the only one available (=1) - Undo cwr; undoes ssthresh and cwnd reductions (=2) The response with rate halving requires a new parameter to tcp_enter_cwr because FRTO has already reduced ssthresh and doing a second reduction there has to be prevented. In addition, to keep things nice on 80 cols screen, a local variable was added. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f6ba07f0d81..322e43c5646 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -86,6 +86,7 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; @@ -762,15 +763,17 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) } /* Set slow start threshold and cwnd not falling to slow start */ -void tcp_enter_cwr(struct sock *sk) +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); tp->prior_ssthresh = 0; tp->bytes_acked = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + if (set_ssthresh) + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -2003,7 +2006,7 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(sk); + tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; @@ -2579,6 +2582,21 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) tcp_moderate_cwnd(tp); } +/* A conservative spurious RTO response algorithm: reduce cwnd using + * rate halving and continue in congestion avoidance. + */ +static void tcp_ratehalving_spur_to_response(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + tcp_enter_cwr(sk, 0); + tp->high_seq = tp->frto_highmark; /* Smoother w/o this? - ij */ +} + +static void tcp_undo_spur_to_response(struct sock *sk) +{ + tcp_undo_cwr(sk, 1); +} + /* F-RTO spurious RTO detection algorithm (RFC4138) * * F-RTO affects during two new ACKs following RTO (well, almost, see inline @@ -2661,7 +2679,17 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) tp->frto_counter = 2; return 1; } else /* frto_counter == 2 */ { - tcp_conservative_spur_to_response(tp); + switch (sysctl_tcp_frto_response) { + case 2: + tcp_undo_spur_to_response(sk); + break; + case 1: + tcp_conservative_spur_to_response(tp); + break; + default: + tcp_ratehalving_spur_to_response(sk); + break; + }; tp->frto_counter = 0; } return 0; -- cgit From e01f9d7793be82e6c252efbd52c399d3eb65abe4 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 2 Mar 2007 13:27:25 -0800 Subject: [TCP]: Complete icsk-to-local-variable change (in tcp_enter_cwr) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A local variable for icsk was created but this change was missing. Spotted by Jarek Poplawski. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 322e43c5646..cb715eadf8f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -770,7 +770,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) tp->prior_ssthresh = 0; tp->bytes_acked = 0; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (icsk->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; if (set_ssthresh) tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); -- cgit From e317f6f69cb95527799d308a9421b7dc1252989a Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 2 Mar 2007 13:34:19 -0800 Subject: [TCP]: FRTO undo response falls back to ratehalving one if ECEd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Undoing ssthresh is disabled in fastretrans_alert whenever FLAG_ECE is set by clearing prior_ssthresh. The clearing does not protect FRTO because FRTO operates before fastretrans_alert. Moving the clearing of prior_ssthresh earlier seems to be a suboptimal solution to the FRTO case because then FLAG_ECE will cause a second ssthresh reduction in try_to_open (the first occurred when FRTO was entered). So instead, FRTO falls back immediately to the rate halving response, which switches TCP to CA_CWR state preventing the latter reduction of ssthresh. If the first ECE arrived before the ACK after which FRTO is able to decide RTO as spurious, prior_ssthresh is already cleared. Thus no undoing for ssthresh occurs. Besides, FLAG_ECE should be set also in the following ACKs resulting in rate halving response that sees TCP is already in CA_CWR, which again prevents an extra ssthresh reduction on that round-trip. If the first ECE arrived before RTO, ssthresh has already been adapted and prior_ssthresh remains cleared on entry because TCP is in CA_CWR (the same applies also to a case where FRTO is entered more than once and ECE comes in the middle). High_seq must not be touched after tcp_enter_cwr because CWR round-trip calculation depends on it. I believe that after this patch, FRTO should be ECN-safe and even able to take advantage of synergy benefits. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cb715eadf8f..d894bbcc1d2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2587,14 +2587,15 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) */ static void tcp_ratehalving_spur_to_response(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); tcp_enter_cwr(sk, 0); - tp->high_seq = tp->frto_highmark; /* Smoother w/o this? 
- ij */ } -static void tcp_undo_spur_to_response(struct sock *sk) +static void tcp_undo_spur_to_response(struct sock *sk, int flag) { - tcp_undo_cwr(sk, 1); + if (flag&FLAG_ECE) + tcp_ratehalving_spur_to_response(sk); + else + tcp_undo_cwr(sk, 1); } /* F-RTO spurious RTO detection algorithm (RFC4138) @@ -2681,7 +2682,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) } else /* frto_counter == 2 */ { switch (sysctl_tcp_frto_response) { case 2: - tcp_undo_spur_to_response(sk); + tcp_undo_spur_to_response(sk, flag); break; case 1: tcp_conservative_spur_to_response(tp); -- cgit From 9d729f72dca9406025bcfa9c1f660d71d9ef0ff5 Mon Sep 17 00:00:00 2001 From: James Morris Date: Sun, 4 Mar 2007 16:12:44 -0800 Subject: [NET]: Convert xtime.tv_sec to get_seconds() Where appropriate, convert references to xtime.tv_sec to the get_seconds() helper function. Signed-off-by: James Morris Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d894bbcc1d2..d0a3630f41a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2933,7 +2933,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = xtime.tv_sec; + tp->rx_opt.ts_recent_stamp = get_seconds(); } static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -2947,7 +2947,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) */ if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || - xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) tcp_store_ts_recent(tp); } } @@ -2999,7 +2999,7 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff * { const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && - xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && + get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && !tcp_disordered_ack(sk, skb)); } -- cgit From fe067e8ab5e0dc5ca3c54634924c628da92090b4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 7 Mar 2007 12:12:44 -0800 Subject: [TCP]: Abstract out all write queue operations. This allows the write queue implementation to be changed, for example, to one which allows fast interval searching. Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 64 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d0a3630f41a..22d0bb03c5d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1044,7 +1044,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cached_skb = tp->fastpath_skb_hint; cached_fack_count = tp->fastpath_cnt_hint; if (!cached_skb) { - cached_skb = sk->sk_write_queue.next; + cached_skb = tcp_write_queue_head(sk); cached_fack_count = 0; } @@ -1061,10 +1061,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + if (skb == tcp_send_head(sk)) + break; + cached_skb = skb; cached_fack_count = fack_count; if (i == first_sack_index) { @@ -1213,7 +1216,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) break; if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) @@ -1266,8 +1271,8 @@ int tcp_use_frto(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!sysctl_tcp_frto || !sk->sk_send_head || - after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + if (!sysctl_tcp_frto || !tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) return 0; @@ -1278,8 +1283,11 @@ int tcp_use_frto(struct sock *sk) if (tp->retrans_out > 1) return 0; - skb = skb_peek(&sk->sk_write_queue)->next; /* Skips head */ - sk_stream_for_retrans_queue_from(skb, sk) { + skb = tcp_write_queue_head(sk); + skb = tcp_write_queue_next(sk, skb); /* Skips head */ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) return 0; /* Short-circuit when first non-SACKed skb has been checked */ @@ -1343,7 +1351,7 @@ void tcp_enter_frto(struct sock *sk) tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; - skb = skb_peek(&sk->sk_write_queue); + skb = tcp_write_queue_head(sk); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1380,7 +1388,9 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->fackets_out = 0; tp->retrans_out = 0; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); /* * Count the retransmission made on RTO correctly (only when @@ -1468,7 +1478,9 @@ void tcp_enter_loss(struct sock *sk, int how) if (!how) tp->undo_marker = tp->snd_una; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) tp->undo_marker = 0; @@ -1503,14 +1515,14 @@ static int tcp_check_sack_reneging(struct sock *sk) * receiver _host_ is heavily congested (or buggy). * Do processing similar to RTO timeout. 
*/ - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && + if ((skb = tcp_write_queue_head(sk)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); return 1; @@ -1531,7 +1543,7 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) { return tp->packets_out && - tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1726,11 +1738,13 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; } else { - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); cnt = 0; } - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -1777,9 +1791,11 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint - : sk->sk_write_queue.next; + : tcp_write_queue_head(sk); - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (!tcp_skb_timedout(sk, skb)) break; @@ -1970,7 +1986,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) { if (tcp_may_undo(tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } @@ -2382,8 +2400,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) = icsk->icsk_ca_ops->rtt_sample; struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; - while ((skb = skb_peek(&sk->sk_write_queue)) && - skb != sk->sk_send_head) { + while ((skb = tcp_write_queue_head(sk)) && + skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u8 sacked = scb->sacked; @@ -2446,7 +2464,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); clear_all_retrans_hints(tp); } @@ -2495,7 +2513,7 @@ static void tcp_ack_probe(struct sock *sk) /* Was it a usable window open? */ - if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); @@ -2795,7 +2813,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (sk->sk_send_head) + if (tcp_send_head(sk)) tcp_ack_probe(sk); return 1; -- cgit From 2de979bd7da9c8b39cc0aabb0ab5aa1516d929eb Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 8 Mar 2007 20:45:19 -0800 Subject: [TCP]: whitespace cleanup Add whitespace around keywords. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 57 ++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 22d0bb03c5d..fb025608594 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -578,7 +578,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if(m == 0) + if (m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ @@ -1758,12 +1758,11 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* clear xmit_retransmit_queue hints * if this is beyond hint */ - if(tp->retransmit_skb_hint != NULL && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { - + if (tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = NULL; - } + } } tcp_sync_left_out(tp); @@ -2441,7 +2440,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) if (sacked) { if (sacked & TCPCB_RETRANS) { - if(sacked & TCPCB_SACKED_RETRANS) + if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; @@ -2840,7 +2839,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; - while(length>0) { + while (length > 0) { int opcode=*ptr++; int opsize; @@ -2856,9 +2855,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, return; if (opsize > length) return; /* don't parse partial options */ - switch(opcode) { + switch (opcode) { case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn && !estab) { + if (opsize==TCPOLEN_MSS && th->syn && !estab) { u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); if (in_mss) { if (opt_rx->user_mss && opt_rx->user_mss < in_mss) @@ -2868,12 +2867,12 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn && !estab) + if (opsize==TCPOLEN_WINDOW && th->syn && !estab) if (sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *) ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { - if(net_ratelimit()) + if (net_ratelimit()) printk(KERN_INFO "tcp_parse_options: Illegal window " "scaling value %d >14 received.\n", snd_wscale); @@ -2883,7 +2882,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { + if (opsize==TCPOLEN_TIMESTAMP) { if ((estab && opt_rx->tstamp_ok) || (!estab && sysctl_tcp_timestamps)) { opt_rx->saw_tstamp = 1; @@ -2893,7 +2892,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { + if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { if (sysctl_tcp_sack) { opt_rx->sack_ok = 1; tcp_sack_reset(opt_rx); @@ -2902,7 +2901,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_SACK: - if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; @@ 
-2964,7 +2963,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. */ - if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) tcp_store_ts_recent(tp); } @@ -3223,7 +3222,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) */ tp->rx_opt.num_sacks--; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); - for(i=this_sack; i < tp->rx_opt.num_sacks; i++) + for (i=this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i+1]; continue; } @@ -3276,7 +3275,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.num_sacks--; sp--; } - for(; this_sack > 0; this_sack--, sp--) + for (; this_sack > 0; this_sack--, sp--) *sp = *(sp-1); new_sack: @@ -3302,7 +3301,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) return; } - for(this_sack = 0; this_sack < num_sacks; ) { + for (this_sack = 0; this_sack < num_sacks; ) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; @@ -3358,7 +3357,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) + if (skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); } } @@ -3424,9 +3423,9 @@ queue_and_out: __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->len) + if (skb->len) tcp_event_data_recv(sk, tp, skb); - if(th->fin) + if (th->fin) tcp_fin(skb, sk, th); if (!skb_queue_empty(&tp->out_of_order_queue)) { @@ -4323,7 +4322,7 @@ slow_path: goto discard; } - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4338,7 +4337,7 @@ slow_path: } step5: - if(th->ack) + if (th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); tcp_rcv_rtt_measure_ts(sk, skb); @@ -4626,13 +4625,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_LISTEN: - if(th->ack) + if (th->ack) return 1; - if(th->rst) + if (th->rst) goto discard; - if(th->syn) { + if (th->syn) { if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; @@ -4688,7 +4687,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } /* step 2: check RST bit */ - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4711,7 +4710,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (th->ack) { int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); - switch(sk->sk_state) { + switch (sk->sk_state) { case TCP_SYN_RECV: if (acceptable) { tp->copied_seq = tp->rcv_nxt; -- cgit From c51957dafa6f960c5c6372aa3da6c8fa71c13730 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 10 Mar 2007 12:47:22 -0300 Subject: [TCP]: Do the layer header setting in tcp_collapse relative to skb->data That is equal to skb->head before skb_reserve, to help in the layer header changes. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb025608594..e5d1c2c8cea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3632,11 +3632,13 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, nskb = alloc_skb(copy+header, GFP_ATOMIC); if (!nskb) return; + + nskb->mac.raw = nskb->data + (skb->mac.raw - skb->head); + nskb->nh.raw = nskb->data + (skb->nh.raw - skb->head); + nskb->h.raw = nskb->data + (skb->h.raw - skb->head); + skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); - nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); - nskb->h.raw = nskb->head + (skb->h.raw-skb->head); - nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_insert(nskb, skb->prev, skb, list); -- cgit From 31713c333ddbb66d694829082620b69b71c4b09a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 10 Mar 2007 12:48:37 -0300 Subject: [TCP]: Use skb_set_mac_header in tcp_collapse Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e5d1c2c8cea..1ec05bd673a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3633,7 +3633,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, if (!nskb) return; - nskb->mac.raw = nskb->data + (skb->mac.raw - skb->head); + skb_set_mac_header(nskb, skb->mac.raw - skb->head); nskb->nh.raw = nskb->data + (skb->nh.raw - skb->head); nskb->h.raw = nskb->data + (skb->h.raw - skb->head); -- cgit From 98e399f82ab3a6d863d1d4a7ea48925cc91c830e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 19 Mar 2007 15:33:04 -0700 Subject: [SK_BUFF]: Introduce skb_mac_header() For the places where we need a pointer to the mac header, it is still legal to touch skb->mac.raw directly if just adding to, subtracting from or setting it to another layer header. This one also converts some more cases to skb_reset_mac_header() that my regex missed as it had no spaces before nor after '=', ugh. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1ec05bd673a..f5e019cefc1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3633,7 +3633,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, if (!nskb) return; - skb_set_mac_header(nskb, skb->mac.raw - skb->head); + skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); nskb->nh.raw = nskb->data + (skb->nh.raw - skb->head); nskb->h.raw = nskb->data + (skb->h.raw - skb->head); -- cgit From d56f90a7c96da5187f0cdf07ee7434fe6aa78bbc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Apr 2007 20:50:43 -0700 Subject: [SK_BUFF]: Introduce skb_network_header() For the places where we need a pointer to the network header, it is still legal to touch skb->nh.raw directly if just adding to, subtracting from or setting it to another layer header. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f5e019cefc1..00190835cea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3634,7 +3634,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, return; skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - nskb->nh.raw = nskb->data + (skb->nh.raw - skb->head); + nskb->nh.raw = nskb->data + (skb_network_header(skb) - skb->head); nskb->h.raw = nskb->data + (skb->h.raw - skb->head); skb_reserve(nskb, header); -- cgit From c14d2450cb7fe1786e2ec325172baf66922bf597 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 11 Mar 2007 22:39:41 -0300 Subject: [SK_BUFF]: Introduce skb_set_network_header For the cases where the network header is being set to an offset from skb->data. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 00190835cea..5da823a3225 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3634,7 +3634,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, return; skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - nskb->nh.raw = nskb->data + (skb_network_header(skb) - skb->head); + skb_set_network_header(nskb, + skb_network_header(skb) - skb->head); nskb->h.raw = nskb->data + (skb->h.raw - skb->head); skb_reserve(nskb, header); -- cgit From 967b05f64e27d04a4c8879addd0e1c52137e2c9e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Mar 2007 13:51:52 -0300 Subject: [SK_BUFF]: Introduce skb_set_transport_header For the cases where the transport header is being set to an offset from skb->data. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5da823a3225..2776a8b0133 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3636,7 +3636,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); skb_set_network_header(nskb, skb_network_header(skb) - skb->head); - nskb->h.raw = nskb->data + (skb->h.raw - skb->head); + skb_set_transport_header(nskb, skb->h.raw - skb->head); skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); -- cgit From aa8223c7bb0b05183e1737881ed21827aa5b9e73 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Apr 2007 21:04:22 -0700 Subject: [SK_BUFF]: Introduce tcp_hdr(), remove skb->h.th Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2776a8b0133..c1ce3623738 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -148,7 +148,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && - !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { + !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. 
* Resulting "len" is MSS free of SACK jitter. @@ -2559,9 +2559,9 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb, u32 ack, u32 ack_seq) { int flag = 0; - u32 nwin = ntohs(skb->h.th->window); + u32 nwin = ntohs(tcp_hdr(skb)->window); - if (likely(!skb->h.th->syn)) + if (likely(!tcp_hdr(skb)->syn)) nwin <<= tp->rx_opt.snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { @@ -2766,7 +2766,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; tcp_ca_event(sk, CA_EVENT_SLOW_ACK); @@ -2833,7 +2833,7 @@ uninteresting_ack: void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) { unsigned char *ptr; - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); int length=(th->doff*4)-sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); @@ -2995,7 +2995,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -3357,8 +3357,8 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (skb->h.th->fin) - tcp_fin(skb, sk, skb->h.th); + if (tcp_hdr(skb)->fin) + tcp_fin(skb, sk, tcp_hdr(skb)); } } @@ -3366,7 +3366,7 @@ static int tcp_prune_queue(struct sock *sk); static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; @@ -3605,7 +3605,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, * - bloated or contains data before "start" or * overlaps to the next one. 
*/ - if (!skb->h.th->syn && !skb->h.th->fin && + if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start) || (skb->next != tail && @@ -3616,7 +3616,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, start = TCP_SKB_CB(skb)->end_seq; skb = skb->next; } - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; while (before(start, end)) { @@ -3665,7 +3665,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || + tcp_hdr(skb)->syn || + tcp_hdr(skb)->fin) return; } } @@ -4072,7 +4074,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen tcp_rcv_space_adjust(sk); if ((tp->ucopy.len == 0) || - (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; sk->sk_data_ready(sk, 0); -- cgit From 9c70220b73908f64792422a2c39c593c4792f2c5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 25 Apr 2007 18:04:18 -0700 Subject: [SK_BUFF]: Introduce skb_transport_header(skb) For the places where we need a pointer to the transport header, it is still legal to touch skb->h.raw directly if just adding to, subtracting from or setting it to another layer header. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c1ce3623738..9c3b4c7a50a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -140,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * * "len" is invariant segment length, including TCP header. */ - len += skb->data - skb->h.raw; + len += skb->data - skb_transport_header(skb); if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. 
@@ -940,7 +940,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; + unsigned char *ptr = (skb_transport_header(ack_skb) + + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; @@ -3634,10 +3635,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, return; skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - skb_set_network_header(nskb, - skb_network_header(skb) - skb->head); - skb_set_transport_header(nskb, skb->h.raw - skb->head); - + skb_set_network_header(nskb, (skb_network_header(skb) - + skb->head)); + skb_set_transport_header(nskb, (skb_transport_header(skb) - + skb->head)); skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); -- cgit From 604763722c655c7e3f31ecf6f7b4dafcd26a7a15 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 9 Apr 2007 11:59:39 -0700 Subject: [NET]: Treat CHECKSUM_PARTIAL as CHECKSUM_UNNECESSARY When a transmitted packet is looped back directly, CHECKSUM_PARTIAL maps to the semantics of CHECKSUM_UNNECESSARY. Therefore we should treat it as such in the stack. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c3b4c7a50a..d1604f59d77 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4009,7 +4009,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) int err; local_bh_enable(); - if (skb->ip_summed==CHECKSUM_UNNECESSARY) + if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); else err = skb_copy_and_csum_datagram_iovec(skb, hlen, @@ -4041,7 +4041,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { - return skb->ip_summed != CHECKSUM_UNNECESSARY && + return !skb_csum_unnecessary(skb) && __tcp_checksum_complete_user(sk, skb); } @@ -4059,7 +4059,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = get_softnet_dma(); - if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); -- cgit From 3ff50b7997fe06cd5d276b229967bb52d6b3b6c1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 20 Apr 2007 17:09:22 -0700 Subject: [NET]: cleanup extra semicolons Spring cleaning time... There seems to be a lot of places in the network code that have extra bogus semicolons after conditionals. Most commonly is a bogus semicolon after: switch() { } Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d1604f59d77..2fbfc2e4209 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2708,7 +2708,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) default: tcp_ratehalving_spur_to_response(sk); break; - }; + } tp->frto_counter = 0; } return 0; @@ -2915,10 +2915,11 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif - }; + } + ptr+=opsize-2; length-=opsize; - }; + } } } @@ -3124,7 +3125,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", __FUNCTION__, sk->sk_state); break; - }; + } /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. -- cgit From 9e412ba7632f71259a53085665d4983b78257b7c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 20 Apr 2007 22:18:02 -0700 Subject: [TCP]: Sed magic converts func(sk, tp, ...) -> func(sk, ...) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is (mostly) automated change using magic: sed -e '/struct sock \*sk/ N' -e '/struct sock \*sk/ N' -e '/struct sock \*sk/ N' -e '/struct sock \*sk/ N' -e 's|struct sock \*sk,[\n\t ]*struct tcp_sock \*tp\([^{]*\n{\n\)| struct sock \*sk\1\tstruct tcp_sock *tp = tcp_sk(sk);\n|g' -e 's|struct sock \*sk, struct tcp_sock \*tp| struct sock \*sk|g' -e 's|sk, tp\([^-]\)|sk\1|g' Fixed four unused variable (tp) warnings that were introduced. In addition, manually added newlines after local variables and tweaked function arguments positioning. $ gcc --version gcc (GCC) 4.1.1 20060525 (Red Hat 4.1.1-1) ... $ codiff -fV built-in.o.old built-in.o.new net/ipv4/route.c: rt_cache_flush | +14 1 function changed, 14 bytes added net/ipv4/tcp.c: tcp_setsockopt | -5 tcp_sendpage | -25 tcp_sendmsg | -16 3 functions changed, 46 bytes removed net/ipv4/tcp_input.c: tcp_try_undo_recovery | +3 tcp_try_undo_dsack | +2 tcp_mark_head_lost | -12 tcp_ack | -15 tcp_event_data_recv | -32 tcp_rcv_state_process | -10 tcp_rcv_established | +1 7 functions changed, 6 bytes added, 69 bytes removed, diff: -63 net/ipv4/tcp_output.c: update_send_head | -9 tcp_transmit_skb | +19 tcp_cwnd_validate | +1 tcp_write_wakeup | -17 __tcp_push_pending_frames | -25 tcp_push_one | -8 tcp_send_fin | -4 7 functions changed, 20 bytes added, 63 bytes removed, diff: -43 built-in.o.new: 18 functions changed, 40 bytes added, 178 bytes removed, diff: -138 Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 145 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 59 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2fbfc2e4209..63338939078 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -235,9 +235,9 @@ static void tcp_fixup_sndbuf(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, - const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! 
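 * (Editor's aside: beyond this hunk, the function halves truesize and
 * window in lock-step, allowing rcv_ssthresh to grow only if payload
 * still dominates the skb's true memory cost at some scale.)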
*/ int truesize = tcp_win_from_space(skb->truesize)/2; int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; @@ -252,9 +252,11 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && @@ -267,7 +269,7 @@ static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, if (tcp_win_from_space(skb->truesize) <= skb->len) incr = 2*tp->advmss; else - incr = __tcp_grow_window(sk, tp, skb); + incr = __tcp_grow_window(sk, skb); if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); @@ -330,8 +332,9 @@ static void tcp_init_buffer_space(struct sock *sk) } /* 5. Recalculate window clamp after socket hit its memory bounds. */ -static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) +static void tcp_clamp_window(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ack.quick = 0; @@ -503,8 +506,9 @@ new_measure: * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; @@ -545,7 +549,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ TCP_ECN_check_ce(tp, skb); if (skb->len >= 128) - tcp_grow_window(sk, tp, skb); + tcp_grow_window(sk, skb); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -1541,8 +1545,10 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } -static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) +static inline int tcp_head_timedout(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + return tp->packets_out && tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } @@ -1640,8 +1646,9 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) +static int tcp_time_to_recover(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; /* Do not perform any recovery during FRTO algorithm */ @@ -1659,7 +1666,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ - if (tcp_head_timedout(sk, tp)) + if (tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -1668,7 +1675,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) packets_out = tp->packets_out; if (packets_out <= tp->reordering && tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && - !tcp_may_send_now(sk, tp)) { + !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. */ @@ -1708,8 +1715,10 @@ static void tcp_add_reno_sack(struct sock *sk) /* Account for ACK, ACKing some data in Reno Recovery phase. 
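 * (Editor's aside: without peer SACK support, sacked_out is emulated
 * from duplicate ACKs, so a cumulative ACK that fills one hole consumes
 * one emulated SACK and the remaining acked segments count as dupacks.)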
*/ -static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); + if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ if (acked-1 >= tp->sacked_out) @@ -1728,9 +1737,10 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) } /* Mark head of queue up as lost. */ -static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, +static void tcp_mark_head_lost(struct sock *sk, int packets, u32 high_seq) { + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt; @@ -1771,15 +1781,17 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* Account newly detected lost packet(s) */ -static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) +static void tcp_update_scoreboard(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (IsFack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + tcp_mark_head_lost(sk, lost, tp->high_seq); } else { - tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + tcp_mark_head_lost(sk, 1, tp->high_seq); } /* New heuristics: it is possible only after we switched @@ -1787,7 +1799,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { + if (!IsReno(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -1867,9 +1879,11 @@ static inline int tcp_packet_delayed(struct tcp_sock *tp) /* Undo procedures. */ #if FASTRETRANS_DEBUG > 1 -static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) +static void DBGUNDO(struct sock *sk, const char *msg) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), @@ -1915,13 +1929,15 @@ static inline int tcp_may_undo(struct tcp_sock *tp) } /* People celebrate: "We love our President!" */ -static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_recovery(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ - DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); + DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); @@ -1941,10 +1957,12 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) +static void tcp_try_undo_dsack(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->undo_marker && !tp->undo_retrans) { - DBGUNDO(sk, tp, "D-SACK"); + DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); @@ -1953,9 +1971,9 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) /* Undo during fast recovery after partial ACK. 
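 * (Editor's aside: a partial ACK advances snd_una without reaching
 * high_seq; Hoe's retransmit resends the presumed next hole while the
 * undo bookkeeping checks whether the retransmissions were needed.)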
*/ -static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, - int acked) +static int tcp_try_undo_partial(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ int failed = IsReno(tp) || tp->fackets_out>tp->reordering; @@ -1968,7 +1986,7 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - DBGUNDO(sk, tp, "Hoe"); + DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); @@ -1982,8 +2000,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, } /* Undo during loss recovery after partial ACK. */ -static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_loss(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { struct sk_buff *skb; tcp_for_write_queue(skb, sk) { @@ -1994,7 +2014,7 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) clear_all_retrans_hints(tp); - DBGUNDO(sk, tp, "partial loss"); + DBGUNDO(sk, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; tcp_undo_cwr(sk, 1); @@ -2016,8 +2036,10 @@ static inline void tcp_complete_cwr(struct sock *sk) tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } -static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) +static void tcp_try_to_open(struct sock *sk, int flag) { + struct tcp_sock *tp = tcp_sk(sk); + tp->left_out = tp->sacked_out; if (tp->retrans_out == 0) @@ -2111,7 +2133,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -2127,7 +2149,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, switch (icsk->icsk_ca_state) { case TCP_CA_Loss: icsk->icsk_retransmits = 0; - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; break; @@ -2141,7 +2163,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Disorder: - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. 
*/ @@ -2154,7 +2176,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, case TCP_CA_Recovery: if (IsReno(tp)) tcp_reset_reno_sack(tp); - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; tcp_complete_cwr(sk); break; @@ -2170,14 +2192,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } else { int acked = prior_packets - tp->packets_out; if (IsReno(tp)) - tcp_remove_reno_sacks(sk, tp, acked); - is_dupack = tcp_try_undo_partial(sk, tp, acked); + tcp_remove_reno_sacks(sk, acked); + is_dupack = tcp_try_undo_partial(sk, acked); } break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; - if (!tcp_try_undo_loss(sk, tp)) { + if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; @@ -2194,10 +2216,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } if (icsk->icsk_ca_state == TCP_CA_Disorder) - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); - if (!tcp_time_to_recover(sk, tp)) { - tcp_try_to_open(sk, tp, flag); + if (!tcp_time_to_recover(sk)) { + tcp_try_to_open(sk, flag); return; } @@ -2236,8 +2258,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_set_ca_state(sk, TCP_CA_Recovery); } - if (is_dupack || tcp_head_timedout(sk, tp)) - tcp_update_scoreboard(sk, tp); + if (is_dupack || tcp_head_timedout(sk)) + tcp_update_scoreboard(sk); tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } @@ -2313,8 +2335,10 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. */ -static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { @@ -2471,7 +2495,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) if (acked&FLAG_ACKED) { tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk, tp); + tcp_ack_packets_out(sk); if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) (*rtt_sample)(sk, tcp_usrtt(&tv)); @@ -2556,9 +2580,10 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ -static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb, u32 ack, u32 ack_seq) +static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, + u32 ack_seq) { + struct tcp_sock *tp = tcp_sk(sk); int flag = 0; u32 nwin = ntohs(tcp_hdr(skb)->window); @@ -2576,7 +2601,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, * fast path is recovered for sending TCP. 
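 * (Editor's aside: pred_flags caches the precomputed header-prediction
 * word; zeroing it forces the slow path until tcp_fast_path_check()
 * re-arms it once the window state is consistent again.)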
*/ tp->pred_flags = 0; - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (nwin > tp->max_window) { tp->max_window = nwin; @@ -2762,7 +2787,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) else NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); - flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); @@ -3426,7 +3451,7 @@ queue_and_out: } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) - tcp_event_data_recv(sk, tp, skb); + tcp_event_data_recv(sk, skb); if (th->fin) tcp_fin(skb, sk, th); @@ -3443,7 +3468,7 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (eaten > 0) __kfree_skb(skb); @@ -3734,7 +3759,7 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - tcp_clamp_window(sk, tp); + tcp_clamp_window(sk); else if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -3803,8 +3828,10 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + /* If the user specified a specific send buffer setting, do * not modify it. */ @@ -3836,7 +3863,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_should_expand_sndbuf(sk, tp)) { + if (tcp_should_expand_sndbuf(sk)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, @@ -3860,9 +3887,9 @@ static void tcp_check_space(struct sock *sk) } } -static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk) { - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); tcp_check_space(sk); } @@ -4196,7 +4223,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -4267,12 +4294,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } - tcp_event_data_recv(sk, tp, skb); + tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -4355,7 +4382,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); return 0; @@ -4672,7 +4699,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. 
*/ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } @@ -4864,7 +4891,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); } -- cgit From 164891aadf1721fca4dce473bb0e0998181537c6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Apr 2007 22:26:16 -0700 Subject: [TCP]: Congestion control API update. Do some simple changes to make congestion control API faster/cleaner. * use ktime_t rather than timeval * merge rtt sampling into existing ack callback this means one indirect call versus two per ack. * use flags bits to store options/settings Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 63338939078..051f0f815f1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2402,14 +2402,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static u32 tcp_usrtt(struct timeval *tv) -{ - struct timeval now; - - do_gettimeofday(&now); - return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec); -} - /* Remove acknowledged frames from the retransmission queue. */ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { @@ -2420,9 +2412,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; __s32 seq_rtt = -1; u32 pkts_acked = 0; - void (*rtt_sample)(struct sock *sk, u32 usrtt) - = icsk->icsk_ca_ops->rtt_sample; - struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; + ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { @@ -2471,7 +2461,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = -1; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); @@ -2484,7 +2474,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); @@ -2494,13 +2484,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { + const struct tcp_congestion_ops *ca_ops + = inet_csk(sk)->icsk_ca_ops; + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); - if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) - (*rtt_sample)(sk, tcp_usrtt(&tv)); - if (icsk->icsk_ca_ops->pkts_acked) - icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, last_ackt); } #if FASTRETRANS_DEBUG > 0 -- cgit From 575ee7140dabe9b9c4f66f4f867039b97e548867 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 30 Apr 2007 00:39:55 -0700 Subject: [TCP] FRTO: Delay skb available check until it's mandatory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No new data is needed until the first ACK comes, so no need to check for application limitedness until then. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 051f0f815f1..6b669898b19 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1265,20 +1265,15 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* F-RTO can only be used if these conditions are satisfied: - * - there must be some unsent new data - * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head (SACK enhanced - * variant from Appendix B of RFC4138 is more robust here) +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!sysctl_tcp_frto || !tcp_send_head(sk) || - after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, - tp->snd_una + tp->snd_wnd)) + if (!sysctl_tcp_frto) return 0; if (IsSackFrto()) @@ -2710,6 +2705,14 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) } if (tp->frto_counter == 1) { + /* Sending of the next skb must be allowed or no FRTO */ + if (!tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; -- cgit From d551e4541dd60ae53459f77a971f2d6043431f5f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 30 Apr 2007 00:42:20 -0700 Subject: [TCP] FRTO: RFC4138 allows Nagle override when new data must be sent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a corner case where less than one MSS worth of new data is waiting in the send queue. For F-RTO to work correctly, a new data segment must be sent at a certain point or F-RTO cannot be used at all. RFC4138 allows overriding of Nagle at that point. The implementation uses frto_counter states 2 and 3 to distinguish when the Nagle override is needed. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6b669898b19..7641b2761a1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2637,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * algorithm is not part of the F-RTO detection algorithm * given in RFC4138 but can be selected separately). * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. * * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. @@ -2666,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) inet_csk(sk)->icsk_retransmits = 0; if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
2 : 3), flag); return 1; } @@ -2692,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - if ((tp->frto_counter == 2) && + if ((tp->frto_counter >= 2) && (!(flag&FLAG_FORWARD_PROGRESS) || ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ @@ -2709,14 +2711,15 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (!tcp_send_head(sk) || after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), + flag); return 1; } tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; - } else /* frto_counter == 2 */ { + } else { switch (sysctl_tcp_frto_response) { case 2: tcp_undo_spur_to_response(sk, flag); -- cgit
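
Editor's addendum: the F-RTO patches above culminate in tcp_process_frto() dispatching one of three spurious-RTO responses keyed by the tcp_frto_response sysctl, with the new FLAG_ECE guard preventing a full cwnd undo from overriding an explicit ECN congestion signal. The following stand-alone C sketch models only that dispatch; it is not kernel code, and the struct layout, the FLAG_ECE bit value, the snd_cwnd_prior field and the simplified response bodies are all illustrative assumptions.

/* Sketch only: a user-space model of the response dispatch in
 * tcp_process_frto(). Field names mirror the patches above; the numeric
 * FLAG_ECE value and the snd_cwnd_prior field are assumptions. */
#include <stdio.h>

#define FLAG_ECE 0x40	/* assumed bit value, for illustration */

struct frto_sock {
	unsigned int snd_cwnd;
	unsigned int snd_ssthresh;
	unsigned int snd_cwnd_prior;	/* stand-in for the undo state */
};

static unsigned int min_u32(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* Default response: modelled here as a simple clamp; the real
 * tcp_ratehalving_spur_to_response() enters CWR and decays cwnd
 * gradually, roughly one segment per incoming ACK. */
static void ratehalving_response(struct frto_sock *tp)
{
	tp->snd_cwnd = min_u32(tp->snd_cwnd, tp->snd_ssthresh);
}

/* sysctl value 1: the conservative response from the patches above -
 * reduce cwnd and continue in congestion avoidance. */
static void conservative_response(struct frto_sock *tp)
{
	tp->snd_cwnd = min_u32(tp->snd_cwnd, tp->snd_ssthresh);
}

/* sysctl value 2: undo the RTO reaction entirely, except when the ACK
 * carried an ECN echo (FLAG_ECE), in which case fall back to rate
 * halving - the guard added to tcp_undo_spur_to_response(). */
static void undo_response(struct frto_sock *tp, int flag)
{
	if (flag & FLAG_ECE)
		ratehalving_response(tp);
	else
		tp->snd_cwnd = tp->snd_cwnd_prior;
}

static void spurious_rto_response(struct frto_sock *tp, int sysctl_response,
				  int flag)
{
	switch (sysctl_response) {
	case 2:
		undo_response(tp, flag);
		break;
	case 1:
		conservative_response(tp);
		break;
	default:
		ratehalving_response(tp);
		break;
	}
}

int main(void)
{
	struct frto_sock tp = { .snd_cwnd = 10, .snd_ssthresh = 6,
				.snd_cwnd_prior = 12 };

	spurious_rto_response(&tp, 2, 0);
	printf("undo without ECE: cwnd=%u\n", tp.snd_cwnd);	/* 12 */

	tp.snd_cwnd = 10;
	spurious_rto_response(&tp, 2, FLAG_ECE);
	printf("undo with ECE:    cwnd=%u\n", tp.snd_cwnd);	/* 6 */

	return 0;
}

The design intuition behind the guard: an ECN echo is affirmative congestion feedback from the network, so even a provably spurious RTO is no license to restore the old window; rate halving is the safe fallback.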