From 522e7548a9bd40305df41c0beae69448b7620d6b Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 22:54:52 -0800 Subject: [TCP] FRTO: Incorrectly clears TCPCB_EVER_RETRANS bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FRTO was slightly too brave... Should only clear TCPCB_SACKED_RETRANS bit. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a14191687a..b21e232d5d3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1266,7 +1266,7 @@ void tcp_enter_frto(struct sock *sk) tp->undo_retrans = 0; sk_stream_for_retrans_queue(skb, sk) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS; + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; } tcp_sync_left_out(tp); -- cgit From 9ead9a1d385ae2c52a6dcf2828d84ce66be04fc2 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 22:56:19 -0800 Subject: [TCP] FRTO: Separated response from FRTO detection algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FRTO spurious RTO detection algorithm (RFC4138) does not include response to a detected spurious RTO but can use different response algorithms. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b21e232d5d3..c5be3d0465f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2467,6 +2467,15 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, return flag; } +/* A very conservative spurious RTO response algorithm: reduce cwnd and + * continue in congestion avoidance. + */ +static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tcp_moderate_cwnd(tp); +} + static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); @@ -2488,12 +2497,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) */ tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; } else { - /* Also the second ACK after RTO advances the window. - * The RTO was likely spurious. Reduce cwnd and continue - * in congestion avoidance - */ - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tcp_moderate_cwnd(tp); + tcp_conservative_spur_to_response(tp); } /* F-RTO affects on two new ACKs following RTO. -- cgit From bdaae17da81db79b9aa4dfbf43305cfeef64f6a8 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 22:59:58 -0800 Subject: [TCP] FRTO: Moved tcp_use_frto from tcp.h to tcp_input.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In addition, removed inline. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5be3d0465f..fe96e176d85 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1236,6 +1236,19 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } +int tcp_use_frto(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + /* F-RTO must be activated in sysctl and there must be some + * unsent new data, and the advertised window should allow + * sending it. + */ + return (sysctl_tcp_frto && sk->sk_send_head && + !after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + tp->snd_una + tp->snd_wnd)); +} + /* RTO occurred, but do not yet enter loss state. Instead, transmit two new * segments to see from the next ACKs whether any data was really missing. * If the RTO was spurious, new ACKs should arrive. -- cgit From 30935cf4f915c3178ce63331d6ff4c82163e26af Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:01:36 -0800 Subject: [TCP] FRTO: Comment cleanup & improvement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moved comments out from the body of process_frto() to the head (the preferred way; see Documentation/CodingStyle). Bonus: it's much easier to read in this compacted form. The FRTO algorithm and implementation are described in greater detail. For the interested reader, more information is available in RFC4138. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fe96e176d85..561e5d40498 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1236,22 +1236,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } +/* F-RTO can only be used if these conditions are satisfied: + * - there must be some unsent new data + * - the advertised window should allow sending it + */ int tcp_use_frto(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - /* F-RTO must be activated in sysctl and there must be some - * unsent new data, and the advertised window should allow - * sending it. - */ return (sysctl_tcp_frto && sk->sk_send_head && !after(TCP_SKB_CB(sk->sk_send_head)->end_seq, tp->snd_una + tp->snd_wnd)); } -/* RTO occurred, but do not yet enter loss state. Instead, transmit two new - * segments to see from the next ACKs whether any data was really missing. - * If the RTO was spurious, new ACKs should arrive. +/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO + * recovery a bit and use heuristics in tcp_process_frto() to detect if + * the RTO was spurious. */ void tcp_enter_frto(struct sock *sk) { @@ -2489,6 +2489,30 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) tcp_moderate_cwnd(tp); } +/* F-RTO spurious RTO detection algorithm (RFC4138) + * + * F-RTO affects during two new ACKs following RTO. State (ACK number) is kept + * in frto_counter. When ACK advances window (but not to or beyond highest + * sequence sent before RTO): + * On First ACK, send two new segments out. + * On Second ACK, RTO was likely spurious. Do spurious response (response + * algorithm is not part of the F-RTO detection algorithm + * given in RFC4138 but can be selected separately). 
+ * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss + * and TCP falls back to conventional RTO recovery. + * + * Rationale: if the RTO was spurious, new ACKs should arrive from the + * original window even after we transmit two new data segments. + * + * F-RTO is implemented (mainly) in four functions: + * - tcp_use_frto() is used to determine if TCP is can use F-RTO + * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is + * called when tcp_use_frto() showed green light + * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm + * - tcp_enter_frto_loss() is called if there is not enough evidence + * to prove that the RTO is indeed spurious. It transfers the control + * from F-RTO to the conventional RTO recovery + */ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); @@ -2497,25 +2521,16 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) if (tp->snd_una == prior_snd_una || !before(tp->snd_una, tp->frto_highmark)) { - /* RTO was caused by loss, start retransmitting in - * go-back-N slow start - */ tcp_enter_frto_loss(sk); return; } if (tp->frto_counter == 1) { - /* First ACK after RTO advances the window: allow two new - * segments out. - */ tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; - } else { + } else /* frto_counter == 2 */ { tcp_conservative_spur_to_response(tp); } - /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavior is back to normal. - */ tp->frto_counter = (tp->frto_counter + 1) % 3; } -- cgit From 7487c48c4fd15d1e2542be1183b783562cfe10bc Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:02:30 -0800 Subject: [TCP] FRTO: Consecutive RTOs keep prior_ssthresh and ssthresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case a latency spike causes more than one RTO, the latter should not cause the already reduced ssthresh to propagate into the prior_ssthresh, since FRTO declares all such RTOs spurious at once or none of them. In the treatment of ssthresh, we mimic what tcp_enter_loss() does. The previous state (in frto_counter) must be available until we have checked it in tcp_enter_frto(), as must the ACK information flag in process_frto(). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 561e5d40498..194e880af51 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1252,6 +1252,10 @@ int tcp_use_frto(const struct sock *sk) /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO * recovery a bit and use heuristics in tcp_process_frto() to detect if * the RTO was spurious. + * + * Do like tcp_enter_loss() would; when RTO expires the second time it + * does: + * "Reduce ssthresh if it has not yet been made inside this window." 
*/ void tcp_enter_frto(struct sock *sk) { @@ -1259,11 +1263,10 @@ void tcp_enter_frto(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - tp->frto_counter = 1; - - if (icsk->icsk_ca_state <= TCP_CA_Disorder || + if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || tp->snd_una == tp->high_seq || - (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && + !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_FRTO); @@ -1285,6 +1288,7 @@ void tcp_enter_frto(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Open); tp->frto_highmark = tp->snd_nxt; + tp->frto_counter = 1; } /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, @@ -2513,12 +2517,16 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ -static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) +static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) { struct tcp_sock *tp = tcp_sk(sk); tcp_sync_left_out(tp); + /* Duplicate the behavior from Loss state (fastretrans_alert) */ + if (flag&FLAG_DATA_ACKED) + inet_csk(sk)->icsk_retransmits = 0; + if (tp->snd_una == prior_snd_una || !before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk); @@ -2607,7 +2615,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) - tcp_process_frto(sk, prior_snd_una); + tcp_process_frto(sk, prior_snd_una, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ -- cgit From 7b0eb22b1d3b049306813a4aaa52966650f7491c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:03:35 -0800 Subject: [TCP] FRTO: Use Disorder state during operation instead of Open MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Retransmission counter assumptions are to be changed, and forcing reasons to do this exist: using the sysctl in the check would be racy as soon as FRTO starts to ignore some ACKs (done in the following patches), since userspace may disable it at any moment, giving a nice oops if the timing is right. frto_counter would be inaccessible from userspace, but with SACK-enhanced FRTO, retrans_out can include segments other than the head, possibly leaving it non-zero after a spurious RTO; boom again. Luckily, the solution seems rather simple: never go directly to the Open state, but use Disorder instead. This does not really change much, since TCP could anyway change its state to Disorder during FRTO via the path tcp_fastretrans_alert -> tcp_try_to_open (e.g., when a SACK block makes the ACK dubious). Besides, Disorder seems to be the state where TCP should be when it is not recovering (in the Recovery or Loss state) while still having some retransmissions in flight (see tcp_try_to_open), which is exactly what happens with FRTO. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller
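A minimal user-space sketch of that Disorder rationale (frto_operating_state is a hypothetical helper, not a kernel function; it only mirrors the tcp_try_to_open reasoning cited above):

#include <stdio.h>

enum ca_state { CA_OPEN, CA_DISORDER, CA_RECOVERY, CA_LOSS };

/* Sketch only: when a connection is not recovering but still has
 * retransmissions in flight, Disorder is the state tcp_try_to_open()
 * would pick anyway, so F-RTO parks the connection there instead of
 * Open. */
static enum ca_state frto_operating_state(unsigned int retrans_out)
{
	return retrans_out ? CA_DISORDER : CA_OPEN;
}

int main(void)
{
	printf("%d\n", frto_operating_state(1));	/* 1 == CA_DISORDER */
	return 0;
}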
--- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 194e880af51..e806839acdd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1286,7 +1286,8 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); - tcp_set_ca_state(sk, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Disorder); + tp->high_seq = tp->snd_nxt; tp->frto_highmark = tp->snd_nxt; tp->frto_counter = 1; } @@ -2014,8 +2015,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - if (!sysctl_tcp_frto) - BUG_TRAP(tp->retrans_out == 0); + BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { -- cgit From 6408d206c7484615ecae54bf6474a02c94e9e862 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:04:11 -0800 Subject: [TCP] FRTO: Ignore some uninteresting ACKs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This handles an RFC4138 shortcoming (in step 2); it should also have a case c) that ignores ACKs which are neither duplicates nor advance the window (opposite-direction data, window update). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e806839acdd..e990d562f5e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2495,9 +2495,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) /* F-RTO spurious RTO detection algorithm (RFC4138) * - * F-RTO affects during two new ACKs following RTO. State (ACK number) is kept - * in frto_counter. When ACK advances window (but not to or beyond highest - * sequence sent before RTO): + * F-RTO affects during two new ACKs following RTO (well, almost, see inline + * comments). State (ACK number) is kept in frto_counter. When ACK advances + * window (but not to or beyond highest sequence sent before RTO): + * On First ACK, send two new segments out. + * On Second ACK, RTO was likely spurious. Do spurious response (response * algorithm is not part of the F-RTO detection algorithm @@ -2527,6 +2527,13 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (flag&FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; + /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't + * duplicate nor advances window, e.g., opposite dir data, winupdate + */ + if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && + !(flag&FLAG_FORWARD_PROGRESS)) + return; + if (tp->snd_una == prior_snd_una || !before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk); return; } -- cgit From 95c4922bf9330eb2c71b752359dd89c4e166f3c5 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:05:18 -0800 Subject: [TCP] FRTO: fixes fallback to conventional recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FRTO detection did not care how the ACK pattern affects the cwnd calculation of the conventional recovery. This caused an incorrect setting of cwnd when the fallback became necessary. 
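As a rough illustration of the fallback rule this change introduces, here is a user-space sketch (frto_allowed_segments is a hypothetical helper; the returned values mirror the patch below):

#include <stdio.h>

/* Sketch of the segment allowance that tcp_enter_frto_loss() receives:
 * the new cwnd becomes packets-in-flight plus this many segments. */
static int frto_allowed_segments(int frto_counter, int data_acked,
				 int at_or_beyond_highmark)
{
	if (at_or_beyond_highmark)	/* snd_una reached frto_highmark */
		return frto_counter + 1;
	if (!data_acked)		/* duplicate ACK right after RTO */
		return frto_counter == 1 ? 0 : 3;
	return -1;			/* no fallback on this ACK */
}

int main(void)
{
	/* Duplicate ACK as the first ACK after RTO: no extra segments,
	 * matching what conservative conventional recovery would send. */
	printf("%d\n", frto_allowed_segments(1, 0, 0));	/* prints 0 */
	return 0;
}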
The knowledge tcp_process_frto() has about the incoming ACK is now passed on to tcp_enter_frto_loss() in allowed_segments parameter that gives the number of segments that must be added to packets-in-flight while calculating the new cwnd. Instead of snd_una we use FLAG_DATA_ACKED in duplicate ACK detection because RFC4138 states (in Section 2.2): If the first acknowledgment after the RTO retransmission does not acknowledge all of the data that was retransmitted in step 1, the TCP sender reverts to the conventional RTO recovery. Otherwise, a malicious receiver acknowledging partial segments could cause the sender to declare the timeout spurious in a case where data was lost. If the next ACK after RTO is duplicate, we do not retransmit anything, which is equal to what conservative conventional recovery does in such case. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e990d562f5e..cc935c8a6aa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1296,7 +1296,7 @@ void tcp_enter_frto(struct sock *sk) * which indicates that we should follow the traditional RTO recovery, * i.e. mark everything lost and do go-back-N retransmission. */ -static void tcp_enter_frto_loss(struct sock *sk) +static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1326,7 +1326,7 @@ static void tcp_enter_frto_loss(struct sock *sk) } tcp_sync_left_out(tp); - tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1; + tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->undo_marker = 0; @@ -2527,6 +2527,11 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (flag&FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; + if (!before(tp->snd_una, tp->frto_highmark)) { + tcp_enter_frto_loss(sk, tp->frto_counter + 1); + return; + } + /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't * duplicate nor advances window, e.g., opposite dir data, winupdate */ @@ -2534,9 +2539,8 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) !(flag&FLAG_FORWARD_PROGRESS)) return; - if (tp->snd_una == prior_snd_una || - !before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk); + if (!(flag&FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3)); return; } -- cgit From aa8b6a7ad147dfbaaf10368ff15df9418b670d8b Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:06:03 -0800 Subject: [TCP] FRTO: Response should reset also snd_cwnd_cnt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since purpose is to reduce CWND, we prevent immediate growth. This is not a major issue nor is "the correct way" specified anywhere. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cc935c8a6aa..924b2e6d7d1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2490,6 +2490,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_cnt = 0; tcp_moderate_cwnd(tp); } -- cgit From 52c63f1e86ebb18ef4b710b5b647e552a041e5ca Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:06:52 -0800 Subject: [TCP]: Don't enter to fast recovery while using FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because TCP is not in Loss state during FRTO recovery, fast recovery could be triggered by accident. Non-SACK FRTO is more robust than the not-yet-included SACK-enhanced version (which can receive a high number of duplicate ACKs with SACK blocks during FRTO), at least with unidirectional transfers, but under extraordinary patterns fast recovery can be incorrectly triggered, e.g., data loss + ACK losses => a cumulative ACK with enough SACK blocks to meet the sacked_out >= dupthresh condition. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 924b2e6d7d1..7213740477e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1547,6 +1547,10 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) { __u32 packets_out; + /* Do not perform any recovery during FRTO algorithm */ + if (tp->frto_counter) + return 0; + /* Trick#1: The loss is proven. */ if (tp->lost_out) return 1; -- cgit From 94d0ea7786714d78d7cb73144bb850254dd0bb78 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:07:27 -0800 Subject: [TCP] FRTO: frto_counter modulo-op converted to two assignments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7213740477e..9dc5754141e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2551,11 +2551,11 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (tp->frto_counter == 1) { tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + tp->frto_counter = 2; } else /* frto_counter == 2 */ { tcp_conservative_spur_to_response(tp); + tp->frto_counter = 0; } - - tp->frto_counter = (tp->frto_counter + 1) % 3; } /* This routine deals with incoming acks, but not outgoing ones. */ -- cgit From 7c9a4a5b67926dd186d427bc5b9fce6ccbde154c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:08:34 -0800 Subject: [TCP]: Prevent unrelated cwnd adjustment while using FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FRTO controls cwnd while it still processes the ACK input or has just reverted back to conventional RTO recovery; the normal rules apply once FRTO has handed control back to standard congestion control. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller
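A user-space sketch of the gating this patch adds to tcp_ack() (ack_cwnd_step and cong_avoid are hypothetical stand-ins; the real change is in the diff below):

#include <stdio.h>

static void cong_avoid(void)
{
	puts("advance cwnd");
}

/* Sketch only: tcp_process_frto() now returns nonzero while F-RTO
 * still controls cwnd, and tcp_ack() skips the normal congestion
 * avoidance step for such ACKs. */
static void ack_cwnd_step(int data_acked, int frto_cwnd)
{
	if (data_acked && !frto_cwnd)
		cong_avoid();
}

int main(void)
{
	ack_cwnd_step(1, 1);	/* F-RTO in control: cwnd untouched */
	ack_cwnd_step(1, 0);	/* normal case: cwnd advanced */
	return 0;
}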
--- net/ipv4/tcp_input.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9dc5754141e..723cee63791 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2522,7 +2522,7 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ -static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) +static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2534,7 +2534,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, tp->frto_counter + 1); - return; + return 1; } /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't @@ -2542,20 +2542,22 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) */ if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && !(flag&FLAG_FORWARD_PROGRESS)) - return; + return 1; if (!(flag&FLAG_DATA_ACKED)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3)); - return; + return 1; } if (tp->frto_counter == 1) { tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; + return 1; } else /* frto_counter == 2 */ { tcp_conservative_spur_to_response(tp); tp->frto_counter = 0; } + return 0; } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -2569,6 +2571,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 prior_in_flight; s32 seq_rtt; int prior_packets; + int frto_cwnd = 0; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -2631,15 +2634,16 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) - tcp_process_frto(sk, prior_snd_una, flag); + frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && + tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); } -- cgit From 46d0de4ed92650b95f27acae09914996bbe624e7 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:10:39 -0800 Subject: [TCP] FRTO: Entry is allowed only during (New)Reno like recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This interpretation comes from RFC4138: "If the sender implements some loss recovery algorithm other than Reno or NewReno [FHG04], the F-RTO algorithm SHOULD NOT be entered when earlier fast recovery is underway." I think the RFC means to say (especially in the light of Appendix B) that ...recovery is underway (not just fast recovery) or was underway when it was interrupted by an earlier (F-)RTO that hasn't yet been resolved (snd_una has not advanced enough). Thus, my interpretation is that whenever TCP has ever retransmitted anything other than the head, the basic version cannot be used, because the ordering assumptions on which FRTO is based then do not hold.
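A user-space sketch of that head-only rule (basic_frto_allowed and its arrays are hypothetical stand-ins for the skb flags; the real queue walk appears in the diff below):

#include <stdio.h>
#include <stddef.h>

/* Sketch only: ever_retrans[i] stands for TCPCB_RETRANS on segment i,
 * sacked[i] for TCPCB_SACKED_ACKED; index 0 is the head. */
static int basic_frto_allowed(const unsigned char *ever_retrans,
			      const unsigned char *sacked, size_t n)
{
	size_t i;

	for (i = 1; i < n; i++) {	/* the head itself may be rexmitted */
		if (ever_retrans[i])
			return 0;	/* ordering assumptions broken */
		if (!sacked[i])
			break;		/* first non-SACKed segment: done */
	}
	return 1;
}

int main(void)
{
	unsigned char retrans[] = { 1, 0, 1, 0 };
	unsigned char sacked[]  = { 0, 1, 1, 0 };

	printf("%d\n", basic_frto_allowed(retrans, sacked, 4)); /* prints 0 */
	return 0;
}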
With NewReno, only the head segment is retransmitted at a time. Therefore, walk up to the first segment that has not been SACKed; if that segment has not been retransmitted, nor anything before it, we know for sure that nothing after the non-SACKed segment has been either. This assumption is valid because TCPCB_EVER_RETRANS does not leave holes but each non-SACKed segment is rexmitted in order. The check for retrans_out > 1 avoids a more expensive walk through the skb list, as we know the result beforehand: F-RTO will not be allowed. A SACKed skb can turn into a non-SACKed one only in the extremely rare case of SACK reneging; in this case we might fail to detect retransmissions if there were any for segments other than the head. To get rid of that problem, the whole rexmit queue would have to be walked (always), or FRTO would have to be prevented when SACK reneging happens. Of course, the RTO should still trigger after reneging, which makes this issue even less likely to show up. And as long as the response is as conservative as it is now, nothing bad happens even then. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 723cee63791..a283fc12186 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1239,14 +1239,31 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* F-RTO can only be used if these conditions are satisfied: * - there must be some unsent new data * - the advertised window should allow sending it + * - TCP has never retransmitted anything other than head */ -int tcp_use_frto(const struct sock *sk) +int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!sysctl_tcp_frto || !sk->sk_send_head || + after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + tp->snd_una + tp->snd_wnd)) + return 0; - return (sysctl_tcp_frto && sk->sk_send_head && - !after(TCP_SKB_CB(sk->sk_send_head)->end_seq, - tp->snd_una + tp->snd_wnd)); + /* Avoid expensive walking of rexmit queue if possible */ + if (tp->retrans_out > 1) + return 0; + + skb = skb_peek(&sk->sk_write_queue)->next; /* Skips head */ + sk_stream_for_retrans_queue_from(skb, sk) { + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + return 0; + /* Short-circuit when first non-SACKed skb has been checked */ + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + break; + } + return 1; } /* RTO occurred, but do not yet enter -- cgit From d1a54c6a0a3f9c2c4ef71982d89b8571bd9eaa51 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:11:57 -0800 Subject: [TCP] FRTO: Reverse RETRANS bit clearing logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously RETRANS bits were cleared on the entry to FRTO. We postpone that into tcp_enter_frto_loss, which is really the place where the clearing should be done anyway. This allows simplification of the logic from a clearing loop to the head skb clearing only. Besides, the other changes made in the previous patches to tcp_use_frto made it impossible for the non-SACKed FRTO to be entered if anything other than the head has been rexmitted. With SACK-enhanced FRTO (and Appendix B), however, there can be a number of retransmissions in flight when RTO expires (the same thing could also happen with non-SACK FRTO before this patchset). 
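In rough terms, the entry-time side of this change looks like the following user-space sketch (hypothetical types; only the head segment's SACKED_RETRANS bit is cleared, with retrans_out kept consistent, as the diff below does):

#include <stdio.h>

#define TCPCB_SACKED_RETRANS 0x02	/* illustrative value only */

struct seg {
	unsigned char sacked;
	unsigned int pcount;
};

/* Sketch only: clear SACKED_RETRANS on the head and decrement
 * retrans_out to match; the remaining bits are left for
 * tcp_enter_frto_loss(). */
static void frto_clear_head_retrans(struct seg *head,
				    unsigned int *retrans_out)
{
	if (head->sacked & TCPCB_SACKED_RETRANS) {
		head->sacked &= ~TCPCB_SACKED_RETRANS;
		*retrans_out -= head->pcount;
	}
}

int main(void)
{
	struct seg head = { TCPCB_SACKED_RETRANS, 2 };
	unsigned int retrans_out = 3;

	frto_clear_head_retrans(&head, &retrans_out);
	printf("retrans_out=%u\n", retrans_out);	/* prints 1 */
	return 0;
}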
To not introduce any jumpiness into the packet counting during FRTO, instead of clearing RETRANS bits from skbs during entry, do it later on. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a283fc12186..3ef7e9e0796 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1268,7 +1268,11 @@ int tcp_use_frto(struct sock *sk) /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO * recovery a bit and use heuristics in tcp_process_frto() to detect if - * the RTO was spurious. + * the RTO was spurious. Only clear SACKED_RETRANS of the head here to + * keep retrans_out counting accurate (with SACK F-RTO, other than head + * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS + * bits are handled if the Loss state is really to be entered (in + * tcp_enter_frto_loss). * * Do like tcp_enter_loss() would; when RTO expires the second time it * does: @@ -1289,17 +1293,13 @@ void tcp_enter_frto(struct sock *sk) tcp_ca_event(sk, CA_EVENT_FRTO); } - /* Have to clear retransmission markers here to keep the bookkeeping - * in shape, even though we are not yet in Loss state. - * If something was really lost, it is eventually caught up - * in tcp_enter_frto_loss. - */ - tp->retrans_out = 0; tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; - sk_stream_for_retrans_queue(skb, sk) { + skb = skb_peek(&sk->sk_write_queue); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); } tcp_sync_left_out(tp); @@ -1313,7 +1313,7 @@ void tcp_enter_frto(struct sock *sk) * which indicates that we should follow the traditional RTO recovery, * i.e. mark everything lost and do go-back-N retransmission. */ -static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments) +static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1322,10 +1322,21 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments) tp->sacked_out = 0; tp->lost_out = 0; tp->fackets_out = 0; + tp->retrans_out = 0; sk_stream_for_retrans_queue(skb, sk) { cnt += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + /* + * Count the retransmission made on RTO correctly (only when + * waiting for the first ACK and did not get it)... + */ + if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { + tp->retrans_out += tcp_skb_pcount(skb); + /* ...enter this if branch just for the first segment */ + flag |= FLAG_DATA_ACKED; + } else { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + } if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { /* Do not mark those segments lost that were @@ -2550,7 +2561,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) inet_csk(sk)->icsk_retransmits = 0; if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1); + tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); return 1; } @@ -2562,7 +2573,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; if (!(flag&FLAG_DATA_ACKED)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3)); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
0 : 3), flag); return 1; } -- cgit From 66e93e45c09affa407750cc06398492e8b897848 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:13:47 -0800 Subject: [TCP] FRTO: Fake cwnd for ssthresh callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCP without FRTO would be in Loss state with small cwnd. FRTO, however, leaves cwnd (typically) to a larger value which causes ssthresh to become too large in case RTO is triggered again compared to what conventional recovery would do. Because consecutive RTOs result in only a single ssthresh reduction, RTO+cumulative ACK+RTO pattern is required to trigger this event. A large comment is included for congestion control module writers trying to figure out what CA_EVENT_FRTO handler should do because there exists a remote possibility of incompatibility between FRTO and module defined ssthresh functions. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ef7e9e0796..055721d8495 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1289,7 +1289,31 @@ void tcp_enter_frto(struct sock *sk) ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + /* Our state is too optimistic in ssthresh() call because cwnd + * is not reduced until tcp_enter_frto_loss() when previous FRTO + * recovery has not yet completed. Pattern would be this: RTO, + * Cumulative ACK, RTO (2xRTO for the same segment does not end + * up here twice). + * RFC4138 should be more specific on what to do, even though + * RTO is quite unlikely to occur after the first Cumulative ACK + * due to back-off and complexity of triggering events ... + */ + if (tp->frto_counter) { + u32 stored_cwnd; + stored_cwnd = tp->snd_cwnd; + tp->snd_cwnd = 2; + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = stored_cwnd; + } else { + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + } + /* ... in theory, cong.control module could do "any tricks" in + * ssthresh(), which means that ca_state, lost bits and lost_out + * counter would have to be faked before the call occurs. We + * consider that too expensive, unlikely and hacky, so modules + * using these in ssthresh() must deal these incompatibility + * issues if they receives CA_EVENT_FRTO and frto_counter != 0 + */ tcp_ca_event(sk, CA_EVENT_FRTO); } -- cgit From 288035f915686a9a9e85e0358c5392bb5d7ae58d Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:14:42 -0800 Subject: [TCP]: Prevent reordering adjustments during FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be honest, I'm not too sure how the reord stuff works in the first place but this seems necessary. When FRTO has been active, the one and only retransmission could be unnecessary but the state and sending order might not be what the sacktag code expects it to be (to work correctly). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 055721d8495..df516d4eca9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1224,7 +1224,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + (tp->frto_highmark && after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 -- cgit From 4dc2665e3634d720a62bd27128fc8781fcdad2dc Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 21 Feb 2007 23:16:11 -0800 Subject: [TCP]: SACK enhanced FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the SACK-enhanced FRTO given in RFC4138 using the variant given in Appendix B. RFC4138, Appendix B: "This means that in order to declare timeout spurious, the TCP sender must receive an acknowledgment for non-retransmitted segment between SND.UNA and RecoveryPoint in algorithm step 3. RecoveryPoint is defined in conservative SACK-recovery algorithm [RFC3517]" The basic version of the FRTO algorithm can still be used when SACK is enabled. To enable the SACK-enhanced version, the tcp_frto sysctl is set to 2. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 11 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index df516d4eca9..bb3f234668b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -110,6 +111,8 @@ int sysctl_tcp_abc __read_mostly; #define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) #define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) +#define IsSackFrto() (sysctl_tcp_frto == 0x2) + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) /* Adapt the MSS value used to make delayed ack decision to the @@ -1159,6 +1162,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* clear lost hint */ tp->retransmit_skb_hint = NULL; } + /* SACK enhanced F-RTO detection. + * Set flag if and only if non-rexmitted + * segments below frto_highmark are + * SACKed (RFC4138; Appendix B). 
+ * Clearing correct due to in-order walk + */ + if (after(end_seq, tp->frto_highmark)) { + flag &= ~FLAG_ONLY_ORIG_SACKED; + } else { + if (!(sacked & TCPCB_RETRANS)) + flag |= FLAG_ONLY_ORIG_SACKED; + } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; @@ -1240,7 +1255,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* F-RTO can only be used if these conditions are satisfied: * - there must be some unsent new data * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head + * - TCP has never retransmitted anything other than head (SACK enhanced + * variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { @@ -1252,6 +1268,9 @@ int tcp_use_frto(struct sock *sk) tp->snd_una + tp->snd_wnd)) return 0; + if (IsSackFrto()) + return 1; + /* Avoid expensive walking of rexmit queue if possible */ if (tp->retrans_out > 1) return 0; @@ -1328,9 +1347,18 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); + /* Earlier loss recovery underway (see RFC4138; Appendix B). + * The last condition is necessary at least in tp->frto_counter case. + */ + if (IsSackFrto() && (tp->frto_counter || + ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && + after(tp->high_seq, tp->snd_una)) { + tp->frto_highmark = tp->high_seq; + } else { + tp->frto_highmark = tp->snd_nxt; + } tcp_set_ca_state(sk, TCP_CA_Disorder); tp->high_seq = tp->snd_nxt; - tp->frto_highmark = tp->snd_nxt; tp->frto_counter = 1; } @@ -2566,6 +2594,10 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. * + * SACK version: + * on first step, wait until first cumulative ACK arrives, then move to + * the second step. In second step, the next ACK decides. + * * F-RTO is implemented (mainly) in four functions: * - tcp_use_frto() is used to determine if TCP is can use F-RTO * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is @@ -2590,16 +2622,38 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't - * duplicate nor advances window, e.g., opposite dir data, winupdate - */ - if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && - !(flag&FLAG_FORWARD_PROGRESS)) - return 1; + if (!IsSackFrto() || IsReno(tp)) { + /* RFC4138 shortcoming in step 2; should also have case c): + * ACK isn't duplicate nor advances window, e.g., opposite dir + * data, winupdate + */ + if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && + !(flag&FLAG_FORWARD_PROGRESS)) + return 1; - if (!(flag&FLAG_DATA_ACKED)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), flag); - return 1; + if (!(flag&FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), + flag); + return 1; + } + } else { + if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + /* Prevent sending of new data. 
*/ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)); + return 1; + } + + if ((tp->frto_counter == 2) && + (!(flag&FLAG_FORWARD_PROGRESS) || + ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { + /* RFC4138 shortcoming (see comment above) */ + if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP)) + return 1; + + tcp_enter_frto_loss(sk, 3, flag); + return 1; + } } if (tp->frto_counter == 1) { -- cgit From c5e7af0df5d7234afd8596560d9f570cfc6c18bf Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 23 Feb 2007 16:22:06 -0800 Subject: [TCP]: Correct reordering detection change (no FRTO case) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reordering detection must work also when FRTO has not been used at all which was the original intention of mine, just the expression of the idea was flawed. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bb3f234668b..f6ba07f0d81 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1240,7 +1240,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && - (tp->frto_highmark && after(tp->snd_una, tp->frto_highmark))) + (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 -- cgit From 3cfe3baaf07c9e40a75f9a70662de56df1c246a8 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 27 Feb 2007 10:09:49 -0800 Subject: [TCP]: Add two new spurious RTO responses to FRTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New sysctl tcp_frto_response is added to select amongst these responses: - Rate halving based; reuses CA_CWR state (default) - Very conservative; used to be the only one available (=1) - Undo cwr; undoes ssthresh and cwnd reductions (=2) The response with rate halving requires a new parameter to tcp_enter_cwr because FRTO has already reduced ssthresh and doing a second reduction there has to be prevented. In addition, to keep things nice on 80 cols screen, a local variable was added. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f6ba07f0d81..322e43c5646 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -86,6 +86,7 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; @@ -762,15 +763,17 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) } /* Set slow start threshold and cwnd not falling to slow start */ -void tcp_enter_cwr(struct sock *sk) +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); tp->prior_ssthresh = 0; tp->bytes_acked = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + if (set_ssthresh) + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -2003,7 +2006,7 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(sk); + tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; @@ -2579,6 +2582,21 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) tcp_moderate_cwnd(tp); } +/* A conservative spurious RTO response algorithm: reduce cwnd using + * rate halving and continue in congestion avoidance. + */ +static void tcp_ratehalving_spur_to_response(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + tcp_enter_cwr(sk, 0); + tp->high_seq = tp->frto_highmark; /* Smoother w/o this? - ij */ +} + +static void tcp_undo_spur_to_response(struct sock *sk) +{ + tcp_undo_cwr(sk, 1); +} + /* F-RTO spurious RTO detection algorithm (RFC4138) * * F-RTO affects during two new ACKs following RTO (well, almost, see inline @@ -2661,7 +2679,17 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) tp->frto_counter = 2; return 1; } else /* frto_counter == 2 */ { - tcp_conservative_spur_to_response(tp); + switch (sysctl_tcp_frto_response) { + case 2: + tcp_undo_spur_to_response(sk); + break; + case 1: + tcp_conservative_spur_to_response(tp); + break; + default: + tcp_ratehalving_spur_to_response(sk); + break; + }; tp->frto_counter = 0; } return 0; -- cgit From e01f9d7793be82e6c252efbd52c399d3eb65abe4 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 2 Mar 2007 13:27:25 -0800 Subject: [TCP]: Complete icsk-to-local-variable change (in tcp_enter_cwr) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A local variable for icsk was created but this change was missing. Spotted by Jarek Poplawski. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 322e43c5646..cb715eadf8f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -770,7 +770,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) tp->prior_ssthresh = 0; tp->bytes_acked = 0; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (icsk->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; if (set_ssthresh) tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); -- cgit From e317f6f69cb95527799d308a9421b7dc1252989a Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 2 Mar 2007 13:34:19 -0800 Subject: [TCP]: FRTO undo response falls back to ratehalving one if ECEd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Undoing ssthresh is disabled in fastretrans_alert whenever FLAG_ECE is set by clearing prior_ssthresh. The clearing does not protect FRTO because FRTO operates before fastretrans_alert. Moving the clearing of prior_ssthresh earlier seems to be a suboptimal solution to the FRTO case because then FLAG_ECE will cause a second ssthresh reduction in try_to_open (the first occurred when FRTO was entered). So instead, FRTO falls back immediately to the rate halving response, which switches TCP to CA_CWR state preventing the latter reduction of ssthresh. If the first ECE arrived before the ACK after which FRTO is able to decide RTO as spurious, prior_ssthresh is already cleared. Thus no undoing for ssthresh occurs. Besides, FLAG_ECE should be set also in the following ACKs resulting in rate halving response that sees TCP is already in CA_CWR, which again prevents an extra ssthresh reduction on that round-trip. If the first ECE arrived before RTO, ssthresh has already been adapted and prior_ssthresh remains cleared on entry because TCP is in CA_CWR (the same applies also to a case where FRTO is entered more than once and ECE comes in the middle). High_seq must not be touched after tcp_enter_cwr because CWR round-trip calculation depends on it. I believe that after this patch, FRTO should be ECN-safe and even able to take advantage of synergy benefits. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cb715eadf8f..d894bbcc1d2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2587,14 +2587,15 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) */ static void tcp_ratehalving_spur_to_response(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); tcp_enter_cwr(sk, 0); - tp->high_seq = tp->frto_highmark; /* Smoother w/o this? 
- ij */ } -static void tcp_undo_spur_to_response(struct sock *sk) +static void tcp_undo_spur_to_response(struct sock *sk, int flag) { - tcp_undo_cwr(sk, 1); + if (flag&FLAG_ECE) + tcp_ratehalving_spur_to_response(sk); + else + tcp_undo_cwr(sk, 1); } /* F-RTO spurious RTO detection algorithm (RFC4138) @@ -2681,7 +2682,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) } else /* frto_counter == 2 */ { switch (sysctl_tcp_frto_response) { case 2: - tcp_undo_spur_to_response(sk); + tcp_undo_spur_to_response(sk, flag); break; case 1: tcp_conservative_spur_to_response(tp); -- cgit From 9d729f72dca9406025bcfa9c1f660d71d9ef0ff5 Mon Sep 17 00:00:00 2001 From: James Morris Date: Sun, 4 Mar 2007 16:12:44 -0800 Subject: [NET]: Convert xtime.tv_sec to get_seconds() Where appropriate, convert references to xtime.tv_sec to the get_seconds() helper function. Signed-off-by: James Morris Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d894bbcc1d2..d0a3630f41a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2933,7 +2933,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = xtime.tv_sec; + tp->rx_opt.ts_recent_stamp = get_seconds(); } static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -2947,7 +2947,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) */ if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || - xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) tcp_store_ts_recent(tp); } } @@ -2999,7 +2999,7 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff * { const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && - xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && + get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && !tcp_disordered_ack(sk, skb)); } -- cgit From fe067e8ab5e0dc5ca3c54634924c628da92090b4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 7 Mar 2007 12:12:44 -0800 Subject: [TCP]: Abstract out all write queue operations. This allows the write queue implementation to be changed, for example, to one which allows fast interval searching. Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 64 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d0a3630f41a..22d0bb03c5d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1044,7 +1044,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cached_skb = tp->fastpath_skb_hint; cached_fack_count = tp->fastpath_cnt_hint; if (!cached_skb) { - cached_skb = sk->sk_write_queue.next; + cached_skb = tcp_write_queue_head(sk); cached_fack_count = 0; } @@ -1061,10 +1061,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + if (skb == tcp_send_head(sk)) + break; + cached_skb = skb; cached_fack_count = fack_count; if (i == first_sack_index) { @@ -1213,7 +1216,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) break; if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) @@ -1266,8 +1271,8 @@ int tcp_use_frto(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!sysctl_tcp_frto || !sk->sk_send_head || - after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + if (!sysctl_tcp_frto || !tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) return 0; @@ -1278,8 +1283,11 @@ int tcp_use_frto(struct sock *sk) if (tp->retrans_out > 1) return 0; - skb = skb_peek(&sk->sk_write_queue)->next; /* Skips head */ - sk_stream_for_retrans_queue_from(skb, sk) { + skb = tcp_write_queue_head(sk); + skb = tcp_write_queue_next(sk, skb); /* Skips head */ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) return 0; /* Short-circuit when first non-SACKed skb has been checked */ @@ -1343,7 +1351,7 @@ void tcp_enter_frto(struct sock *sk) tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; - skb = skb_peek(&sk->sk_write_queue); + skb = tcp_write_queue_head(sk); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1380,7 +1388,9 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->fackets_out = 0; tp->retrans_out = 0; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); /* * Count the retransmission made on RTO correctly (only when @@ -1468,7 +1478,9 @@ void tcp_enter_loss(struct sock *sk, int how) if (!how) tp->undo_marker = tp->snd_una; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) tp->undo_marker = 0; @@ -1503,14 +1515,14 @@ static int tcp_check_sack_reneging(struct sock *sk) * receiver _host_ is heavily congested (or buggy). * Do processing similar to RTO timeout. 
*/ - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && + if ((skb = tcp_write_queue_head(sk)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); return 1; @@ -1531,7 +1543,7 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) { return tp->packets_out && - tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1726,11 +1738,13 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; } else { - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); cnt = 0; } - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -1777,9 +1791,11 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint - : sk->sk_write_queue.next; + : tcp_write_queue_head(sk); - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (!tcp_skb_timedout(sk, skb)) break; @@ -1970,7 +1986,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) { if (tcp_may_undo(tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } @@ -2382,8 +2400,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) = icsk->icsk_ca_ops->rtt_sample; struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; - while ((skb = skb_peek(&sk->sk_write_queue)) && - skb != sk->sk_send_head) { + while ((skb = tcp_write_queue_head(sk)) && + skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u8 sacked = scb->sacked; @@ -2446,7 +2464,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, &sk->sk_write_queue); + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); clear_all_retrans_hints(tp); } @@ -2495,7 +2513,7 @@ static void tcp_ack_probe(struct sock *sk) /* Was it a usable window open? */ - if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); @@ -2795,7 +2813,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (sk->sk_send_head) + if (tcp_send_head(sk)) tcp_ack_probe(sk); return 1; -- cgit From 2de979bd7da9c8b39cc0aabb0ab5aa1516d929eb Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 8 Mar 2007 20:45:19 -0800 Subject: [TCP]: whitespace cleanup Add whitespace around keywords. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 57 ++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 22d0bb03c5d..fb025608594 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -578,7 +578,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if(m == 0) + if (m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ @@ -1758,12 +1758,11 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* clear xmit_retransmit_queue hints * if this is beyond hint */ - if(tp->retransmit_skb_hint != NULL && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { - + if (tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = NULL; - } + } } tcp_sync_left_out(tp); @@ -2441,7 +2440,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) if (sacked) { if (sacked & TCPCB_RETRANS) { - if(sacked & TCPCB_SACKED_RETRANS) + if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; @@ -2840,7 +2839,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; - while(length>0) { + while (length > 0) { int opcode=*ptr++; int opsize; @@ -2856,9 +2855,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, return; if (opsize > length) return; /* don't parse partial options */ - switch(opcode) { + switch (opcode) { case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn && !estab) { + if (opsize==TCPOLEN_MSS && th->syn && !estab) { u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); if (in_mss) { if (opt_rx->user_mss && opt_rx->user_mss < in_mss) @@ -2868,12 +2867,12 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn && !estab) + if (opsize==TCPOLEN_WINDOW && th->syn && !estab) if (sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *) ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { - if(net_ratelimit()) + if (net_ratelimit()) printk(KERN_INFO "tcp_parse_options: Illegal window " "scaling value %d >14 received.\n", snd_wscale); @@ -2883,7 +2882,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { + if (opsize==TCPOLEN_TIMESTAMP) { if ((estab && opt_rx->tstamp_ok) || (!estab && sysctl_tcp_timestamps)) { opt_rx->saw_tstamp = 1; @@ -2893,7 +2892,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { + if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { if (sysctl_tcp_sack) { opt_rx->sack_ok = 1; tcp_sack_reset(opt_rx); @@ -2902,7 +2901,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_SACK: - if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; @@ 
-2964,7 +2963,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. */ - if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) tcp_store_ts_recent(tp); } @@ -3223,7 +3222,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) */ tp->rx_opt.num_sacks--; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); - for(i=this_sack; i < tp->rx_opt.num_sacks; i++) + for (i=this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i+1]; continue; } @@ -3276,7 +3275,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.num_sacks--; sp--; } - for(; this_sack > 0; this_sack--, sp--) + for (; this_sack > 0; this_sack--, sp--) *sp = *(sp-1); new_sack: @@ -3302,7 +3301,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) return; } - for(this_sack = 0; this_sack < num_sacks; ) { + for (this_sack = 0; this_sack < num_sacks; ) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; @@ -3358,7 +3357,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) + if (skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); } } @@ -3424,9 +3423,9 @@ queue_and_out: __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->len) + if (skb->len) tcp_event_data_recv(sk, tp, skb); - if(th->fin) + if (th->fin) tcp_fin(skb, sk, th); if (!skb_queue_empty(&tp->out_of_order_queue)) { @@ -4323,7 +4322,7 @@ slow_path: goto discard; } - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4338,7 +4337,7 @@ slow_path: } step5: - if(th->ack) + if (th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); tcp_rcv_rtt_measure_ts(sk, skb); @@ -4626,13 +4625,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_LISTEN: - if(th->ack) + if (th->ack) return 1; - if(th->rst) + if (th->rst) goto discard; - if(th->syn) { + if (th->syn) { if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; @@ -4688,7 +4687,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } /* step 2: check RST bit */ - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4711,7 +4710,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (th->ack) { int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); - switch(sk->sk_state) { + switch (sk->sk_state) { case TCP_SYN_RECV: if (acceptable) { tp->copied_seq = tp->rcv_nxt; -- cgit From c51957dafa6f960c5c6372aa3da6c8fa71c13730 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 10 Mar 2007 12:47:22 -0300 Subject: [TCP]: Do the layer header setting in tcp_collapse relative to skb->data That is equal to skb->head before skb_reserve, to help in the layer header changes. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb025608594..e5d1c2c8cea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3632,11 +3632,13 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, nskb = alloc_skb(copy+header, GFP_ATOMIC); if (!nskb) return; + + nskb->mac.raw = nskb->data + (skb->mac.raw - skb->head); + nskb->nh.raw = nskb->data + (skb->nh.raw - skb->head); + nskb->h.raw = nskb->data + (skb->h.raw - skb->head); + skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); - nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); - nskb->h.raw = nskb->head + (skb->h.raw-skb->head); - nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_insert(nskb, skb->prev, skb, list); -- cgit From 31713c333ddbb66d694829082620b69b71c4b09a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 10 Mar 2007 12:48:37 -0300 Subject: [TCP]: Use skb_set_mac_header in tcp_collapse Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e5d1c2c8cea..1ec05bd673a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3633,7 +3633,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, if (!nskb) return; - nskb->mac.raw = nskb->data + (skb->mac.raw - skb->head); + skb_set_mac_header(nskb, skb->mac.raw - skb->head); nskb->nh.raw = nskb->data + (skb->nh.raw - skb->head); nskb->h.raw = nskb->data + (skb->h.raw - skb->head); -- cgit From 98e399f82ab3a6d863d1d4a7ea48925cc91c830e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 19 Mar 2007 15:33:04 -0700 Subject: [SK_BUFF]: Introduce skb_mac_header() For the places where we need a pointer to the mac header, it is still legal to touch skb->mac.raw directly if just adding to, subtracting from or setting it to another layer header. This one also converts some more cases to skb_reset_mac_header() that my regex missed as it had no spaces before nor after '=', ugh. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1ec05bd673a..f5e019cefc1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3633,7 +3633,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, if (!nskb) return; - skb_set_mac_header(nskb, skb->mac.raw - skb->head); + skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); nskb->nh.raw = nskb->data + (skb->nh.raw - skb->head); nskb->h.raw = nskb->data + (skb->h.raw - skb->head); -- cgit From d56f90a7c96da5187f0cdf07ee7434fe6aa78bbc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Apr 2007 20:50:43 -0700 Subject: [SK_BUFF]: Introduce skb_network_header() For the places where we need a pointer to the network header, it is still legal to touch skb->nh.raw directly if just adding to, subtracting from or setting it to another layer header. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f5e019cefc1..00190835cea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3634,7 +3634,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, return; skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - nskb->nh.raw = nskb->data + (skb->nh.raw - skb->head); + nskb->nh.raw = nskb->data + (skb_network_header(skb) - skb->head); nskb->h.raw = nskb->data + (skb->h.raw - skb->head); skb_reserve(nskb, header); -- cgit From c14d2450cb7fe1786e2ec325172baf66922bf597 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 11 Mar 2007 22:39:41 -0300 Subject: [SK_BUFF]: Introduce skb_set_network_header For the cases where the network header is being set to an offset from skb->data. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 00190835cea..5da823a3225 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3634,7 +3634,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, return; skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - nskb->nh.raw = nskb->data + (skb_network_header(skb) - skb->head); + skb_set_network_header(nskb, + skb_network_header(skb) - skb->head); nskb->h.raw = nskb->data + (skb->h.raw - skb->head); skb_reserve(nskb, header); -- cgit From 967b05f64e27d04a4c8879addd0e1c52137e2c9e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Mar 2007 13:51:52 -0300 Subject: [SK_BUFF]: Introduce skb_set_transport_header For the cases where the transport header is being set to an offset from skb->data. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5da823a3225..2776a8b0133 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3636,7 +3636,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); skb_set_network_header(nskb, skb_network_header(skb) - skb->head); - nskb->h.raw = nskb->data + (skb->h.raw - skb->head); + skb_set_transport_header(nskb, skb->h.raw - skb->head); skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); -- cgit From aa8223c7bb0b05183e1737881ed21827aa5b9e73 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Apr 2007 21:04:22 -0700 Subject: [SK_BUFF]: Introduce tcp_hdr(), remove skb->h.th Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2776a8b0133..c1ce3623738 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -148,7 +148,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && - !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { + !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. 
* Resulting "len" is MSS free of SACK jitter. @@ -2559,9 +2559,9 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb, u32 ack, u32 ack_seq) { int flag = 0; - u32 nwin = ntohs(skb->h.th->window); + u32 nwin = ntohs(tcp_hdr(skb)->window); - if (likely(!skb->h.th->syn)) + if (likely(!tcp_hdr(skb)->syn)) nwin <<= tp->rx_opt.snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { @@ -2766,7 +2766,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; tcp_ca_event(sk, CA_EVENT_SLOW_ACK); @@ -2833,7 +2833,7 @@ uninteresting_ack: void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) { unsigned char *ptr; - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); int length=(th->doff*4)-sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); @@ -2995,7 +2995,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -3357,8 +3357,8 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (skb->h.th->fin) - tcp_fin(skb, sk, skb->h.th); + if (tcp_hdr(skb)->fin) + tcp_fin(skb, sk, tcp_hdr(skb)); } } @@ -3366,7 +3366,7 @@ static int tcp_prune_queue(struct sock *sk); static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; @@ -3605,7 +3605,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, * - bloated or contains data before "start" or * overlaps to the next one. 
*/ - if (!skb->h.th->syn && !skb->h.th->fin && + if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start) || (skb->next != tail && @@ -3616,7 +3616,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, start = TCP_SKB_CB(skb)->end_seq; skb = skb->next; } - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; while (before(start, end)) { @@ -3665,7 +3665,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || + tcp_hdr(skb)->syn || + tcp_hdr(skb)->fin) return; } } @@ -4072,7 +4074,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen tcp_rcv_space_adjust(sk); if ((tp->ucopy.len == 0) || - (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; sk->sk_data_ready(sk, 0); -- cgit From 9c70220b73908f64792422a2c39c593c4792f2c5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 25 Apr 2007 18:04:18 -0700 Subject: [SK_BUFF]: Introduce skb_transport_header(skb) For the places where we need a pointer to the transport header, it is still legal to touch skb->h.raw directly if just adding to, subtracting from or setting it to another layer header. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c1ce3623738..9c3b4c7a50a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -140,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * * "len" is invariant segment length, including TCP header. */ - len += skb->data - skb->h.raw; + len += skb->data - skb_transport_header(skb); if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. 
@@ -940,7 +940,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; + unsigned char *ptr = (skb_transport_header(ack_skb) + + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; @@ -3634,10 +3635,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, return; skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - skb_set_network_header(nskb, - skb_network_header(skb) - skb->head); - skb_set_transport_header(nskb, skb->h.raw - skb->head); - + skb_set_network_header(nskb, (skb_network_header(skb) - + skb->head)); + skb_set_transport_header(nskb, (skb_transport_header(skb) - + skb->head)); skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); -- cgit From 604763722c655c7e3f31ecf6f7b4dafcd26a7a15 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 9 Apr 2007 11:59:39 -0700 Subject: [NET]: Treat CHECKSUM_PARTIAL as CHECKSUM_UNNECESSARY When a transmitted packet is looped back directly, CHECKSUM_PARTIAL maps to the semantics of CHECKSUM_UNNECESSARY. Therefore we should treat it as such in the stack. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c3b4c7a50a..d1604f59d77 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4009,7 +4009,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) int err; local_bh_enable(); - if (skb->ip_summed==CHECKSUM_UNNECESSARY) + if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); else err = skb_copy_and_csum_datagram_iovec(skb, hlen, @@ -4041,7 +4041,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { - return skb->ip_summed != CHECKSUM_UNNECESSARY && + return !skb_csum_unnecessary(skb) && __tcp_checksum_complete_user(sk, skb); } @@ -4059,7 +4059,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = get_softnet_dma(); - if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); -- cgit From 3ff50b7997fe06cd5d276b229967bb52d6b3b6c1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 20 Apr 2007 17:09:22 -0700 Subject: [NET]: cleanup extra semicolons Spring cleaning time... There seems to be a lot of places in the network code that have extra bogus semicolons after conditionals. Most commonly is a bogus semicolon after: switch() { } Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d1604f59d77..2fbfc2e4209 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2708,7 +2708,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) default: tcp_ratehalving_spur_to_response(sk); break; - }; + } tp->frto_counter = 0; } return 0; @@ -2915,10 +2915,11 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif - }; + } + ptr+=opsize-2; length-=opsize; - }; + } } } @@ -3124,7 +3125,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", __FUNCTION__, sk->sk_state); break; - }; + } /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. -- cgit From 9e412ba7632f71259a53085665d4983b78257b7c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 20 Apr 2007 22:18:02 -0700 Subject: [TCP]: Sed magic converts func(sk, tp, ...) -> func(sk, ...) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is (mostly) automated change using magic: sed -e '/struct sock \*sk/ N' -e '/struct sock \*sk/ N' -e '/struct sock \*sk/ N' -e '/struct sock \*sk/ N' -e 's|struct sock \*sk,[\n\t ]*struct tcp_sock \*tp\([^{]*\n{\n\)| struct sock \*sk\1\tstruct tcp_sock *tp = tcp_sk(sk);\n|g' -e 's|struct sock \*sk, struct tcp_sock \*tp| struct sock \*sk|g' -e 's|sk, tp\([^-]\)|sk\1|g' Fixed four unused variable (tp) warnings that were introduced. In addition, manually added newlines after local variables and tweaked function arguments positioning. $ gcc --version gcc (GCC) 4.1.1 20060525 (Red Hat 4.1.1-1) ... $ codiff -fV built-in.o.old built-in.o.new net/ipv4/route.c: rt_cache_flush | +14 1 function changed, 14 bytes added net/ipv4/tcp.c: tcp_setsockopt | -5 tcp_sendpage | -25 tcp_sendmsg | -16 3 functions changed, 46 bytes removed net/ipv4/tcp_input.c: tcp_try_undo_recovery | +3 tcp_try_undo_dsack | +2 tcp_mark_head_lost | -12 tcp_ack | -15 tcp_event_data_recv | -32 tcp_rcv_state_process | -10 tcp_rcv_established | +1 7 functions changed, 6 bytes added, 69 bytes removed, diff: -63 net/ipv4/tcp_output.c: update_send_head | -9 tcp_transmit_skb | +19 tcp_cwnd_validate | +1 tcp_write_wakeup | -17 __tcp_push_pending_frames | -25 tcp_push_one | -8 tcp_send_fin | -4 7 functions changed, 20 bytes added, 63 bytes removed, diff: -43 built-in.o.new: 18 functions changed, 40 bytes added, 178 bytes removed, diff: -138 Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 145 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 59 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2fbfc2e4209..63338939078 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -235,9 +235,9 @@ static void tcp_fixup_sndbuf(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, - const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! 
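 * (Editor's aside: beyond this hunk, the function halves truesize and
 * window in lock-step, allowing rcv_ssthresh to grow only if payload
 * still dominates the skb's true memory cost at some scale.)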
*/ int truesize = tcp_win_from_space(skb->truesize)/2; int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; @@ -252,9 +252,11 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && @@ -267,7 +269,7 @@ static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, if (tcp_win_from_space(skb->truesize) <= skb->len) incr = 2*tp->advmss; else - incr = __tcp_grow_window(sk, tp, skb); + incr = __tcp_grow_window(sk, skb); if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); @@ -330,8 +332,9 @@ static void tcp_init_buffer_space(struct sock *sk) } /* 5. Recalculate window clamp after socket hit its memory bounds. */ -static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) +static void tcp_clamp_window(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ack.quick = 0; @@ -503,8 +506,9 @@ new_measure: * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; @@ -545,7 +549,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ TCP_ECN_check_ce(tp, skb); if (skb->len >= 128) - tcp_grow_window(sk, tp, skb); + tcp_grow_window(sk, skb); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -1541,8 +1545,10 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } -static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) +static inline int tcp_head_timedout(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + return tp->packets_out && tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } @@ -1640,8 +1646,9 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) +static int tcp_time_to_recover(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; /* Do not perform any recovery during FRTO algorithm */ @@ -1659,7 +1666,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ - if (tcp_head_timedout(sk, tp)) + if (tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -1668,7 +1675,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) packets_out = tp->packets_out; if (packets_out <= tp->reordering && tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && - !tcp_may_send_now(sk, tp)) { + !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. */ @@ -1708,8 +1715,10 @@ static void tcp_add_reno_sack(struct sock *sk) /* Account for ACK, ACKing some data in Reno Recovery phase. 
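 * (Editor's aside: without peer SACK support, sacked_out is emulated
 * from duplicate ACKs, so a cumulative ACK that fills one hole consumes
 * one emulated SACK and the remaining acked segments count as dupacks.)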
*/ -static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); + if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ if (acked-1 >= tp->sacked_out) @@ -1728,9 +1737,10 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) } /* Mark head of queue up as lost. */ -static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, +static void tcp_mark_head_lost(struct sock *sk, int packets, u32 high_seq) { + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt; @@ -1771,15 +1781,17 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* Account newly detected lost packet(s) */ -static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) +static void tcp_update_scoreboard(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (IsFack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + tcp_mark_head_lost(sk, lost, tp->high_seq); } else { - tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + tcp_mark_head_lost(sk, 1, tp->high_seq); } /* New heuristics: it is possible only after we switched @@ -1787,7 +1799,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { + if (!IsReno(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -1867,9 +1879,11 @@ static inline int tcp_packet_delayed(struct tcp_sock *tp) /* Undo procedures. */ #if FASTRETRANS_DEBUG > 1 -static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) +static void DBGUNDO(struct sock *sk, const char *msg) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), @@ -1915,13 +1929,15 @@ static inline int tcp_may_undo(struct tcp_sock *tp) } /* People celebrate: "We love our President!" */ -static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_recovery(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ - DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); + DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); @@ -1941,10 +1957,12 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) +static void tcp_try_undo_dsack(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->undo_marker && !tp->undo_retrans) { - DBGUNDO(sk, tp, "D-SACK"); + DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); @@ -1953,9 +1971,9 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) /* Undo during fast recovery after partial ACK. 
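 * (Editor's aside: a partial ACK advances snd_una without reaching
 * high_seq; Hoe's retransmit resends the presumed next hole while the
 * undo bookkeeping checks whether the retransmissions were needed.)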
*/ -static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, - int acked) +static int tcp_try_undo_partial(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ int failed = IsReno(tp) || tp->fackets_out>tp->reordering; @@ -1968,7 +1986,7 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - DBGUNDO(sk, tp, "Hoe"); + DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); @@ -1982,8 +2000,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, } /* Undo during loss recovery after partial ACK. */ -static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_loss(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { struct sk_buff *skb; tcp_for_write_queue(skb, sk) { @@ -1994,7 +2014,7 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) clear_all_retrans_hints(tp); - DBGUNDO(sk, tp, "partial loss"); + DBGUNDO(sk, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; tcp_undo_cwr(sk, 1); @@ -2016,8 +2036,10 @@ static inline void tcp_complete_cwr(struct sock *sk) tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } -static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) +static void tcp_try_to_open(struct sock *sk, int flag) { + struct tcp_sock *tp = tcp_sk(sk); + tp->left_out = tp->sacked_out; if (tp->retrans_out == 0) @@ -2111,7 +2133,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -2127,7 +2149,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, switch (icsk->icsk_ca_state) { case TCP_CA_Loss: icsk->icsk_retransmits = 0; - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; break; @@ -2141,7 +2163,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Disorder: - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. 
*/ @@ -2154,7 +2176,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, case TCP_CA_Recovery: if (IsReno(tp)) tcp_reset_reno_sack(tp); - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; tcp_complete_cwr(sk); break; @@ -2170,14 +2192,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } else { int acked = prior_packets - tp->packets_out; if (IsReno(tp)) - tcp_remove_reno_sacks(sk, tp, acked); - is_dupack = tcp_try_undo_partial(sk, tp, acked); + tcp_remove_reno_sacks(sk, acked); + is_dupack = tcp_try_undo_partial(sk, acked); } break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; - if (!tcp_try_undo_loss(sk, tp)) { + if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; @@ -2194,10 +2216,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } if (icsk->icsk_ca_state == TCP_CA_Disorder) - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); - if (!tcp_time_to_recover(sk, tp)) { - tcp_try_to_open(sk, tp, flag); + if (!tcp_time_to_recover(sk)) { + tcp_try_to_open(sk, flag); return; } @@ -2236,8 +2258,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_set_ca_state(sk, TCP_CA_Recovery); } - if (is_dupack || tcp_head_timedout(sk, tp)) - tcp_update_scoreboard(sk, tp); + if (is_dupack || tcp_head_timedout(sk)) + tcp_update_scoreboard(sk); tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } @@ -2313,8 +2335,10 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. */ -static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { @@ -2471,7 +2495,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) if (acked&FLAG_ACKED) { tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk, tp); + tcp_ack_packets_out(sk); if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) (*rtt_sample)(sk, tcp_usrtt(&tv)); @@ -2556,9 +2580,10 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ -static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb, u32 ack, u32 ack_seq) +static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, + u32 ack_seq) { + struct tcp_sock *tp = tcp_sk(sk); int flag = 0; u32 nwin = ntohs(tcp_hdr(skb)->window); @@ -2576,7 +2601,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, * fast path is recovered for sending TCP. 
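 * (Editor's aside: pred_flags caches the precomputed header-prediction
 * word; zeroing it forces the slow path until tcp_fast_path_check()
 * re-arms it once the window state is consistent again.)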
*/ tp->pred_flags = 0; - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (nwin > tp->max_window) { tp->max_window = nwin; @@ -2762,7 +2787,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) else NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); - flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); @@ -3426,7 +3451,7 @@ queue_and_out: } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) - tcp_event_data_recv(sk, tp, skb); + tcp_event_data_recv(sk, skb); if (th->fin) tcp_fin(skb, sk, th); @@ -3443,7 +3468,7 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (eaten > 0) __kfree_skb(skb); @@ -3734,7 +3759,7 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - tcp_clamp_window(sk, tp); + tcp_clamp_window(sk); else if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -3803,8 +3828,10 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + /* If the user specified a specific send buffer setting, do * not modify it. */ @@ -3836,7 +3863,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_should_expand_sndbuf(sk, tp)) { + if (tcp_should_expand_sndbuf(sk)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, @@ -3860,9 +3887,9 @@ static void tcp_check_space(struct sock *sk) } } -static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk) { - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); tcp_check_space(sk); } @@ -4196,7 +4223,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -4267,12 +4294,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } - tcp_event_data_recv(sk, tp, skb); + tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -4355,7 +4382,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); return 0; @@ -4672,7 +4699,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. 
*/ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } @@ -4864,7 +4891,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); } -- cgit From 164891aadf1721fca4dce473bb0e0998181537c6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Apr 2007 22:26:16 -0700 Subject: [TCP]: Congestion control API update. Do some simple changes to make congestion control API faster/cleaner. * use ktime_t rather than timeval * merge rtt sampling into existing ack callback this means one indirect call versus two per ack. * use flags bits to store options/settings Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 63338939078..051f0f815f1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2402,14 +2402,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static u32 tcp_usrtt(struct timeval *tv) -{ - struct timeval now; - - do_gettimeofday(&now); - return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec); -} - /* Remove acknowledged frames from the retransmission queue. */ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { @@ -2420,9 +2412,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; __s32 seq_rtt = -1; u32 pkts_acked = 0; - void (*rtt_sample)(struct sock *sk, u32 usrtt) - = icsk->icsk_ca_ops->rtt_sample; - struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; + ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { @@ -2471,7 +2461,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = -1; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); @@ -2484,7 +2474,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); @@ -2494,13 +2484,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { + const struct tcp_congestion_ops *ca_ops + = inet_csk(sk)->icsk_ca_ops; + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); - if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) - (*rtt_sample)(sk, tcp_usrtt(&tv)); - if (icsk->icsk_ca_ops->pkts_acked) - icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, last_ackt); } #if FASTRETRANS_DEBUG > 0 -- cgit From 575ee7140dabe9b9c4f66f4f867039b97e548867 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 30 Apr 2007 00:39:55 -0700 Subject: [TCP] FRTO: Delay skb available check until it's mandatory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No new data is needed until the first ACK comes, so no need to check for application limitedness until then. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 051f0f815f1..6b669898b19 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1265,20 +1265,15 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* F-RTO can only be used if these conditions are satisfied: - * - there must be some unsent new data - * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head (SACK enhanced - * variant from Appendix B of RFC4138 is more robust here) +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!sysctl_tcp_frto || !tcp_send_head(sk) || - after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, - tp->snd_una + tp->snd_wnd)) + if (!sysctl_tcp_frto) return 0; if (IsSackFrto()) @@ -2710,6 +2705,14 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) } if (tp->frto_counter == 1) { + /* Sending of the next skb must be allowed or no FRTO */ + if (!tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; -- cgit From d551e4541dd60ae53459f77a971f2d6043431f5f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 30 Apr 2007 00:42:20 -0700 Subject: [TCP] FRTO: RFC4138 allows Nagle override when new data must be sent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a corner case where less than one MSS worth of new data is waiting in the send queue. For F-RTO to work correctly, a new data segment must be sent at a certain point or F-RTO cannot be used at all. RFC4138 allows overriding of Nagle at that point. The implementation uses frto_counter states 2 and 3 to distinguish when the Nagle override is needed. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6b669898b19..7641b2761a1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2637,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * algorithm is not part of the F-RTO detection algorithm * given in RFC4138 but can be selected separately). * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. * * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. @@ -2666,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) inet_csk(sk)->icsk_retransmits = 0; if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
2 : 3), flag); return 1; } @@ -2692,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - if ((tp->frto_counter == 2) && + if ((tp->frto_counter >= 2) && (!(flag&FLAG_FORWARD_PROGRESS) || ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ @@ -2709,14 +2711,15 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (!tcp_send_head(sk) || after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), + flag); return 1; } tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; - } else /* frto_counter == 2 */ { + } else { switch (sysctl_tcp_frto_response) { case 2: tcp_undo_spur_to_response(sk, flag); -- cgit
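
Editor's addendum: the F-RTO patches above culminate in tcp_process_frto() dispatching one of three spurious-RTO responses keyed by the tcp_frto_response sysctl, with the new FLAG_ECE guard preventing a full cwnd undo from overriding an explicit ECN congestion signal. The following stand-alone C sketch models only that dispatch; it is not kernel code, and the struct layout, the FLAG_ECE bit value, the snd_cwnd_prior field and the simplified response bodies are all illustrative assumptions.

/* Sketch only: a user-space model of the response dispatch in
 * tcp_process_frto(). Field names mirror the patches above; the numeric
 * FLAG_ECE value and the snd_cwnd_prior field are assumptions. */
#include <stdio.h>

#define FLAG_ECE 0x40	/* assumed bit value, for illustration */

struct frto_sock {
	unsigned int snd_cwnd;
	unsigned int snd_ssthresh;
	unsigned int snd_cwnd_prior;	/* stand-in for the undo state */
};

static unsigned int min_u32(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* Default response: modelled here as a simple clamp; the real
 * tcp_ratehalving_spur_to_response() enters CWR and decays cwnd
 * gradually, roughly one segment per incoming ACK. */
static void ratehalving_response(struct frto_sock *tp)
{
	tp->snd_cwnd = min_u32(tp->snd_cwnd, tp->snd_ssthresh);
}

/* sysctl value 1: the conservative response from the patches above -
 * reduce cwnd and continue in congestion avoidance. */
static void conservative_response(struct frto_sock *tp)
{
	tp->snd_cwnd = min_u32(tp->snd_cwnd, tp->snd_ssthresh);
}

/* sysctl value 2: undo the RTO reaction entirely, except when the ACK
 * carried an ECN echo (FLAG_ECE), in which case fall back to rate
 * halving - the guard added to tcp_undo_spur_to_response(). */
static void undo_response(struct frto_sock *tp, int flag)
{
	if (flag & FLAG_ECE)
		ratehalving_response(tp);
	else
		tp->snd_cwnd = tp->snd_cwnd_prior;
}

static void spurious_rto_response(struct frto_sock *tp, int sysctl_response,
				  int flag)
{
	switch (sysctl_response) {
	case 2:
		undo_response(tp, flag);
		break;
	case 1:
		conservative_response(tp);
		break;
	default:
		ratehalving_response(tp);
		break;
	}
}

int main(void)
{
	struct frto_sock tp = { .snd_cwnd = 10, .snd_ssthresh = 6,
				.snd_cwnd_prior = 12 };

	spurious_rto_response(&tp, 2, 0);
	printf("undo without ECE: cwnd=%u\n", tp.snd_cwnd);	/* 12 */

	tp.snd_cwnd = 10;
	spurious_rto_response(&tp, 2, FLAG_ECE);
	printf("undo with ECE:    cwnd=%u\n", tp.snd_cwnd);	/* 6 */

	return 0;
}

The design intuition behind the guard: an ECN echo is affirmative congestion feedback from the network, so even a provably spurious RTO is no license to restore the old window; rate halving is the safe fallback.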