From 4e55c736133f4545a4f1b8b3cbcdbc0fa6787b25 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@fedoraproject.org>
Date: Thu, 25 Aug 2016 07:31:24 -0700
Subject: Fix for TPROXY panic (rhbz 1370061)

Fix for known oom regression
---
 0001-OOM-detection-regressions-since-4.7.patch | 121 +++++++++++++++++++++++++
 kernel-panic-TPROXY-vanilla-4.7.1.patch        |  85 +++++++++++++++++
 kernel.spec                                    |  10 ++
 3 files changed, 216 insertions(+)
 create mode 100644 0001-OOM-detection-regressions-since-4.7.patch
 create mode 100644 kernel-panic-TPROXY-vanilla-4.7.1.patch

diff --git a/0001-OOM-detection-regressions-since-4.7.patch b/0001-OOM-detection-regressions-since-4.7.patch
new file mode 100644
index 000000000..4616c7f87
--- /dev/null
+++ b/0001-OOM-detection-regressions-since-4.7.patch
@@ -0,0 +1,121 @@
+From a7f80308bac4013728e33e2bcb9b60eee78f60fb Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@kernel.org>
+Date: Mon, 22 Aug 2016 11:32:49 +0200
+Subject: [PATCH] OOM detection regressions since 4.7
+
+Hi,
+there have been multiple reports [1][2][3][4][5] about pre-mature OOM
+killer invocations since 4.7 which contains oom detection rework. All of
+them were for order-2 (kernel stack) alloaction requests failing because
+of a high fragmentation and compaction failing to make any forward
+progress. While investigating this we have found out that the compaction
+just gives up too early. Vlastimil has been working on compaction
+improvement for quite some time and his series [6] is already sitting
+in mmotm tree. This already helps a lot because it drops some heuristics
+which are more aimed at lower latencies for high orders rather than
+reliability. Joonsoo has then identified further problem with too many
+blocks being marked as unmovable [7] and Vlastimil has prepared a patch
+on top of his series [8] which is also in the mmotm tree now.
+
+That being said, the regression is real and should be fixed for 4.7
+stable users. [6][8] was reported to help and ooms are no longer
+reproducible. I know we are quite late (rc3) in 4.8 but I would vote
+for mergeing those patches and have them in 4.8. For 4.7 I would go
+with a partial revert of the detection rework for high order requests
+(see patch below). This patch is really trivial. If those compaction
+improvements are just too large for 4.8 then we can use the same patch
+as for 4.7 stable for now and revert it in 4.9 after compaction changes
+are merged.
+
+Thoughts?
+
+[1] http://lkml.kernel.org/r/20160731051121.GB307@x4
+[2] http://lkml.kernel.org/r/201608120901.41463.a.miskiewicz@gmail.com
+[3] http://lkml.kernel.org/r/20160801192620.GD31957@dhcp22.suse.cz
+[4] https://lists.opensuse.org/opensuse-kernel/2016-08/msg00021.html
+[5] https://bugzilla.opensuse.org/show_bug.cgi?id=994066
+[6] http://lkml.kernel.org/r/20160810091226.6709-1-vbabka@suse.cz
+[7] http://lkml.kernel.org/r/20160816031222.GC16913@js1304-P5Q-DELUXE
+[8] http://lkml.kernel.org/r/f7a9ea9d-bb88-bfd6-e340-3a933559305a@suse.cz
+---
+ mm/page_alloc.c | 50 ++------------------------------------------------
+ 1 file changed, 2 insertions(+), 48 deletions(-)
+
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 8b3e134..6e35419 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3254,53 +3254,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ 	return NULL;
+ }
+ 
+-static inline bool
+-should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+-		     enum compact_result compact_result, enum migrate_mode *migrate_mode,
+-		     int compaction_retries)
+-{
+-	int max_retries = MAX_COMPACT_RETRIES;
+-
+-	if (!order)
+-		return false;
+-
+-	/*
+-	 * compaction considers all the zone as desperately out of memory
+-	 * so it doesn't really make much sense to retry except when the
+-	 * failure could be caused by weak migration mode.
+-	 */
+-	if (compaction_failed(compact_result)) {
+-		if (*migrate_mode == MIGRATE_ASYNC) {
+-			*migrate_mode = MIGRATE_SYNC_LIGHT;
+-			return true;
+-		}
+-		return false;
+-	}
+-
+-	/*
+-	 * make sure the compaction wasn't deferred or didn't bail out early
+-	 * due to locks contention before we declare that we should give up.
+-	 * But do not retry if the given zonelist is not suitable for
+-	 * compaction.
+-	 */
+-	if (compaction_withdrawn(compact_result))
+-		return compaction_zonelist_suitable(ac, order, alloc_flags);
+-
+-	/*
+-	 * !costly requests are much more important than __GFP_REPEAT
+-	 * costly ones because they are de facto nofail and invoke OOM
+-	 * killer to move on while costly can fail and users are ready
+-	 * to cope with that. 1/4 retries is rather arbitrary but we
+-	 * would need much more detailed feedback from compaction to
+-	 * make a better decision.
+-	 */
+-	if (order > PAGE_ALLOC_COSTLY_ORDER)
+-		max_retries /= 4;
+-	if (compaction_retries <= max_retries)
+-		return true;
+-
+-	return false;
+-}
+ #else
+ static inline struct page *
+ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+@@ -3311,6 +3264,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ 	return NULL;
+ }
+ 
++#endif /* CONFIG_COMPACTION */
++
+ static inline bool
+ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+ 		     enum compact_result compact_result,
+@@ -3337,7 +3292,6 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
+ 	}
+ 	return false;
+ }
+-#endif /* CONFIG_COMPACTION */
+ 
+ /* Perform direct synchronous page reclaim */
+ static int
+-- 
+2.7.4
+
diff --git a/kernel-panic-TPROXY-vanilla-4.7.1.patch b/kernel-panic-TPROXY-vanilla-4.7.1.patch
new file mode 100644
index 000000000..9d045cabe
--- /dev/null
+++ b/kernel-panic-TPROXY-vanilla-4.7.1.patch
@@ -0,0 +1,85 @@
+From patchwork Wed Aug 17 16:04:31 2016
+Content-Type: text/plain; charset="utf-8"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+Subject: kernel panic TPROXY , vanilla 4.7.1
+From: Eric Dumazet <eric.dumazet@gmail.com>
+X-Patchwork-Id: 660174
+X-Patchwork-Delegate: davem@davemloft.net
+Message-Id: <1471449871.29842.3.camel@edumazet-glaptop3.roam.corp.google.com>
+To: Denys Fedoryshchenko <nuclearcat@nuclearcat.com>
+Cc: Linux Kernel Network Developers <netdev@vger.kernel.org>,
+ netfilter-devel@vger.kernel.org
+Date: Wed, 17 Aug 2016 09:04:31 -0700
+
+On Wed, 2016-08-17 at 08:42 -0700, Eric Dumazet wrote:
+> On Wed, 2016-08-17 at 17:31 +0300, Denys Fedoryshchenko wrote:
+> > Hi!
+> > 
+> > Tried to run squid on latest kernel, and hit a panic
+> > Sometimes it just shows warning in dmesg (but doesnt work properly)
+> > [   75.701666] IPv4: Attempt to release TCP socket in state 10 
+> > ffff88102d430780
+> > [   83.866974] squid (2700) used greatest stack depth: 12912 bytes left
+> > [   87.506644] IPv4: Attempt to release TCP socket in state 10 
+> > ffff880078a48780
+> > [  114.704295] IPv4: Attempt to release TCP socket in state 10 
+> > ffff881029f8ad00
+> > 
+> > I cannot catch yet oops/panic message, netconsole not working.
+> > 
+> > After triggering warning message 3 times, i am unable to run squid 
+> > anymore (without reboot), and in netstat it doesnt show port running.
+> > 
+> > firewall is:
+> > *mangle
+> > -A PREROUTING -p tcp -m socket -j DIVERT
+> > -A PREROUTING -p tcp -m tcp --dport 80 -i eno1 -j TPROXY --on-port 3129 
+> > --on-ip 0.0.0.0 --tproxy-mark 0x1/0x1
+> > -A DIVERT -j MARK --set-xmark 0x1/0xffffffff
+> > -A DIVERT -j ACCEPT
+> > 
+> > routing
+> > ip rule add fwmark 1 lookup 100
+> > ip route add local default dev eno1 table 100
+> > 
+> > 
+> > squid config is default with tproxy option
+> > http_port 3129 tproxy
+> > 
+> 
+> Hmppff... sorry for this, I will send a fix.
+> 
+> Thanks for the report !
+> 
+
+
+Could you try the following ?
+
+Thanks !
+
+ net/netfilter/xt_TPROXY.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
+index 7f4414d26a66..663c4c3c9072 100644
+--- a/net/netfilter/xt_TPROXY.c
++++ b/net/netfilter/xt_TPROXY.c
+@@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+ 						    daddr, dport,
+ 						    in->ifindex);
+ 
++			if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
++				sk = NULL;
+ 			/* NOTE: we return listeners even if bound to
+ 			 * 0.0.0.0, those are filtered out in
+ 			 * xt_socket, since xt_TPROXY needs 0 bound
+@@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+ 						   daddr, ntohs(dport),
+ 						   in->ifindex);
+ 
++			if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
++				sk = NULL;
+ 			/* NOTE: we return listeners even if bound to
+ 			 * 0.0.0.0, those are filtered out in
+ 			 * xt_socket, since xt_TPROXY needs 0 bound
diff --git a/kernel.spec b/kernel.spec
index 352d747fb..90fbc8543 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -635,6 +635,12 @@ Patch855: aacraid-Check-size-values-after-double-fetch-from-us.patch
 #rhbz 1365940
 Patch856: 0001-udp-fix-poll-issue-with-zero-sized-packets.patch
 
+#rhbz 1370061
+Patch857: kernel-panic-TPROXY-vanilla-4.7.1.patch
+
+# lkml.kernel.org/r/<20160822093249.GA14916@dhcp22.suse.cz>
+Patch858: 0001-OOM-detection-regressions-since-4.7.patch
+
 # END OF PATCH DEFINITIONS
 
 %endif
@@ -2162,6 +2168,10 @@ fi
 #
 # 
 %changelog
+* Thu Aug 25 2016 Laura Abbott <labbott@fedoraproject.org>
+- Fix for TPROXY panic (rhbz 1370061)
+- Fix for known OOM regression
+
 * Tue Aug 23 2016 Laura Abbott <labbot@fedoraproject.org>
 - Fix for inabiltiy to send zero sized UDP packets (rhbz 1365940)
 
-- 
cgit