Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c               |  10
-rw-r--r--  net/ipv4/ah4.c                   |   6
-rw-r--r--  net/ipv4/arp.c                   |  22
-rw-r--r--  net/ipv4/esp4.c                  |   4
-rw-r--r--  net/ipv4/icmp.c                  |  12
-rw-r--r--  net/ipv4/inet_connection_sock.c  |   8
-rw-r--r--  net/ipv4/inet_diag.c             |  31
-rw-r--r--  net/ipv4/inet_hashtables.c       | 277
-rw-r--r--  net/ipv4/inet_lro.c              |   4
-rw-r--r--  net/ipv4/inet_timewait_sock.c    |  48
-rw-r--r--  net/ipv4/ip_gre.c                |  44
-rw-r--r--  net/ipv4/ip_input.c              |  10
-rw-r--r--  net/ipv4/ip_output.c             |  18
-rw-r--r--  net/ipv4/ip_sockglue.c           |  40
-rw-r--r--  net/ipv4/ipcomp.c                |   6
-rw-r--r--  net/ipv4/ipip.c                  |  35
-rw-r--r--  net/ipv4/ipmr.c                  |  40
-rw-r--r--  net/ipv4/netfilter.c             |   4
-rw-r--r--  net/ipv4/proc.c                  |  66
-rw-r--r--  net/ipv4/raw.c                   |   2
-rw-r--r--  net/ipv4/route.c                 |   4
-rw-r--r--  net/ipv4/tcp.c                   |  27
-rw-r--r--  net/ipv4/tcp_diag.c              |   2
-rw-r--r--  net/ipv4/tcp_htcp.c              |  14
-rw-r--r--  net/ipv4/tcp_input.c             | 388
-rw-r--r--  net/ipv4/tcp_ipv4.c              |  73
-rw-r--r--  net/ipv4/tcp_output.c            | 129
-rw-r--r--  net/ipv4/tcp_timer.c             |   2
-rw-r--r--  net/ipv4/udp.c                   |  76
-rw-r--r--  net/ipv4/xfrm4_policy.c          |  13
-rw-r--r--  net/ipv4/xfrm4_state.c           |   3
31 files changed, 919 insertions(+), 499 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e3286814c8d..fe03048c130 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -245,7 +245,7 @@ static inline int inet_netns_ok(struct net *net, int protocol)
 	int hash;
 	struct net_protocol *ipprot;
-	if (net == &init_net)
+	if (net_eq(net, &init_net))
 		return 1;
 	hash = protocol & (MAX_INET_PROTOS - 1);
@@ -272,10 +272,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol)
 	int try_loading_module = 0;
 	int err;
-	if (sock->type != SOCK_RAW &&
-	    sock->type != SOCK_DGRAM &&
-	    !inet_ehash_secret)
-		build_ehash_secret();
+	if (unlikely(!inet_ehash_secret))
+		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+			build_ehash_secret();
 	sock->state = SS_UNCONNECTED;
@@ -1114,6 +1113,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 			},
 		},
 		.proto = sk->sk_protocol,
+		.flags = inet_sk_flowi_flags(sk),
 		.uli_u = {
 			.ports = {
 				.sport = inet->sport,
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 3f205181712..e878e494296 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,6 +201,7 @@ out:
 static void ah4_err(struct sk_buff *skb, u32 info)
 {
+	struct net *net = dev_net(skb->dev);
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
@@ -209,7 +210,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
-	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+	x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
 		return;
 	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -293,9 +294,7 @@ static void ah_destroy(struct xfrm_state *x)
 		return;
 	kfree(ahp->work_icv);
-	ahp->work_icv = NULL;
 	crypto_free_hash(ahp->tfm);
-	ahp->tfm = NULL;
 	kfree(ahp);
 }
@@ -316,6 +315,7 @@ static struct net_protocol ah4_protocol = {
 	.handler	= xfrm4_rcv,
 	.err_handler	= ah4_err,
 	.no_policy	= 1,
+	.netns_ok	= 1,
 };
 static int __init ah4_init(void)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 957c87dc8e1..29a74c01d8d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -818,18 +818,18 @@ static int arp_process(struct sk_buff *skb)
 	addr_type = rt->rt_type;
 	if (addr_type == RTN_LOCAL) {
-		n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
-		if (n) {
-			int dont_send = 0;
-
-			if (!dont_send)
-				dont_send |= arp_ignore(in_dev, sip, tip);
-			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
-				dont_send |= arp_filter(sip, tip, dev);
-			if (!dont_send)
-				arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+		int dont_send = 0;
-			neigh_release(n);
+		if (!dont_send)
+			dont_send |= arp_ignore(in_dev,sip,tip);
+		if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+			dont_send |= arp_filter(sip,tip,dev);
+		if (!dont_send) {
+			n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+			if (n) {
+				arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+				neigh_release(n);
+			}
 		}
 		goto out;
 	} else if (IN_DEV_FORWARD(in_dev)) {
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 95a9c65003f..18bb383ea39 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -413,6 +413,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 static void esp4_err(struct sk_buff *skb, u32 info)
 {
+	struct net *net = dev_net(skb->dev);
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
@@ -421,7 +422,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
-	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+	x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
 		return;
 	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -618,6 +619,7 @@ static struct net_protocol esp4_protocol = {
 	.handler	= xfrm4_rcv,
 	.err_handler	= esp4_err,
 	.no_policy	= 1,
+	.netns_ok	= 1,
 };
 static int __init esp4_init(void)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 21e497efbd7..705b33b184a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -321,12 +321,12 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
 }
 static void icmp_push_reply(struct icmp_bxm *icmp_param,
-			    struct ipcm_cookie *ipc, struct rtable *rt)
+			    struct ipcm_cookie *ipc, struct rtable **rt)
 {
 	struct sock *sk;
 	struct sk_buff *skb;
-	sk = icmp_sk(dev_net(rt->u.dst.dev));
+	sk = icmp_sk(dev_net((*rt)->u.dst.dev));
 	if (ip_append_data(sk, icmp_glue_bits, icmp_param,
 			   icmp_param->data_len+icmp_param->head_len,
 			   icmp_param->head_len,
@@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	}
 	if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
 			       icmp_param->data.icmph.code))
-		icmp_push_reply(icmp_param, &ipc, rt);
+		icmp_push_reply(icmp_param, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
@@ -562,7 +562,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		/* No need to clone since we're just using its address. */
 		rt2 = rt;
-		err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0);
+		err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
 		switch (err) {
 		case 0:
 			if (rt != rt2)
@@ -601,7 +601,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		if (err)
 			goto relookup_failed;
-		err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL,
+		err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
 				  XFRM_LOOKUP_ICMP);
 		switch (err) {
 		case 0:
@@ -635,7 +635,7 @@ route_done:
 		icmp_param.data_len = room;
 	icmp_param.head_len = sizeof(struct icmphdr);
-	icmp_push_reply(&icmp_param, &ipc, rt);
+	icmp_push_reply(&icmp_param, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
 out_unlock:
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 36f4cbc7da3..1ccdbba528b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -109,7 +109,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
 			inet_bind_bucket_for_each(tb, node, &head->chain)
-				if (tb->ib_net == net && tb->port == rover)
+				if (ib_net(tb) == net && tb->port == rover)
 					goto next;
 			break;
 		next:
@@ -137,7 +137,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 				hashinfo->bhash_size)];
 		spin_lock(&head->lock);
 		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (tb->ib_net == net && tb->port == snum)
+			if (ib_net(tb) == net && tb->port == snum)
 				goto tb_found;
 	}
 	tb = NULL;
@@ -561,7 +561,7 @@ void inet_csk_destroy_sock(struct sock *sk)
 	sk_refcnt_debug_release(sk);
-	atomic_dec(sk->sk_prot->orphan_count);
+	percpu_counter_dec(sk->sk_prot->orphan_count);
 	sock_put(sk);
 }
@@ -641,7 +641,7 @@ void inet_csk_listen_stop(struct sock *sk)
 		sock_orphan(child);
-		atomic_inc(sk->sk_prot->orphan_count);
+		percpu_counter_inc(sk->sk_prot->orphan_count);
 		inet_csk_destroy_sock(child);
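The inet_connection_sock.c hunks above are part of this commit's switch of the per-protocol orphan count from a single atomic_t to a struct percpu_counter, which keeps increments on per-CPU cachelines and only pays for a cross-CPU fold when the value is actually read. A minimal sketch of that pattern follows (the demo_* names are hypothetical; percpu_counter_init, percpu_counter_inc/dec and percpu_counter_sum_positive are the real APIs of this kernel generation):

	#include <linux/percpu_counter.h>

	static struct percpu_counter demo_orphan_count;	/* hypothetical counter */

	static int __init demo_init(void)
	{
		/* 2.6.28-era signature: no GFP argument yet */
		return percpu_counter_init(&demo_orphan_count, 0);
	}

	static void demo_orphan(void)
	{
		/* fast path: per-CPU increment, no shared-cacheline bouncing */
		percpu_counter_inc(&demo_orphan_count);
	}

	static void demo_unorphan(void)
	{
		percpu_counter_dec(&demo_orphan_count);
	}

	static s64 demo_read(void)
	{
		/* slow path: folds all per-CPU deltas; used for reporting */
		return percpu_counter_sum_positive(&demo_orphan_count);
	}

The cheap variant used on the tcp_close() path further down is percpu_counter_read_positive(), which skips the fold and accepts a slightly stale value.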
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 564230dabcb..588a7796e3e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
 			goto skip_listen_ht;
-		inet_listen_lock(hashinfo);
 		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
 			struct sock *sk;
-			struct hlist_node *node;
+			struct hlist_nulls_node *node;
+			struct inet_listen_hashbucket *ilb;
 			num = 0;
-			sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+			ilb = &hashinfo->listening_hash[i];
+			spin_lock_bh(&ilb->lock);
+			sk_nulls_for_each(sk, node, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 				if (num < s_num) {
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 					goto syn_recv;
 				if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-					inet_listen_unlock(hashinfo);
+					spin_unlock_bh(&ilb->lock);
 					goto done;
 				}
@@ -751,7 +753,7 @@ syn_recv:
 					goto next_listen;
 				if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
-					inet_listen_unlock(hashinfo);
+					spin_unlock_bh(&ilb->lock);
 					goto done;
 				}
@@ -760,12 +762,12 @@ next_listen:
 				cb->args[4] = 0;
 				++num;
 			}
+			spin_unlock_bh(&ilb->lock);
 			s_num = 0;
 			cb->args[3] = 0;
 			cb->args[4] = 0;
 		}
-		inet_listen_unlock(hashinfo);
 skip_listen_ht:
 		cb->args[0] = 1;
 		s_i = num = s_num = 0;
@@ -776,20 +778,21 @@ skip_listen_ht:
 	for (i = s_i; i < hashinfo->ehash_size; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
-		rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct sock *sk;
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
 		num = 0;
-		if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+		if (hlist_nulls_empty(&head->chain) &&
+		    hlist_nulls_empty(&head->twchain))
 			continue;
 		if (i > s_i)
 			s_num = 0;
-		read_lock_bh(lock);
-		sk_for_each(sk, node, &head->chain) {
+		spin_lock_bh(lock);
+		sk_nulls_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 			if (num < s_num)
@@ -803,7 +806,7 @@ skip_listen_ht:
 			    r->id.idiag_dport)
 				goto next_normal;
 			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-				read_unlock_bh(lock);
+				spin_unlock_bh(lock);
 				goto done;
 			}
 next_normal:
@@ -825,14 +828,14 @@ next_normal:
 				    r->id.idiag_dport)
 					goto next_dying;
 				if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
-					read_unlock_bh(lock);
+					spin_unlock_bh(lock);
 					goto done;
 				}
 next_dying:
 				++num;
 			}
 		}
-		read_unlock_bh(lock);
+		spin_unlock_bh(lock);
 	}
 done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 44981906fb9..6a1045da48d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -35,7 +35,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 	if (tb != NULL) {
-		tb->ib_net = hold_net(net);
+		write_pnet(&tb->ib_net, hold_net(net));
 		tb->port = snum;
 		tb->fastreuse = 0;
 		INIT_HLIST_HEAD(&tb->owners);
@@ -51,7 +51,7 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
 {
 	if (hlist_empty(&tb->owners)) {
 		__hlist_del(&tb->node);
-		release_net(tb->ib_net);
+		release_net(ib_net(tb));
 		kmem_cache_free(cachep, tb);
 	}
 }
@@ -110,33 +110,29 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
-/*
- * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-void inet_listen_wlock(struct inet_hashinfo *hashinfo)
-	__acquires(hashinfo->lhash_lock)
+static inline int compute_score(struct sock *sk, struct net *net,
+				const unsigned short hnum, const __be32 daddr,
+				const int dif)
 {
-	write_lock(&hashinfo->lhash_lock);
-
-	if (atomic_read(&hashinfo->lhash_users)) {
-		DEFINE_WAIT(wait);
+	int score = -1;
+	struct inet_sock *inet = inet_sk(sk);
-		for (;;) {
-			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
-						  &wait, TASK_UNINTERRUPTIBLE);
-			if (!atomic_read(&hashinfo->lhash_users))
-				break;
-			write_unlock_bh(&hashinfo->lhash_lock);
-			schedule();
-			write_lock_bh(&hashinfo->lhash_lock);
+	if (net_eq(sock_net(sk), net) && inet->num == hnum &&
+	    !ipv6_only_sock(sk)) {
+		__be32 rcv_saddr = inet->rcv_saddr;
+		score = sk->sk_family == PF_INET ? 1 : 0;
+		if (rcv_saddr) {
+			if (rcv_saddr != daddr)
+				return -1;
+			score += 2;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score += 2;
 		}
-
-		finish_wait(&hashinfo->lhash_wait, &wait);
 	}
+	return score;
 }
 /*
@@ -145,72 +141,48 @@ void inet_listen_wlock(struct inet_hashinfo *hashinfo)
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */
-static struct sock *inet_lookup_listener_slow(struct net *net,
-					      const struct hlist_head *head,
-					      const __be32 daddr,
-					      const unsigned short hnum,
-					      const int dif)
-{
-	struct sock *result = NULL, *sk;
-	const struct hlist_node *node;
-	int hiscore = -1;
-
-	sk_for_each(sk, node, head) {
-		const struct inet_sock *inet = inet_sk(sk);
-
-		if (net_eq(sock_net(sk), net) && inet->num == hnum &&
-		    !ipv6_only_sock(sk)) {
-			const __be32 rcv_saddr = inet->rcv_saddr;
-			int score = sk->sk_family == PF_INET ? 1 : 0;
-
-			if (rcv_saddr) {
-				if (rcv_saddr != daddr)
-					continue;
-				score += 2;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score += 2;
-			}
-			if (score == 5)
-				return sk;
-			if (score > hiscore) {
-				hiscore = score;
-				result = sk;
-			}
-		}
-	}
-	return result;
-}
-/* Optimize the common listener case. */
+
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
 				    const __be32 daddr, const unsigned short hnum,
 				    const int dif)
 {
-	struct sock *sk = NULL;
-	const struct hlist_head *head;
-
-	read_lock(&hashinfo->lhash_lock);
-	head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-	if (!hlist_empty(head)) {
-		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
-		if (inet->num == hnum && !sk->sk_node.next &&
-		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-		    !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
-			goto sherry_cache;
-		sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	unsigned int hash = inet_lhashfn(net, hnum);
+	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+	int score, hiscore;
+
+	rcu_read_lock();
+begin:
+	result = NULL;
+	hiscore = -1;
+	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+		score = compute_score(sk, net, hnum, daddr, dif);
+		if (score > hiscore) {
+			result = sk;
+			hiscore = score;
+		}
 	}
-	if (sk) {
-sherry_cache:
-		sock_hold(sk);
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+		goto begin;
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, daddr,
+				  dif) < hiscore)) {
+			sock_put(result);
+			goto begin;
+		}
 	}
-	read_unlock(&hashinfo->lhash_lock);
-	return sk;
+	rcu_read_unlock();
+	return result;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -223,35 +195,65 @@ struct sock * __inet_lookup_established(struct net *net,
 	INET_ADDR_COOKIE(acookie, saddr, daddr)
 	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
 	struct sock *sk;
-	const struct hlist_node *node;
+	const struct hlist_nulls_node *node;
 	/* Optimize here for direct hit, only listening connections can
 	 * have wildcards anyways.
 	 */
 	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+	unsigned int slot = hash & (hashinfo->ehash_size - 1);
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
-	prefetch(head->chain.first);
-	read_lock(lock);
-	sk_for_each(sk, node, &head->chain) {
+	rcu_read_lock();
+begin:
+	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 		if (INET_MATCH(sk, net, hash, acookie,
-			       saddr, daddr, ports, dif))
-			goto hit; /* You sunk my battleship! */
+			       saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+				goto begintw;
+			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
+				     saddr, daddr, ports, dif))) {
+				sock_put(sk);
+				goto begin;
+			}
+			goto out;
+		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
+begintw:
 	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_for_each(sk, node, &head->twchain) {
+	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
 		if (INET_TW_MATCH(sk, net, hash, acookie,
-				  saddr, daddr, ports, dif))
-			goto hit;
+				  saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+				sk = NULL;
+				goto out;
+			}
+			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
+				     saddr, daddr, ports, dif))) {
+				sock_put(sk);
+				goto begintw;
+			}
+			goto out;
+		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
	 */
+	if (get_nulls_value(node) != slot)
+		goto begintw;
 	sk = NULL;
 out:
-	read_unlock(lock);
+	rcu_read_unlock();
 	return sk;
-hit:
-	sock_hold(sk);
-	goto out;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
@@ -270,16 +272,15 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	struct net *net = sock_net(sk);
 	unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
-	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
+	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
-	const struct hlist_node *node;
+	const struct hlist_nulls_node *node;
 	struct inet_timewait_sock *tw;
-	prefetch(head->chain.first);
-	write_lock(lock);
+	spin_lock(lock);
 	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &head->twchain) {
+	sk_nulls_for_each(sk2, node, &head->twchain) {
 		tw = inet_twsk(sk2);
 		if (INET_TW_MATCH(sk2, net, hash, acookie,
@@ -293,7 +294,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	tw = NULL;
 	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
+	sk_nulls_for_each(sk2, node, &head->chain) {
 		if (INET_MATCH(sk2, net, hash, acookie,
 			       saddr, daddr, ports, dif))
 			goto not_unique;
@@ -306,9 +307,9 @@ unique:
 	inet->sport = htons(lport);
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
+	__sk_nulls_add_node_rcu(sk, &head->chain);
+	spin_unlock(lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	write_unlock(lock);
 	if (twp) {
 		*twp = tw;
@@ -324,7 +325,7 @@ unique:
 	return 0;
 not_unique:
-	write_unlock(lock);
+	spin_unlock(lock);
 	return -EADDRNOTAVAIL;
 }
@@ -338,8 +339,8 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 void __inet_hash_nolisten(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct hlist_head *list;
-	rwlock_t *lock;
+	struct hlist_nulls_head *list;
+	spinlock_t *lock;
 	struct inet_ehash_bucket *head;
 	WARN_ON(!sk_unhashed(sk));
@@ -349,18 +350,17 @@ void __inet_hash_nolisten(struct sock *sk)
 	list = &head->chain;
 	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-	write_lock(lock);
-	__sk_add_node(sk, list);
+	spin_lock(lock);
+	__sk_nulls_add_node_rcu(sk, list);
+	spin_unlock(lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	write_unlock(lock);
 }
 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 static void __inet_hash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct hlist_head *list;
-	rwlock_t *lock;
+	struct inet_listen_hashbucket *ilb;
 	if (sk->sk_state != TCP_LISTEN) {
 		__inet_hash_nolisten(sk);
@@ -368,14 +368,12 @@ static void __inet_hash(struct sock *sk)
 	}
 	WARN_ON(!sk_unhashed(sk));
-	list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-	lock = &hashinfo->lhash_lock;
+	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-	inet_listen_wlock(hashinfo);
-	__sk_add_node(sk, list);
+	spin_lock(&ilb->lock);
+	__sk_nulls_add_node_rcu(sk, &ilb->head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	write_unlock(lock);
-	wake_up(&hashinfo->lhash_wait);
+	spin_unlock(&ilb->lock);
 }
 void inet_hash(struct sock *sk)
@@ -390,27 +388,23 @@ EXPORT_SYMBOL_GPL(inet_hash);
 void inet_unhash(struct sock *sk)
 {
-	rwlock_t *lock;
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	spinlock_t *lock;
+	int done;
 	if (sk_unhashed(sk))
-		goto out;
+		return;
-	if (sk->sk_state == TCP_LISTEN) {
-		local_bh_disable();
-		inet_listen_wlock(hashinfo);
-		lock = &hashinfo->lhash_lock;
-	} else {
+	if (sk->sk_state == TCP_LISTEN)
+		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+	else
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-		write_lock_bh(lock);
-	}
-	if (__sk_del_node_init(sk))
+	spin_lock_bh(lock);
+	done = __sk_nulls_del_node_init_rcu(sk);
+	if (done)
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-	write_unlock_bh(lock);
-out:
-	if (sk->sk_state == TCP_LISTEN)
-		wake_up(&hashinfo->lhash_wait);
+	spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
@@ -449,7 +443,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		 * unique enough.
 		 */
 		inet_bind_bucket_for_each(tb, node, &head->chain) {
-			if (tb->ib_net == net && tb->port == port) {
+			if (ib_net(tb) == net && tb->port == port) {
 				WARN_ON(hlist_empty(&tb->owners));
 				if (tb->fastreuse >= 0)
 					goto next_port;
@@ -524,3 +518,16 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
 }
 EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+	int i;
+
+	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
+		spin_lock_init(&h->listening_hash[i].lock);
+		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+				      i + LISTENING_NULLS_BASE);
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
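The inet_hashtables.c conversion above replaces reader-writer locks with RCU for the established and listening lookups. Chains become hlist_nulls lists: each chain's terminator encodes a per-bucket "nulls" value, so a lockless reader that got migrated to another chain mid-walk notices the mismatch and restarts, and a candidate socket only escapes the RCU section after atomic_inc_not_zero() proves it is still live. A condensed sketch of that retry loop, shaped after __inet_lookup_listener above (match keys elided; sk_matches() is a hypothetical stand-in for compute_score/INET_MATCH):

	struct sock *demo_lookup_rcu(struct inet_listen_hashbucket *ilb,
				     unsigned int hash)
	{
		struct sock *sk, *result;
		struct hlist_nulls_node *node;

		rcu_read_lock();
	begin:
		result = NULL;
		sk_nulls_for_each_rcu(sk, node, &ilb->head) {
			if (sk_matches(sk))	/* hypothetical predicate */
				result = sk;
		}
		/* Walk ended on a foreign nulls marker: the item we were on
		 * was moved to another chain, so the walk must restart.
		 */
		if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
			goto begin;
		if (result && !atomic_inc_not_zero(&result->sk_refcnt))
			result = NULL;	/* raced with the socket being freed */
		rcu_read_unlock();
		return result;
	}

The real lookups above additionally re-check the match keys after taking the reference, since the slot may have been recycled between the match and the refcount bump.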
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cfd034a2b96..6a667dae315 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -120,7 +120,7 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
 	iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
 	tcph->check = 0;
-	tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0);
+	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
 	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
 	tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
 					lro_desc->ip_tot_len -
@@ -135,7 +135,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
 	__wsum tcp_ps_hdr_csum;
 	tcp_csum = ~csum_unfold(tcph->check);
-	tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum);
+	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
 	tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
 					     len + TCP_HDR_LEN(tcph),
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1c5fd38f882..8554d0ea171 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	struct inet_bind_hashbucket *bhead;
 	struct inet_bind_bucket *tb;
 	/* Unlink from established hashes. */
-	rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
-	write_lock(lock);
-	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(lock);
+	spin_lock(lock);
+	if (hlist_nulls_unhashed(&tw->tw_node)) {
+		spin_unlock(lock);
 		return;
 	}
-	__hlist_del(&tw->tw_node);
-	sk_node_init(&tw->tw_node);
-	write_unlock(lock);
+	hlist_nulls_del_rcu(&tw->tw_node);
+	sk_nulls_node_init(&tw->tw_node);
+	spin_unlock(lock);
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
@@ -76,7 +76,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	const struct inet_sock *inet = inet_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
-	rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 	struct inet_bind_hashbucket *bhead;
 	/* Step 1: Put TW into bind hash. Original socket stays there too.
	   Note, that any socket with inet->num != 0 MUST be bound in
@@ -90,17 +90,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
 	spin_unlock(&bhead->lock);
-	write_lock(lock);
+	spin_lock(lock);
-	/* Step 2: Remove SK from established hash. */
-	if (__sk_del_node_init(sk))
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-
-	/* Step 3: Hash TW into TIMEWAIT chain. */
-	inet_twsk_add_node(tw, &ehead->twchain);
+	/*
+	 * Step 2: Hash TW into TIMEWAIT chain.
+	 * Should be done before removing sk from established chain
+	 * because readers are lockless and search established first.
+	 */
 	atomic_inc(&tw->tw_refcnt);
+	inet_twsk_add_node_rcu(tw, &ehead->twchain);
-	write_unlock(lock);
+	/* Step 3: Remove SK from established hash. */
+	if (__sk_nulls_del_node_init_rcu(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+	spin_unlock(lock);
 }
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -416,17 +420,17 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
 {
 	struct inet_timewait_sock *tw;
 	struct sock *sk;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	int h;
 	local_bh_disable();
 	for (h = 0; h < (hashinfo->ehash_size); h++) {
 		struct inet_ehash_bucket *head =
 			inet_ehash_bucket(hashinfo, h);
-		rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
 restart:
-		write_lock(lock);
-		sk_for_each(sk, node, &head->twchain) {
+		spin_lock(lock);
+		sk_nulls_for_each(sk, node, &head->twchain) {
 			tw = inet_twsk(sk);
 			if (!net_eq(twsk_net(tw), net) ||
@@ -434,13 +438,13 @@ restart:
 				continue;
 			atomic_inc(&tw->tw_refcnt);
-			write_unlock(lock);
+			spin_unlock(lock);
 			inet_twsk_deschedule(tw, twdr);
 			inet_twsk_put(tw);
 			goto restart;
 		}
-		write_unlock(lock);
+		spin_unlock(lock);
 	}
 	local_bh_enable();
 }
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 191ef758813..0101521f366 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -126,8 +126,6 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
 /* Fallback tunnel: no source, no destination, no key, no options */
-static int ipgre_fb_tunnel_init(struct net_device *dev);
-
 #define HASH_SIZE 16
 static int ipgre_net_id;
@@ -1142,6 +1140,7 @@ static int ipgre_open(struct net_device *dev)
 static int ipgre_close(struct net_device *dev)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
+
 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 		struct in_device *in_dev;
 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
@@ -1155,14 +1154,22 @@ static int ipgre_close(struct net_device *dev)
 #endif
+static const struct net_device_ops ipgre_netdev_ops = {
+	.ndo_init		= ipgre_tunnel_init,
+	.ndo_uninit		= ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+	.ndo_open		= ipgre_open,
+	.ndo_stop		= ipgre_close,
+#endif
+	.ndo_start_xmit		= ipgre_tunnel_xmit,
+	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
+	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+};
+
 static void ipgre_tunnel_setup(struct net_device *dev)
 {
-	dev->init		= ipgre_tunnel_init;
-	dev->uninit		= ipgre_tunnel_uninit;
+	dev->netdev_ops		= &ipgre_netdev_ops;
 	dev->destructor		= free_netdev;
-	dev->hard_start_xmit	= ipgre_tunnel_xmit;
-	dev->do_ioctl		= ipgre_tunnel_ioctl;
-	dev->change_mtu		= ipgre_tunnel_change_mtu;
 	dev->type		= ARPHRD_IPGRE;
 	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1194,8 +1201,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
 			return -EINVAL;
 		dev->flags = IFF_BROADCAST;
 		dev->header_ops = &ipgre_header_ops;
-		dev->open = ipgre_open;
-		dev->stop = ipgre_close;
 	}
 #endif
 	} else
@@ -1204,7 +1209,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	return 0;
 }
-static int ipgre_fb_tunnel_init(struct net_device *dev)
+static void ipgre_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
@@ -1220,7 +1225,6 @@ static int ipgre_fb_tunnel_init(struct net_device *dev)
 	dev_hold(dev);
 	ign->tunnels_wc[0]	= tunnel;
-	return 0;
 }
@@ -1264,9 +1268,9 @@ static int ipgre_init_net(struct net *net)
 		err = -ENOMEM;
 		goto err_alloc_dev;
 	}
-
-	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
 	dev_net_set(ign->fb_tunnel_dev, net);
+
+	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
 	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
 	if ((err = register_netdev(ign->fb_tunnel_dev)))
@@ -1397,16 +1401,22 @@ static int ipgre_tap_init(struct net_device *dev)
 	return 0;
 }
+static const struct net_device_ops ipgre_tap_netdev_ops = {
+	.ndo_init		= ipgre_tap_init,
+	.ndo_uninit		= ipgre_tunnel_uninit,
+	.ndo_start_xmit		= ipgre_tunnel_xmit,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+};
+
 static void ipgre_tap_setup(struct net_device *dev)
 {
 	ether_setup(dev);
-	dev->init		= ipgre_tap_init;
-	dev->uninit		= ipgre_tunnel_uninit;
+	dev->netdev_ops		= &ipgre_netdev_ops;
 	dev->destructor		= free_netdev;
-	dev->hard_start_xmit	= ipgre_tunnel_xmit;
-	dev->change_mtu		= ipgre_tunnel_change_mtu;
 	dev->iflink		= 0;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
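The ip_gre.c hunks above (and the ipip.c and ipmr.c ones below) are mechanical conversions to the then-new net_device_ops: the per-device callback pointers (dev->init, dev->hard_start_xmit, dev->do_ioctl, ...) move into one shared, const ops table. A sketch of the shape of such a conversion, with hypothetical demo_* handlers:

	static const struct net_device_ops demo_netdev_ops = {
		.ndo_init	= demo_dev_init,	/* was dev->init */
		.ndo_uninit	= demo_dev_uninit,	/* was dev->uninit */
		.ndo_start_xmit	= demo_xmit,		/* was dev->hard_start_xmit */
		.ndo_do_ioctl	= demo_ioctl,		/* was dev->do_ioctl */
		.ndo_change_mtu	= demo_change_mtu,	/* was dev->change_mtu */
	};

	static void demo_setup(struct net_device *dev)
	{
		/* one read-only pointer instead of several writable fields */
		dev->netdev_ops = &demo_netdev_ops;
		dev->destructor = free_netdev;
	}

One visible consequence: because ndo_init now lives in a shared table, the fallback tunnel can no longer be given a private dev->init, which is why ipgre_fb_tunnel_init() above (and ipip_fb_tunnel_init() below) is now called directly before register_netdev().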
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 70bedab03b0..1a58a6fa1dc 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -209,9 +209,17 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
 		hash = protocol & (MAX_INET_PROTOS - 1);
 		ipprot = rcu_dereference(inet_protos[hash]);
-		if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) {
+		if (ipprot != NULL) {
 			int ret;
+			if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
+				if (net_ratelimit())
+					printk("%s: proto %d isn't netns-ready\n",
+						__func__, protocol);
+				kfree_skb(skb);
+				goto out;
+			}
+
 			if (!ipprot->no_policy) {
 				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 					kfree_skb(skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 46d7be233ea..8ebe86dd72a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -778,7 +778,7 @@ int ip_append_data(struct sock *sk,
 		   int getfrag(void *from, char *to, int offset, int len,
 			       int odd, struct sk_buff *skb),
 		   void *from, int length, int transhdrlen,
-		   struct ipcm_cookie *ipc, struct rtable *rt,
+		   struct ipcm_cookie *ipc, struct rtable **rtp,
 		   unsigned int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -793,6 +793,7 @@ int ip_append_data(struct sock *sk,
 	int offset = 0;
 	unsigned int maxfraglen, fragheaderlen;
 	int csummode = CHECKSUM_NONE;
+	struct rtable *rt;
 	if (flags&MSG_PROBE)
 		return 0;
@@ -812,7 +813,11 @@ int ip_append_data(struct sock *sk,
 			inet->cork.flags |= IPCORK_OPT;
 			inet->cork.addr = ipc->addr;
 		}
-		dst_hold(&rt->u.dst);
+		rt = *rtp;
+		/*
+		 * We steal reference to this route, caller should not release it
+		 */
+		*rtp = NULL;
 		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 					    rt->u.dst.dev->mtu :
 					    dst_mtu(rt->u.dst.path);
@@ -1279,7 +1284,12 @@ int ip_push_pending_frames(struct sock *sk)
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
-	skb->dst = dst_clone(&rt->u.dst);
+	/*
+	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
+	 * on dst refcount
+	 */
+	inet->cork.dst = NULL;
+	skb->dst = &rt->u.dst;
 	if (iph->protocol == IPPROTO_ICMP)
 		icmp_out_count(net, ((struct icmphdr *)
@@ -1391,7 +1401,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
 	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
-		       &ipc, rt, MSG_DONTWAIT);
+		       &ipc, &rt, MSG_DONTWAIT);
 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
 		if (arg->csumoffset >= 0)
 			*((__sum16 *)skb_transport_header(skb) +
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e976efeb145..43c05854d75 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -48,6 +48,7 @@
 #define IP_CMSG_RECVOPTS	8
 #define IP_CMSG_RETOPTS		16
 #define IP_CMSG_PASSSEC		32
+#define IP_CMSG_ORIGDSTADDR	64
 /*
 *	SOL_IP control messages.
 */
@@ -126,6 +127,27 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
 	security_release_secctx(secdata, seclen);
 }
+static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct sockaddr_in sin;
+	struct iphdr *iph = ip_hdr(skb);
+	__be16 *ports = (__be16 *)skb_transport_header(skb);
+
+	if (skb_transport_offset(skb) + 4 > skb->len)
+		return;
+
+	/* All current transport protocols have the port numbers in the
+	 * first four bytes of the transport header and this function is
+	 * written with this assumption in mind.
+	 */
+
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = iph->daddr;
+	sin.sin_port = ports[1];
+	memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
+}
 void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 {
@@ -160,6 +182,12 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 	if (flags & 1)
 		ip_cmsg_recv_security(msg, skb);
+
+	if ((flags>>=1) == 0)
+		return;
+	if (flags & 1)
+		ip_cmsg_recv_dstaddr(msg, skb);
+
 }
 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
@@ -421,7 +449,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
 			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
 	    optname == IP_MULTICAST_TTL ||
-	    optname == IP_MULTICAST_LOOP) {
+	    optname == IP_MULTICAST_LOOP ||
+	    optname == IP_RECVORIGDSTADDR) {
 		if (optlen >= sizeof(int)) {
 			if (get_user(val, (int __user *) optval))
 				return -EFAULT;
@@ -509,6 +538,12 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		else
 			inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
 		break;
+	case IP_RECVORIGDSTADDR:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
+		break;
 	case IP_TOS:	/* This sets both TOS and Precedence */
 		if (sk->sk_type == SOCK_STREAM) {
 			val &= ~3;
@@ -1022,6 +1057,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_PASSSEC:
 		val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
 		break;
+	case IP_RECVORIGDSTADDR:
+		val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
+		break;
 	case IP_TOS:
 		val = inet->tos;
 		break;
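The ip_sockglue.c hunks above add IP_RECVORIGDSTADDR, letting a (typically TPROXY-redirected) UDP socket recover the original destination address and port as an IP_ORIGDSTADDR control message. A userspace sketch of consuming the new option (error handling trimmed; in real code the setsockopt would be done once at socket setup):

	#include <netinet/in.h>
	#include <string.h>
	#include <sys/socket.h>

	static int recv_with_origdst(int fd, void *buf, size_t len,
				     struct sockaddr_in *orig)
	{
		int on = 1;
		char cbuf[CMSG_SPACE(sizeof(struct sockaddr_in))];
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct msghdr msg = {
			.msg_iov = &iov, .msg_iovlen = 1,
			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
		};
		struct cmsghdr *cmsg;

		setsockopt(fd, SOL_IP, IP_RECVORIGDSTADDR, &on, sizeof(on));
		if (recvmsg(fd, &msg, 0) < 0)
			return -1;
		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
		     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
			if (cmsg->cmsg_level == SOL_IP &&
			    cmsg->cmsg_type == IP_ORIGDSTADDR) {
				/* kernel filled in a sockaddr_in; see
				 * ip_cmsg_recv_dstaddr() above */
				memcpy(orig, CMSG_DATA(cmsg), sizeof(*orig));
				return 0;
			}
		}
		return -1;	/* cmsg absent */
	}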
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index ec8264ae45c..3262ce06294 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -35,7 +35,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 		return;
 	spi = htonl(ntohs(ipch->cpi));
-	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
+	x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr,
 			      spi, IPPROTO_COMP, AF_INET);
 	if (!x)
 		return;
@@ -49,7 +49,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
 {
 	struct xfrm_state *t;
-	t = xfrm_state_alloc();
+	t = xfrm_state_alloc(&init_net);
 	if (t == NULL)
 		goto out;
@@ -85,7 +85,7 @@ static int ipcomp_tunnel_attach(struct xfrm_state *x)
 	int err = 0;
 	struct xfrm_state *t;
-	t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
+	t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4,
 			      x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
 	if (!t) {
 		t = ipcomp_tunnel_create(x);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index b3c3d7b0d11..5079dfbc6f3 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -130,8 +130,8 @@ struct ipip_net {
 	struct net_device *fb_tunnel_dev;
 };
-static int ipip_fb_tunnel_init(struct net_device *dev);
-static int ipip_tunnel_init(struct net_device *dev);
+static void ipip_fb_tunnel_init(struct net_device *dev);
+static void ipip_tunnel_init(struct net_device *dev);
 static void ipip_tunnel_setup(struct net_device *dev);
 static DEFINE_RWLOCK(ipip_lock);
@@ -245,9 +245,10 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 	}
 	nt = netdev_priv(dev);
-	dev->init = ipip_tunnel_init;
 	nt->parms = *parms;
+	ipip_tunnel_init(dev);
+
 	if (register_netdevice(dev) < 0)
 		goto failed_free;
@@ -691,12 +692,17 @@ static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
+static const struct net_device_ops ipip_netdev_ops = {
+	.ndo_uninit	= ipip_tunnel_uninit,
+	.ndo_start_xmit	= ipip_tunnel_xmit,
+	.ndo_do_ioctl	= ipip_tunnel_ioctl,
+	.ndo_change_mtu	= ipip_tunnel_change_mtu,
+
+};
+
 static void ipip_tunnel_setup(struct net_device *dev)
 {
-	dev->uninit		= ipip_tunnel_uninit;
-	dev->hard_start_xmit	= ipip_tunnel_xmit;
-	dev->do_ioctl		= ipip_tunnel_ioctl;
-	dev->change_mtu		= ipip_tunnel_change_mtu;
+	dev->netdev_ops		= &ipip_netdev_ops;
 	dev->destructor		= free_netdev;
 	dev->type		= ARPHRD_TUNNEL;
@@ -708,11 +714,9 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 }
-static int ipip_tunnel_init(struct net_device *dev)
+static void ipip_tunnel_init(struct net_device *dev)
 {
-	struct ip_tunnel *tunnel;
-
-	tunnel = netdev_priv(dev);
+	struct ip_tunnel *tunnel = netdev_priv(dev);
 	tunnel->dev = dev;
 	strcpy(tunnel->parms.name, dev->name);
@@ -721,11 +725,9 @@ static int ipip_tunnel_init(struct net_device *dev)
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 	ipip_tunnel_bind_dev(dev);
-
-	return 0;
 }
-static int ipip_fb_tunnel_init(struct net_device *dev)
+static void ipip_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
@@ -740,7 +742,6 @@ static int ipip_fb_tunnel_init(struct net_device *dev)
 	dev_hold(dev);
 	ipn->tunnels_wc[0] = tunnel;
-	return 0;
 }
 static struct xfrm_tunnel ipip_handler = {
@@ -792,10 +793,10 @@ static int ipip_init_net(struct net *net)
 		err = -ENOMEM;
 		goto err_alloc_dev;
 	}
-
-	ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init;
 	dev_net_set(ipn->fb_tunnel_dev, net);
+	ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+
 	if ((err = register_netdev(ipn->fb_tunnel_dev)))
 		goto err_reg_dev;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 05ed336f798..77fc4d3fdf6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,8 +124,8 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
 	dev = __dev_get_by_name(&init_net, "tunl0");
 	if (dev) {
+		const struct net_device_ops *ops = dev->netdev_ops;
 		struct ifreq ifr;
-		mm_segment_t oldfs;
 		struct ip_tunnel_parm p;
 		memset(&p, 0, sizeof(p));
@@ -137,9 +137,13 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
-		oldfs = get_fs(); set_fs(KERNEL_DS);
-		dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL);
-		set_fs(oldfs);
+		if (ops->ndo_do_ioctl) {
+			mm_segment_t oldfs = get_fs();
+
+			set_fs(KERNEL_DS);
+			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
+			set_fs(oldfs);
+		}
 	}
 }
@@ -151,9 +155,9 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
 	dev = __dev_get_by_name(&init_net, "tunl0");
 	if (dev) {
+		const struct net_device_ops *ops = dev->netdev_ops;
 		int err;
 		struct ifreq ifr;
-		mm_segment_t oldfs;
 		struct ip_tunnel_parm p;
 		struct in_device *in_dev;
@@ -166,9 +170,14 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
 		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
 		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
-		oldfs = get_fs(); set_fs(KERNEL_DS);
-		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
-		set_fs(oldfs);
+		if (ops->ndo_do_ioctl) {
+			mm_segment_t oldfs = get_fs();
+
+			set_fs(KERNEL_DS);
+			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+			set_fs(oldfs);
+		} else
+			err = -EOPNOTSUPP;
 		dev = NULL;
@@ -213,12 +222,16 @@ static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
+static const struct net_device_ops reg_vif_netdev_ops = {
+	.ndo_start_xmit	= reg_vif_xmit,
+};
+
 static void reg_vif_setup(struct net_device *dev)
 {
 	dev->type		= ARPHRD_PIMREG;
 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
 	dev->flags		= IFF_NOARP;
-	dev->hard_start_xmit	= reg_vif_xmit;
+	dev->netdev_ops		= &reg_vif_netdev_ops,
 	dev->destructor		= free_netdev;
 }
@@ -1945,13 +1958,14 @@ int __init ip_mr_init(void)
 		goto proc_cache_fail;
 #endif
 	return 0;
-reg_notif_fail:
-	kmem_cache_destroy(mrt_cachep);
 #ifdef CONFIG_PROC_FS
-proc_vif_fail:
-	unregister_netdevice_notifier(&ip_mr_notifier);
 proc_cache_fail:
 	proc_net_remove(&init_net, "ip_mr_vif");
+proc_vif_fail:
+	unregister_netdevice_notifier(&ip_mr_notifier);
 #endif
+reg_notif_fail:
+	del_timer(&ipmr_expire_timer);
+	kmem_cache_destroy(mrt_cachep);
 	return err;
 }
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 7c145d76384..fdf6811c31a 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -66,7 +66,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 #ifdef CONFIG_XFRM
 	if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
 	    xfrm_decode_session(skb, &fl, AF_INET) == 0)
-		if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0))
+		if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0))
 			return -1;
 #endif
@@ -97,7 +97,7 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
 		dst = ((struct xfrm_dst *)dst)->route;
 	dst_hold(dst);
-	if (xfrm_lookup(&dst, &fl, skb->sk, 0) < 0)
+	if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0)
 		return -1;
 	dst_release(skb->dst);
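The ipmr.c hunks above now reach the tunnel driver's ioctl through dev->netdev_ops->ndo_do_ioctl with an ifreq that points into kernel memory, which is why the call is bracketed by set_fs(KERNEL_DS): the handler's copy_from_user() must be allowed to read a kernel pointer. The pattern, reduced to a hedged sketch (hypothetical wrapper name; this is the pre-5.10, set_fs()-era idiom this patch lives in):

	static int demo_kernel_ioctl(struct net_device *dev, struct ifreq *ifr,
				     int cmd)
	{
		const struct net_device_ops *ops = dev->netdev_ops;
		mm_segment_t oldfs;
		int err;

		if (!ops->ndo_do_ioctl)
			return -EOPNOTSUPP;	/* driver has no ioctl handler */

		oldfs = get_fs();	/* remember the current address limit */
		set_fs(KERNEL_DS);	/* let copy_from_user() accept kernel pointers */
		err = ops->ndo_do_ioctl(dev, ifr, cmd);
		set_fs(oldfs);		/* always restore before returning */
		return err;
	}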
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8f5a403f6f6..614958b7c27 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,8 +54,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
 		   sock_prot_inuse_get(net, &tcp_prot),
-		   atomic_read(&tcp_orphan_count),
-		   tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
+		   (int)percpu_counter_sum_positive(&tcp_orphan_count),
+		   tcp_death_row.tw_count,
+		   (int)percpu_counter_sum_positive(&tcp_sockets_allocated),
 		   atomic_read(&tcp_memory_allocated));
 	seq_printf(seq, "UDP: inuse %d mem %d\n",
 		   sock_prot_inuse_get(net, &udp_prot),
@@ -234,46 +235,51 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
 	SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
 	SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+	SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
+	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
+	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
 	SNMP_MIB_SENTINEL
 };
+static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
+			     unsigned short *type, int count)
+{
+	int j;
+
+	if (count) {
+		seq_printf(seq, "\nIcmpMsg:");
+		for (j = 0; j < count; ++j)
+			seq_printf(seq, " %sType%u",
+				type[j] & 0x100 ? "Out" : "In",
+				type[j] & 0xff);
+		seq_printf(seq, "\nIcmpMsg:");
+		for (j = 0; j < count; ++j)
+			seq_printf(seq, " %lu", vals[j]);
+	}
+}
+
 static void icmpmsg_put(struct seq_file *seq)
 {
 #define PERLINE	16
-	int j, i, count;
-	static int out[PERLINE];
+	int i, count;
+	unsigned short type[PERLINE];
+	unsigned long vals[PERLINE], val;
 	struct net *net = seq->private;
 	count = 0;
 	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
-
-		if (snmp_fold_field((void **) net->mib.icmpmsg_statistics, i))
-			out[count++] = i;
-		if (count < PERLINE)
-			continue;
-
-		seq_printf(seq, "\nIcmpMsg:");
-		for (j = 0; j < PERLINE; ++j)
-			seq_printf(seq, " %sType%u", i & 0x100 ? "Out" : "In",
-					i & 0xff);
-		seq_printf(seq, "\nIcmpMsg: ");
-		for (j = 0; j < PERLINE; ++j)
-			seq_printf(seq, " %lu",
-				snmp_fold_field((void **) net->mib.icmpmsg_statistics,
-				out[j]));
-		seq_putc(seq, '\n');
-	}
-	if (count) {
-		seq_printf(seq, "\nIcmpMsg:");
-		for (j = 0; j < count; ++j)
-			seq_printf(seq, " %sType%u", out[j] & 0x100 ? "Out" :
-					"In", out[j] & 0xff);
-		seq_printf(seq, "\nIcmpMsg:");
-		for (j = 0; j < count; ++j)
-			seq_printf(seq, " %lu", snmp_fold_field((void **)
-				net->mib.icmpmsg_statistics, out[j]));
+		val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i);
+		if (val) {
+			type[count] = i;
+			vals[count++] = val;
+		}
+		if (count == PERLINE) {
+			icmpmsg_put_line(seq, vals, type, count);
+			count = 0;
+		}
 	}
+	icmpmsg_put_line(seq, vals, type, count);
 #undef PERLINE
 }
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 998fcffc9e1..dff8bc4e0fa 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -572,7 +572,7 @@ back_from_confirm:
 		ipc.addr = rt->rt_dst;
 	lock_sock(sk);
 	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
-			     &ipc, rt, msg->msg_flags);
+			     &ipc, &rt, msg->msg_flags);
 	if (err)
 		ip_flush_pending_frames(sk);
 	else if (!(msg->msg_flags & MSG_MORE))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0dc0c382676..77bfba97595 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -160,7 +160,6 @@ static struct dst_ops ipv4_dst_ops = {
 	.link_failure		= ipv4_link_failure,
 	.update_pmtu		= ip_rt_update_pmtu,
 	.local_out		= __ip_local_out,
-	.entry_size		= sizeof(struct rtable),
 	.entries		= ATOMIC_INIT(0),
 };
@@ -2701,7 +2700,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 	.destroy		= ipv4_dst_destroy,
 	.check			= ipv4_dst_check,
 	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
-	.entry_size		= sizeof(struct rtable),
 	.entries		= ATOMIC_INIT(0),
 };
@@ -2763,7 +2761,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
 			flp->fl4_src = (*rp)->rt_src;
 		if (!flp->fl4_dst)
 			flp->fl4_dst = (*rp)->rt_dst;
-		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
+		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
 				    flags ? XFRM_LOOKUP_WAIT : 0);
 		if (err == -EREMOTE)
 			err = ipv4_dst_blackhole(net, rp, flp);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 60c28add96b..01924340862 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,8 +277,7 @@
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
-atomic_t tcp_orphan_count = ATOMIC_INIT(0);
-
+struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 int sysctl_tcp_mem[3] __read_mostly;
@@ -290,9 +289,12 @@ EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
-atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
-
 EXPORT_SYMBOL(tcp_memory_allocated);
+
+/*
+ * Current number of TCP sockets.
+ */
+struct percpu_counter tcp_sockets_allocated;
 EXPORT_SYMBOL(tcp_sockets_allocated);
 /*
@@ -1374,8 +1376,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			    sk->sk_state == TCP_CLOSE ||
 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
 			    !timeo ||
-			    signal_pending(current) ||
-			    (flags & MSG_PEEK))
+			    signal_pending(current))
 				break;
 		} else {
 			if (sock_flag(sk, SOCK_DONE))
@@ -1835,7 +1836,7 @@ adjudge_to_death:
 	state = sk->sk_state;
 	sock_hold(sk);
 	sock_orphan(sk);
-	atomic_inc(sk->sk_prot->orphan_count);
+	percpu_counter_inc(sk->sk_prot->orphan_count);
 	/* It is the last release_sock in its life. It will remove backlog. */
 	release_sock(sk);
@@ -1886,9 +1887,11 @@ adjudge_to_death:
 	}
 	if (sk->sk_state != TCP_CLOSE) {
+		int orphan_count = percpu_counter_read_positive(
+						sk->sk_prot->orphan_count);
+
 		sk_mem_reclaim(sk);
-		if (tcp_too_many_orphans(sk,
-				atomic_read(sk->sk_prot->orphan_count))) {
+		if (tcp_too_many_orphans(sk, orphan_count)) {
 			if (net_ratelimit())
 				printk(KERN_INFO "TCP: too many of orphaned "
 				       "sockets\n");
@@ -2686,6 +2689,8 @@ void __init tcp_init(void)
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
+	percpu_counter_init(&tcp_sockets_allocated, 0);
+	percpu_counter_init(&tcp_orphan_count, 0);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
@@ -2708,8 +2713,8 @@ void __init tcp_init(void)
 					thash_entries ? 0 : 512 * 1024);
 	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
 	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
-		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
-		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
+		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
 	}
 	if (inet_ehash_locks_alloc(&tcp_hashinfo))
 		panic("TCP: failed to alloc ehash_locks");
*/ release_sock(sk); @@ -1886,9 +1887,11 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { + int orphan_count = percpu_counter_read_positive( + sk->sk_prot->orphan_count); + sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, - atomic_read(sk->sk_prot->orphan_count))) { + if (tcp_too_many_orphans(sk, orphan_count)) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); @@ -2686,6 +2689,8 @@ void __init tcp_init(void) BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); + percpu_counter_init(&tcp_sockets_allocated, 0); + percpu_counter_init(&tcp_orphan_count, 0); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, @@ -2708,8 +2713,8 @@ void __init tcp_init(void) thash_entries ? 0 : 512 * 1024); tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; for (i = 0; i < tcp_hashinfo.ehash_size; i++) { - INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); - INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); + INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); + INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); } if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 838d491dfda..fcbcd4ff6c5 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -34,7 +34,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, tcp_get_info(sk, info); } -static struct inet_diag_handler tcp_diag_handler = { +static const struct inet_diag_handler tcp_diag_handler = { .idiag_hashinfo = &tcp_hashinfo, .idiag_get_info = tcp_diag_get_info, .idiag_type = TCPDIAG_GETSOCK, diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index af99776146f..937549b8a92 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -69,9 +69,12 @@ static u32 htcp_cwnd_undo(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - ca->last_cong = ca->undo_last_cong; - ca->maxRTT = ca->undo_maxRTT; - ca->old_maxB = ca->undo_old_maxB; + if (ca->undo_last_cong) { + ca->last_cong = ca->undo_last_cong; + ca->maxRTT = ca->undo_maxRTT; + ca->old_maxB = ca->undo_old_maxB; + ca->undo_last_cong = 0; + } return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); } @@ -268,7 +271,10 @@ static void htcp_state(struct sock *sk, u8 new_state) case TCP_CA_Open: { struct htcp *ca = inet_csk_ca(sk); - ca->last_cong = jiffies; + if (ca->undo_last_cong) { + ca->last_cong = jiffies; + ca->undo_last_cong = 0; + } } break; case TCP_CA_CWR: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 097294b7da3..d67b6e9cc54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1002,7 +1002,8 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) } } -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, + struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); @@ -1241,26 +1242,47 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, * aligned portion of it that matches. Therefore we might need to fragment * which may fail and creates some hassle (caller must handle error case * returns). 
+ * + * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq) { int in_sack, err; unsigned int pkt_len; + unsigned int mss; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (tcp_skb_pcount(skb) > 1 && !in_sack && after(TCP_SKB_CB(skb)->end_seq, start_seq)) { - + mss = tcp_skb_mss(skb); in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); - if (!in_sack) + if (!in_sack) { pkt_len = start_seq - TCP_SKB_CB(skb)->seq; - else + if (pkt_len < mss) + pkt_len = mss; + } else { pkt_len = end_seq - TCP_SKB_CB(skb)->seq; - err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); + if (pkt_len < mss) + return -EINVAL; + } + + /* Round if necessary so that SACKs cover only full MSSes + * and/or the remaining small portion (if present) + */ + if (pkt_len > mss) { + unsigned int new_len = (pkt_len / mss) * mss; + if (!in_sack && new_len < pkt_len) { + new_len += mss; + if (new_len > skb->len) + return 0; + } + pkt_len = new_len; + } + err = tcp_fragment(sk, skb, pkt_len, mss); if (err < 0) return err; } @@ -1269,7 +1291,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, } static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, - int *reord, int dup_sack, int fack_count) + int *reord, int dup_sack, int fack_count, + u8 *sackedto, int pcount) { struct tcp_sock *tp = tcp_sk(sk); u8 sacked = TCP_SKB_CB(skb)->sacked; @@ -1294,10 +1317,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, * that retransmission is still in flight. */ if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= - ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); - tp->lost_out -= tcp_skb_pcount(skb); - tp->retrans_out -= tcp_skb_pcount(skb); + *sackedto &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= pcount; + tp->retrans_out -= pcount; } } else { if (!(sacked & TCPCB_RETRANS)) { @@ -1314,48 +1336,280 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, } if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - tp->lost_out -= tcp_skb_pcount(skb); + *sackedto &= ~TCPCB_LOST; + tp->lost_out -= pcount; } } - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + *sackedto |= TCPCB_SACKED_ACKED; flag |= FLAG_DATA_SACKED; - tp->sacked_out += tcp_skb_pcount(skb); + tp->sacked_out += pcount; - fack_count += tcp_skb_pcount(skb); + fack_count += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) - tp->lost_cnt_hint += tcp_skb_pcount(skb); + tp->lost_cnt_hint += pcount; if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; - - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - tcp_advance_highest_sack(sk, skb); } /* D-SACK. We can detect redundant retransmission in S|R and plain R * frames and clear it. undo_retrans is decreased above, L|R frames * are accounted above as well. 
*/ - if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); + if (dup_sack && (*sackedto & TCPCB_SACKED_RETRANS)) { + *sackedto &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= pcount; } return flag; } +static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, unsigned int pcount, + int shifted, int fack_count, int *reord, + int *flag, int mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 dummy_sacked = TCP_SKB_CB(skb)->sacked; /* We discard results */ + + BUG_ON(!pcount); + + /* Tweak before seqno plays */ + if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint && + !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) + tp->lost_cnt_hint += pcount; + + TCP_SKB_CB(prev)->end_seq += shifted; + TCP_SKB_CB(skb)->seq += shifted; + + skb_shinfo(prev)->gso_segs += pcount; + BUG_ON(skb_shinfo(skb)->gso_segs < pcount); + skb_shinfo(skb)->gso_segs -= pcount; + + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep + * setting gso_size to something. + */ + if (!skb_shinfo(prev)->gso_size) { + skb_shinfo(prev)->gso_size = mss; + skb_shinfo(prev)->gso_type = sk->sk_gso_type; + } + + /* CHECKME: To clear or not to clear? Mimics normal skb currently */ + if (skb_shinfo(skb)->gso_segs <= 1) { + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; + } + + *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked, + pcount); + + /* Difference in this won't matter, both ACKed by the same cumul. ACK */ + TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); + + if (skb->len > 0) { + BUG_ON(!tcp_skb_pcount(skb)); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); + return 0; + } + + /* Whole SKB was eaten :-) */ + + if (skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = prev; + if (skb == tp->scoreboard_skb_hint) + tp->scoreboard_skb_hint = prev; + if (skb == tp->lost_skb_hint) { + tp->lost_skb_hint = prev; + tp->lost_cnt_hint -= tcp_skb_pcount(prev); + } + + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; + if (skb == tcp_highest_sack(sk)) + tcp_advance_highest_sack(sk, skb); + + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); + + return 1; +} + +/* I wish gso_size would have a bit more sane initialization than + * something-or-zero which complicates things + */ +static int tcp_shift_mss(struct sk_buff *skb) +{ + int mss = tcp_skb_mss(skb); + + if (!mss) + mss = skb->len; + + return mss; +} + +/* Shifting pages past head area doesn't work */ +static int skb_can_shift(struct sk_buff *skb) +{ + return !skb_headlen(skb) && skb_is_nonlinear(skb); +} + +/* Try collapsing SACK blocks spanning across multiple skbs to a single + * skb. 
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+					  u32 start_seq, u32 end_seq,
+					  int dup_sack, int *fack_count,
+					  int *reord, int *flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev;
+	int mss;
+	int pcount = 0;
+	int len;
+	int in_sack;
+
+	if (!sk_can_gso(sk))
+		goto fallback;
+
+	/* Normally R but no L won't result in plain S */
+	if (!dup_sack &&
+	    (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+		goto fallback;
+	if (!skb_can_shift(skb))
+		goto fallback;
+	/* This frame is about to be dropped (was ACKed). */
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+		goto fallback;
+
+	/* Can only happen with delayed DSACK + discard craziness */
+	if (unlikely(skb == tcp_write_queue_head(sk)))
+		goto fallback;
+	prev = tcp_write_queue_prev(sk, skb);
+
+	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto fallback;
+
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (in_sack) {
+		len = skb->len;
+		pcount = tcp_skb_pcount(skb);
+		mss = tcp_shift_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_shift_mss(prev))
+			goto fallback;
+	} else {
+		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+			goto noop;
+		/* CHECKME: Is this the non-MSS split case only? This will
+		 * cause skipped skbs due to the advancing loop btw, the
+		 * original has that feature too
+		 */
+		if (tcp_skb_pcount(skb) <= 1)
+			goto noop;
+
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+		if (!in_sack) {
+			/* TODO: head merge to next could be attempted here
+			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+			 * though it might not be worth the additional hassle
+			 *
+			 * ...we can probably just fall back to what was done
+			 * previously. We could try merging non-SACKed ones
+			 * as well but it probably isn't going to pay off
+			 * because later SACKs might again split them, and
+			 * it would make skb timestamp tracking a considerably
+			 * harder problem.
+			 */
+			goto fallback;
+		}
+
+		len = end_seq - TCP_SKB_CB(skb)->seq;
+		BUG_ON(len < 0);
+		BUG_ON(len > skb->len);
+
+		/* MSS boundaries should be honoured or else pcount will
+		 * severely break even though it makes things a bit trickier.
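Once the shiftable byte count is known, it still has to be expressed in whole MSSes (continued right below). A minimal model of that pcount computation, assuming len and mss are positive; shift_pcount() is a hypothetical name:

/* Common cases (exactly one segment, or too little data) avoid the
 * division entirely; otherwise round len down to full MSSes.
 */
static unsigned int shift_pcount(unsigned int len, unsigned int mss,
				 unsigned int *out_len)
{
	unsigned int pcount;

	if (len == mss) {
		pcount = 1;
	} else if (len < mss) {
		pcount = 0;		/* caller should give up */
	} else {
		pcount = len / mss;
		len = pcount * mss;
	}
	*out_len = len;
	return pcount;
}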
+		 * Optimize the common case to avoid most of the divides.
+		 */
+		mss = tcp_skb_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_shift_mss(prev))
+			goto fallback;
+
+		if (len == mss) {
+			pcount = 1;
+		} else if (len < mss) {
+			goto noop;
+		} else {
+			pcount = len / mss;
+			len = pcount * mss;
+		}
+	}
+
+	if (!skb_shift(prev, skb, len))
+		goto fallback;
+	if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+			     flag, mss))
+		goto out;
+
+	/* Hole filled allows collapsing with the next as well, this is very
+	 * useful when a hole on every nth skb pattern happens
+	 */
+	if (prev == tcp_write_queue_tail(sk))
+		goto out;
+	skb = tcp_write_queue_next(sk, prev);
+
+	if (!skb_can_shift(skb))
+		goto out;
+	if (skb == tcp_send_head(sk))
+		goto out;
+	if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto out;
+
+	len = skb->len;
+	if (skb_shift(prev, skb, len)) {
+		pcount += tcp_skb_pcount(skb);
+		tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+				*fack_count, reord, flag, mss);
+	}
+
+out:
+	*fack_count += pcount;
+	return prev;
+
+noop:
+	return skb;
+
+fallback:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+	return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
					struct tcp_sack_block *next_dup,
					u32 start_seq, u32 end_seq,
					int dup_sack_in, int *fack_count,
					int *reord, int *flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *tmp;
+
	tcp_for_write_queue_from(skb, sk) {
		int in_sack = 0;
		int dup_sack = dup_sack_in;
@@ -1376,15 +1630,41 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
			dup_sack = 1;
		}
 
-		if (in_sack <= 0)
-			in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-							end_seq);
+		/* skb reference here is a bit tricky to get right, since
+		 * shifting can eat and free both this skb and the next,
+		 * so not even the _safe variant of the loop is enough.
+		 */
+		if (in_sack <= 0) {
+			tmp = tcp_shift_skb_data(sk, skb, start_seq,
+						 end_seq, dup_sack,
+						 fack_count, reord, flag);
+			if (tmp != NULL) {
+				if (tmp != skb) {
+					skb = tmp;
+					continue;
+				}
+
+				in_sack = 0;
+			} else {
+				in_sack = tcp_match_skb_to_sack(sk, skb,
								start_seq,
								end_seq);
+			}
+		}
+
		if (unlikely(in_sack < 0))
			break;
 
-		if (in_sack)
+		if (in_sack) {
			*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
-						 *fack_count);
+						 *fack_count,
+						 &(TCP_SKB_CB(skb)->sacked),
+						 tcp_skb_pcount(skb));
+
+			if (!before(TCP_SKB_CB(skb)->seq,
+				    tcp_highest_sack_seq(tp)))
+				tcp_advance_highest_sack(sk, skb);
+		}
 
		*fack_count += tcp_skb_pcount(skb);
	}
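The walker comment above warns that shifting can free the very node the loop stands on. The same contract on a toy singly-linked list (hypothetical helpers, not the kernel list API): the visit step returns the node to continue from, and the freed node is never touched again.

#include <stdlib.h>

struct node { struct node *next; int len; };

/* Merge n into prev when it fits; returns the node to continue from,
 * freeing n if it was absorbed.
 */
static struct node *try_merge(struct node *prev, struct node *n)
{
	if (prev && prev->len + n->len <= 10) {
		prev->len += n->len;
		prev->next = n->next;
		free(n);
		return prev;
	}
	return n;
}

static void walk(struct node *head)
{
	struct node *prev = NULL, *n = head;

	while (n) {
		struct node *cur = try_merge(prev, n);
		prev = cur;
		n = cur->next;	/* safe: cur is never the freed node */
	}
}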
@@ -1401,7 +1681,7 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
		if (skb == tcp_send_head(sk))
			break;
 
-		if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+		if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
			break;
 
		*fack_count += tcp_skb_pcount(skb);
@@ -1660,7 +1940,7 @@ out:
 /* Limits sacked_out so that sum with lost_out isn't ever larger than
  * packets_out. Returns zero if sacked_out adjustment wasn't necessary.
  */
-int tcp_limit_reno_sacked(struct tcp_sock *tp)
+static int tcp_limit_reno_sacked(struct tcp_sock *tp)
 {
	u32 holes;
@@ -2559,6 +2839,56 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 }
 
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery.
+ * The socket is already locked here.
+ */
+void tcp_simple_retransmit(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	unsigned int mss = tcp_current_mss(sk, 0);
+	u32 prior_lost = tp->lost_out;
+
+	tcp_for_write_queue(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+		if (skb->len > mss &&
+		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+			}
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
+		}
+	}
+
+	tcp_clear_retrans_hints_partial(tp);
+
+	if (prior_lost == tp->lost_out)
+		return;
+
+	if (tcp_is_reno(tp))
+		tcp_limit_reno_sacked(tp);
+
+	tcp_verify_left_out(tp);
+
+	/* Don't muck with the congestion window here.
+	 * Reason is that we do not increase the amount of _data_
+	 * in the network, but units changed and the effective
+	 * cwnd/ssthresh really shrank.
+	 */
+	if (icsk->icsk_ca_state != TCP_CA_Loss) {
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		tp->prior_ssthresh = 0;
+		tp->undo_marker = 0;
+		tcp_set_ca_state(sk, TCP_CA_Loss);
+	}
+	tcp_xmit_retransmit_queue(sk);
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
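tcp_simple_retransmit() re-marks oversized, not-yet-SACKed segments as lost after the path MTU shrank, without touching the backoff machinery. A rough userspace model of that marking pass, assuming plain arrays instead of the write queue (names are illustrative only):

/* Count segments that would be marked lost: larger than the new MSS
 * and not already SACKed, mirroring the loop above.
 */
static int mark_oversized_lost(const unsigned int *len, const int *sacked,
			       int n, unsigned int new_mss)
{
	int i, lost = 0;

	for (i = 0; i < n; i++)
		if (len[i] > new_mss && !sacked[i])
			lost++;	/* kernel: tcp_skb_mark_lost_uncond_verify() */
	return lost;
}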
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d49233f409b..26b9030747c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 }
 #endif
 
-struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
-	.lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
-	.lhash_users = ATOMIC_INIT(0),
-	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
-};
+struct inet_hashinfo tcp_hashinfo;
 
 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 {
@@ -492,7 +488,7 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
-					 csum_partial((char *)th,
+					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
@@ -726,7 +722,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
	th->check = tcp_v4_check(skb->len, ireq->loc_addr,
				 ireq->rmt_addr,
-				 csum_partial((char *)th, skb->len,
+				 csum_partial(th, skb->len,
					      skb->csum));
 
	err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
@@ -1801,7 +1797,7 @@ static int tcp_v4_init_sock(struct sock *sk)
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
-	atomic_inc(&tcp_sockets_allocated);
+	percpu_counter_inc(&tcp_sockets_allocated);
 
	return 0;
 }
@@ -1849,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
		sk->sk_sndmsg_page = NULL;
	}
 
-	atomic_dec(&tcp_sockets_allocated);
+	percpu_counter_dec(&tcp_sockets_allocated);
 }
 
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -1857,32 +1853,35 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCP sock list dumping. */
 
-static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
 {
-	return hlist_empty(head) ? NULL :
+	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
 }
 
 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 {
-	return tw->tw_node.next ?
-		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
+	return !is_a_nulls(tw->tw_node.next) ?
+		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
 }
 
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
	struct inet_connection_sock *icsk;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
	struct sock *sk = cur;
+	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
 
	if (!sk) {
		st->bucket = 0;
-		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
+		ilb = &tcp_hashinfo.listening_hash[0];
+		spin_lock_bh(&ilb->lock);
+		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
-
+	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
 
	if (st->state == TCP_SEQ_STATE_OPENREQ) {
@@ -1915,7 +1914,7 @@ get_req:
		sk = sk_next(sk);
	}
get_sk:
-	sk_for_each_from(sk, node) {
+	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
			cur = sk;
			goto out;
		}
@@ -1932,8 +1931,11 @@ start_req:
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
+	spin_unlock_bh(&ilb->lock);
	if (++st->bucket < INET_LHTABLE_SIZE) {
-		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
+		ilb = &tcp_hashinfo.listening_hash[st->bucket];
+		spin_lock_bh(&ilb->lock);
+		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
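The iterator now takes one inet_listen_hashbucket spinlock at a time instead of the old global lhash rwlock, so a walker contends only on the bucket it is reading. A sketch of that per-bucket pattern with pthread mutexes standing in for kernel spinlocks (all types hypothetical):

#include <pthread.h>

struct lnode { struct lnode *next; };

struct listen_bucket {
	pthread_mutex_t lock;	/* one lock per bucket, not per table */
	struct lnode *head;
};

static void for_each_in_bucket(struct listen_bucket *b,
			       void (*fn)(struct lnode *))
{
	pthread_mutex_lock(&b->lock);
	for (struct lnode *n = b->head; n; n = n->next)
		fn(n);
	pthread_mutex_unlock(&b->lock);
}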
@@ -1954,8 +1956,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 
 static inline int empty_bucket(struct tcp_iter_state *st)
 {
-	return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
-		hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
+		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
 }
 
 static void *established_get_first(struct seq_file *seq)
@@ -1966,16 +1968,16 @@ static void *established_get_first(struct seq_file *seq)
	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
-		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
+		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
 
		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;
 
-		read_lock_bh(lock);
-		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+		spin_lock_bh(lock);
+		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
@@ -1993,7 +1995,7 @@ static void *established_get_first(struct seq_file *seq)
			rc = tw;
			goto out;
		}
-		read_unlock_bh(lock);
+		spin_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
@@ -2004,7 +2006,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 {
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
 
@@ -2021,7 +2023,7 @@ get_tw:
			cur = tw;
			goto out;
		}
-		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
		/* Look for next non empty bucket */
@@ -2031,12 +2033,12 @@ get_tw:
		if (st->bucket >= tcp_hashinfo.ehash_size)
			return NULL;
 
-		read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
-		sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
+		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
-		sk = sk_next(sk);
+		sk = sk_nulls_next(sk);
 
-	sk_for_each_from(sk, node) {
+	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}
@@ -2066,12 +2068,10 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
	void *rc;
	struct tcp_iter_state *st = seq->private;
 
-	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);
 
	if (!rc) {
-		inet_listen_unlock(&tcp_hashinfo);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}
@@ -2103,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
-			inet_listen_unlock(&tcp_hashinfo);
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc = established_get_first(seq);
		}
@@ -2130,12 +2129,12 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
	}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
-			inet_listen_unlock(&tcp_hashinfo);
+			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
-			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
 }
@@ -2375,6 +2374,7 @@ struct proto tcp_prot = {
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
+	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
@@ -2404,6 +2404,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
 
 void __init tcp_v4_init(void)
 {
+	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_device(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
 }
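SLAB_DESTROY_BY_RCU, set on tcp_prot above, means a socket found under rcu_read_lock() may already have been freed and recycled for another connection; lookups may therefore take a reference only via an inc-not-zero and must re-validate identity afterwards. A userspace sketch of that rule with C11 atomics (hypothetical struct, not struct sock):

#include <stdatomic.h>
#include <stdbool.h>

struct obj { atomic_int refcnt; int key; };

/* Grab a reference only if the object is still live, then re-check
 * that it is still the object we were looking for.
 */
static bool get_ref_checked(struct obj *o, int key)
{
	int old = atomic_load(&o->refcnt);

	do {
		if (old == 0)
			return false;		/* being freed or recycled */
	} while (!atomic_compare_exchange_weak(&o->refcnt, &old, old + 1));

	if (o->key != key) {			/* reused: drop and retry */
		atomic_fetch_sub(&o->refcnt, 1);
		return false;
	}
	return true;
}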
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a524627923a..76f840917bc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -722,7 +722,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
				 unsigned int mss_now)
 {
-	if (skb->len <= mss_now || !sk_can_gso(sk)) {
+	if (skb->len <= mss_now || !sk_can_gso(sk) ||
+	    tcp_urg_mode(tcp_sk(sk))) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
@@ -1163,7 +1164,9 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
 {
	int tso_segs = tcp_skb_pcount(skb);
 
-	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
+	if (!tso_segs ||
+	    (tso_segs > 1 && (tcp_skb_mss(skb) != mss_now ||
+			      tcp_urg_mode(tcp_sk(sk))))) {
		tcp_set_skb_tso_segs(sk, skb, mss_now);
		tso_segs = tcp_skb_pcount(skb);
	}
@@ -1766,46 +1769,22 @@ u32 __tcp_select_window(struct sock *sk)
	return window;
 }
 
-/* Attempt to collapse two adjacent SKB's during retransmission. */
-static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
-				     int mss_now)
+/* Collapses two adjacent SKBs during retransmission.
+ */
+static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
	int skb_size, next_skb_size;
	u16 flags;
 
-	/* The first test we must make is that neither of these two
-	 * SKB's are still referenced by someone else.
-	 */
-	if (skb_cloned(skb) || skb_cloned(next_skb))
-		return;
 
	skb_size = skb->len;
	next_skb_size = next_skb->len;
	flags = TCP_SKB_CB(skb)->flags;
 
-	/* Also punt if next skb has been SACK'd. */
-	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
-		return;
-
-	/* Next skb is out of window. */
-	if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp)))
-		return;
-
-	/* Punt if not enough space exists in the first SKB for
-	 * the data in the second, or the total combined payload
-	 * would exceed the MSS.
-	 */
-	if ((next_skb_size > skb_tailroom(skb)) ||
-	    ((skb_size + next_skb_size) > mss_now))
-		return;
 
	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
 
	tcp_highest_sack_combine(sk, next_skb, skb);
 
-	/* Ok. We will be able to collapse the packet. */
	tcp_unlink_write_queue(next_skb, sk);
 
	skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
@@ -1847,54 +1826,60 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
	sk_wmem_free_skb(sk, next_skb);
 }
 
+static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
+{
+	if (tcp_skb_pcount(skb) > 1)
+		return 0;
+	/* TODO: SACK collapsing could be used to remove this condition */
+	if (skb_shinfo(skb)->nr_frags != 0)
+		return 0;
+	if (skb_cloned(skb))
+		return 0;
+	if (skb == tcp_send_head(sk))
+		return 0;
+	/* Some heuristics for collapsing over SACK'd data could be invented */
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+		return 0;
+
+	return 1;
+}
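tcp_retrans_try_collapse(), defined next, repeatedly folds following segments into the first one while a byte budget ('space') and tailroom allow. A toy model with flat buffers; it is hypothetical and deliberately ignores unlinking of the merged segments, which the kernel code does via tcp_collapse_retrans():

#include <string.h>

struct buf { char data[2048]; unsigned int len; };

/* Fold seg[1..n) into seg[0] under a byte budget; returns the number
 * of segments absorbed.
 */
static int collapse_retrans(struct buf *seg, int n, int space)
{
	int i, merged = 0;

	space -= seg[0].len;		/* the first segment always counts */
	for (i = 1; i < n; i++) {
		space -= seg[i].len;
		if (space < 0)
			break;
		if (seg[0].len + seg[i].len > sizeof(seg[0].data))
			break;		/* not enough tailroom */
		memcpy(seg[0].data + seg[0].len, seg[i].data, seg[i].len);
		seg[0].len += seg[i].len;
		merged++;
	}
	return merged;
}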
+
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
+				     int space)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-	unsigned int mss = tcp_current_mss(sk, 0);
-	u32 prior_lost = tp->lost_out;
+	struct sk_buff *skb = to, *tmp;
+	int first = 1;
 
-	tcp_for_write_queue(skb, sk) {
-		if (skb == tcp_send_head(sk))
+	if (!sysctl_tcp_retrans_collapse)
+		return;
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
+		return;
+
+	tcp_for_write_queue_from_safe(skb, tmp, sk) {
+		if (!tcp_can_collapse(sk, skb))
			break;
-		if (skb->len > mss &&
-		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
-			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
-				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
-				tp->retrans_out -= tcp_skb_pcount(skb);
-			}
-			tcp_skb_mark_lost_uncond_verify(tp, skb);
-		}
-	}
 
-	tcp_clear_retrans_hints_partial(tp);
+		space -= skb->len;
 
-	if (prior_lost == tp->lost_out)
-		return;
+		if (first) {
+			first = 0;
+			continue;
+		}
 
-	if (tcp_is_reno(tp))
-		tcp_limit_reno_sacked(tp);
+		if (space < 0)
+			break;
+		/* Punt if not enough space exists in the first SKB for
+		 * the data in the second
+		 */
+		if (skb->len > skb_tailroom(to))
+			break;
 
-	tcp_verify_left_out(tp);
+		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
+			break;
 
-	/* Don't muck with the congestion window here.
-	 * Reason is that we do not increase amount of _data_
-	 * in network, but units changed and effective
-	 * cwnd/ssthresh really reduced now.
-	 */
-	if (icsk->icsk_ca_state != TCP_CA_Loss) {
-		tp->high_seq = tp->snd_nxt;
-		tp->snd_ssthresh = tcp_current_ssthresh(sk);
-		tp->prior_ssthresh = 0;
-		tp->undo_marker = 0;
-		tcp_set_ca_state(sk, TCP_CA_Loss);
+		tcp_collapse_retrans(sk, to);
	}
-	tcp_xmit_retransmit_queue(sk);
 }
 
 /* This retransmits one SKB. Policy decisions and retransmit queue
@@ -1946,17 +1931,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
			return -ENOMEM; /* We'll try again later. */
	}
 
-	/* Collapse two adjacent packets if worthwhile and we can. */
-	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
-	    (skb->len < (cur_mss >> 1)) &&
-	    (!tcp_skb_is_last(sk, skb)) &&
-	    (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
-	    (skb_shinfo(skb)->nr_frags == 0 &&
-	     skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
-	    (tcp_skb_pcount(skb) == 1 &&
-	     tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
-	    (sysctl_tcp_retrans_collapse != 0))
-		tcp_retrans_try_collapse(sk, skb, cur_mss);
+	tcp_retrans_try_collapse(sk, skb, cur_mss);
 
	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached. So strip it off
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3df339e3e36..cc4e6d27ded 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -65,7 +65,7 @@ static void tcp_write_err(struct sock *sk)
 static int tcp_out_of_resources(struct sock *sk, int do_reset)
 {
	struct tcp_sock *tp = tcp_sk(sk);
-	int orphans = atomic_read(&tcp_orphan_count);
+	int orphans = percpu_counter_read_positive(&tcp_orphan_count);
 
	/* If peer does not open window for long time, or did not transmit
	 * anything for long time, penalize it. */
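tcp_orphan_count and tcp_sockets_allocated become percpu_counters: writers bump a per-CPU slot, and readers sum all slots, which can transiently go negative, hence percpu_counter_read_positive() above. A rough userspace approximation of those semantics (NSLOTS stands in for NR_CPUS; not the kernel API):

#include <stdatomic.h>

#define NSLOTS 64

struct pcounter { atomic_long slot[NSLOTS]; };

static void pcounter_add(struct pcounter *c, unsigned int self, long d)
{
	atomic_fetch_add(&c->slot[self % NSLOTS], d);
}

static long pcounter_read_positive(struct pcounter *c)
{
	long sum = 0;

	for (int i = 0; i < NSLOTS; i++)
		sum += atomic_load(&c->slot[i]);
	return sum < 0 ? 0 : sum;	/* tolerate in-flight updates */
}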
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7e4d9c87115..cf5ab0581eb 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
			       const struct sock *sk2))
 {
	struct sock *sk2;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 
-	sk_for_each(sk2, node, &hslot->head)
+	sk_nulls_for_each(sk2, node, &hslot->head)
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    sk2->sk_hash == num &&
@@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
	inet_sk(sk)->num = snum;
	sk->sk_hash = snum;
	if (sk_unhashed(sk)) {
-		/*
-		 * We need that previous write to sk->sk_hash committed
-		 * before write to sk->next done in following add_node() variant
-		 */
-		smp_wmb();
-		sk_add_node_rcu(sk, &hslot->head);
+		sk_nulls_add_node_rcu(sk, &hslot->head);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	}
	error = 0;
@@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
		int dif, struct udp_table *udptable)
 {
	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_nulls_node *node;
	unsigned short hnum = ntohs(dport);
	unsigned int hash = udp_hashfn(net, hnum);
	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -271,13 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
begin:
	result = NULL;
	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
-		/*
-		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
-		 * We must check this item was not moved to another chain
-		 */
-		if (udp_hashfn(net, sk->sk_hash) != hash)
-			goto begin;
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		score = compute_score(sk, net, saddr, hnum, sport,
				      daddr, dport, dif);
		if (score > badness) {
@@ -285,6 +274,14 @@ begin:
			badness = score;
		}
	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash)
+		goto begin;
+
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
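The restart test works because every hlist_nulls chain ends in a distinct odd "nulls" marker instead of NULL, so a lockless reader that was dragged onto another chain notices it at the end and retries. A compact single-threaded model of the encoding and the check (the real code re-reads the bucket under RCU):

#include <stdint.h>
#include <stdbool.h>

static bool is_a_nulls_ptr(uintptr_t p)       { return p & 1; }
static unsigned long nulls_value(uintptr_t p) { return p >> 1; }

struct nnode { uintptr_t next; int key; };

static struct nnode *lookup(const uintptr_t *headp, unsigned long chain,
			    int key)
{
	uintptr_t p;
restart:
	for (p = *headp; !is_a_nulls_ptr(p); p = ((struct nnode *)p)->next)
		if (((struct nnode *)p)->key == key)
			return (struct nnode *)p;
	if (nulls_value(p) != chain)	/* ended up on the wrong chain */
		goto restart;
	return NULL;
}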
@@ -320,19 +317,20 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 }
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 
-static inline struct sock *udp_v4_mcast_next(struct sock *sk,
+static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
					     __be16 loc_port, __be32 loc_addr,
					     __be16 rmt_port, __be32 rmt_addr,
					     int dif)
 {
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
	struct sock *s = sk;
	unsigned short hnum = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_nulls_for_each_from(s, node) {
		struct inet_sock *inet = inet_sk(s);
 
-		if (s->sk_hash != hnum ||
+		if (!net_eq(sock_net(s), net) ||
+		    s->sk_hash != hnum ||
		    (inet->daddr && inet->daddr != rmt_addr) ||
		    (inet->dport != rmt_port && inet->dport) ||
		    (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
@@ -668,6 +666,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
				    .saddr = saddr,
				    .tos = tos } },
		    .proto = sk->sk_protocol,
+		    .flags = inet_sk_flowi_flags(sk),
		    .uli_u = { .ports =
			       { .sport = inet->sport,
				 .dport = dport } } };
@@ -720,7 +719,7 @@ do_append_data:
	up->len += ulen;
	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
	err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
-			sizeof(struct udphdr), &ipc, rt,
+			sizeof(struct udphdr), &ipc, &rt,
			corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
	if (err)
		udp_flush_pending_frames(sk);
@@ -971,16 +970,18 @@ int udp_disconnect(struct sock *sk, int flags)
 
 void udp_lib_unhash(struct sock *sk)
 {
-	struct udp_table *udptable = sk->sk_prot->h.udp_table;
-	unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
-	struct udp_hslot *hslot = &udptable->hash[hash];
+	if (sk_hashed(sk)) {
+		struct udp_table *udptable = sk->sk_prot->h.udp_table;
+		unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
+		struct udp_hslot *hslot = &udptable->hash[hash];
 
-	spin_lock_bh(&hslot->lock);
-	if (sk_del_node_init_rcu(sk)) {
-		inet_sk(sk)->num = 0;
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		spin_lock_bh(&hslot->lock);
+		if (sk_nulls_del_node_init_rcu(sk)) {
+			inet_sk(sk)->num = 0;
+			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		}
+		spin_unlock_bh(&hslot->lock);
	}
-	spin_unlock_bh(&hslot->lock);
 }
 EXPORT_SYMBOL(udp_lib_unhash);
 
@@ -1129,17 +1130,18 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
	int dif;
 
	spin_lock(&hslot->lock);
-	sk = sk_head(&hslot->head);
+	sk = sk_nulls_head(&hslot->head);
	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
	if (sk) {
		struct sock *sknext = NULL;
 
		do {
			struct sk_buff *skb1 = skb;
 
-			sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
-						   uh->source, saddr, dif);
+			sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
						   daddr, uh->source, saddr,
						   dif);
			if (sknext)
				skb1 = skb_clone(skb, GFP_ATOMIC);
 
@@ -1558,10 +1560,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
	struct net *net = seq_file_net(seq);
 
	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
		spin_lock_bh(&hslot->lock);
-		sk_for_each(sk, node, &hslot->head) {
+		sk_nulls_for_each(sk, node, &hslot->head) {
			if (!net_eq(sock_net(sk), net))
				continue;
			if (sk->sk_family == state->family)
@@ -1580,7 +1582,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
	struct net *net = seq_file_net(seq);
 
	do {
-		sk = sk_next(sk);
+		sk = sk_nulls_next(sk);
	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
	if (!sk) {
@@ -1751,7 +1753,7 @@ void __init udp_table_init(struct udp_table *table)
	int i;
 
	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
-		INIT_HLIST_HEAD(&table->hash[i].head);
+		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
		spin_lock_init(&table->hash[i].lock);
	}
 }
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index f9a775b7e79..2ad24ba31f9 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -18,7 +18,8 @@
 static struct dst_ops xfrm4_dst_ops;
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
 
-static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr,
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
+					  xfrm_address_t *saddr,
					  xfrm_address_t *daddr)
 {
	struct flowi fl = {
@@ -36,19 +37,20 @@ static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr,
	if (saddr)
		fl.fl4_src = saddr->a4;
 
-	err = __ip_route_output_key(&init_net, &rt, &fl);
+	err = __ip_route_output_key(net, &rt, &fl);
	dst = &rt->u.dst;
	if (err)
		dst = ERR_PTR(err);
	return dst;
 }
 
-static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr)
+static int xfrm4_get_saddr(struct net *net,
+			   xfrm_address_t *saddr, xfrm_address_t *daddr)
 {
	struct dst_entry *dst;
	struct rtable *rt;
 
-	dst = xfrm4_dst_lookup(0, NULL, daddr);
+	dst = xfrm4_dst_lookup(net, 0, NULL, daddr);
	if (IS_ERR(dst))
		return -EHOSTUNREACH;
 
@@ -187,7 +189,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 
 static inline int xfrm4_garbage_collect(struct dst_ops *ops)
 {
-	xfrm4_policy_afinfo.garbage_collect();
+	xfrm4_policy_afinfo.garbage_collect(&init_net);
	return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
 }
 
@@ -246,7 +248,6 @@ static struct dst_ops xfrm4_dst_ops = {
	.ifdown =		xfrm4_dst_ifdown,
	.local_out =		__ip_local_out,
	.gc_thresh =		1024,
-	.entry_size =		sizeof(struct xfrm_dst),
	.entries =		ATOMIC_INIT(0),
 };
 
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 07735ed280d..1ef1366a0a0 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -13,8 +13,6 @@
 #include <linux/ipsec.h>
 #include <linux/netfilter_ipv4.h>
 
-static struct xfrm_state_afinfo xfrm4_state_afinfo;
-
 static int xfrm4_init_flags(struct xfrm_state *x)
 {
	if (ipv4_config.no_pmtu_disc)
@@ -33,6 +31,7 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
	x->sel.dport_mask = htons(0xffff);
	x->sel.sport = xfrm_flowi_sport(fl);
	x->sel.sport_mask = htons(0xffff);
+	x->sel.family = AF_INET;
	x->sel.prefixlen_d = 32;
	x->sel.prefixlen_s = 32;
	x->sel.proto = fl->proto;
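For reference, the udp_table_init() change above is what plants the per-chain nulls markers that the lookup-restart check compares against: each empty bucket is terminated by a distinct odd value derived from its index. A sketch of that encoding (hypothetical userspace types, not the kernel macro):

#include <stdint.h>

#define UDP_HTABLE_SIZE 128

struct hslot { uintptr_t first; };

static void table_init(struct hslot *hash)
{
	for (unsigned int i = 0; i < UDP_HTABLE_SIZE; i++)
		hash[i].first = ((uintptr_t)i << 1) | 1; /* nulls marker */
}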