From 19baf839ff4a8daa1f2a7400897094fc18e4f5e9 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Tue, 21 Jun 2005 12:43:18 -0700 Subject: [IPV4]: Add LC-Trie FIB lookup algorithm. Signed-off-by: Robert Olsson Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 26 + net/ipv4/Makefile | 4 +- net/ipv4/af_inet.c | 12 + net/ipv4/fib_trie.c | 2454 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 2495 insertions(+), 1 deletion(-) create mode 100644 net/ipv4/fib_trie.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6d3e8b1bd1f..05107e0dc14 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -1,6 +1,32 @@ # # IP configuration # +choice + prompt "Choose IP: FIB lookup"" + depends on INET + default IP_FIB_HASH + +config IP_FIB_HASH + bool "FIB_HASH" + ---help--- + Current FIB is very proven and good enough for most users. + +config IP_FIB_TRIE + bool "FIB_TRIE" + ---help--- + Use new experimental LC-trie as FIB lookup algoritm. + This improves lookup performance + + LC-trie is described in: + + IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson + IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 + An experimental study of compression methods for dynamic tries + Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. + http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + +endchoice + config IP_MULTICAST bool "IP: multicasting" depends on INET diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 8b379627ebb..65d57d8e1ad 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,8 +7,10 @@ obj-y := utils.o route.o inetpeer.o protocol.o \ ip_output.o ip_sockglue.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o +obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o +obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 03942f13394..658e7977924 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1119,6 +1119,10 @@ module_init(inet_init); #ifdef CONFIG_PROC_FS extern int fib_proc_init(void); extern void fib_proc_exit(void); +#ifdef CONFIG_IP_FIB_TRIE +extern int fib_stat_proc_init(void); +extern void fib_stat_proc_exit(void); +#endif extern int ip_misc_proc_init(void); extern int raw_proc_init(void); extern void raw_proc_exit(void); @@ -1139,11 +1143,19 @@ static int __init ipv4_proc_init(void) goto out_udp; if (fib_proc_init()) goto out_fib; +#ifdef CONFIG_IP_FIB_TRIE + if (fib_stat_proc_init()) + goto out_fib_stat; + #endif if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: +#ifdef CONFIG_IP_FIB_TRIE + fib_stat_proc_exit(); +out_fib_stat: +#endif fib_proc_exit(); out_fib: udp4_proc_exit(); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c new file mode 100644 index 00000000000..c0ece94fc63 --- /dev/null +++ b/net/ipv4/fib_trie.c @@ -0,0 +1,2454 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Robert Olsson Uppsala Universitet + * & Swedish University of Agricultural Sciences. + * + * Jens Laas Swedish University of + * Agricultural Sciences. + * + * Hans Liss Uppsala Universitet + * + * This work is based on the LPC-trie which is originally descibed in: + * + * An experimental study of compression methods for dynamic tries + * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. + * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + * + * + * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson + * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 + * + * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $ + * + * + * Code from fib_hash has been reused which includes the following header: + * + * + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 FIB: lookup engine and maintenance routines. + * + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define VERSION "0.323" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fib_lookup.h" + +#undef CONFIG_IP_FIB_TRIE_STATS +#define MAX_CHILDS 16384 + +#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n))) +#define KEYLENGTH (8*sizeof(t_key)) +#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) +#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) + +static DEFINE_RWLOCK(fib_lock); + +typedef unsigned int t_key; + +#define T_TNODE 0 +#define T_LEAF 1 +#define NODE_TYPE_MASK 0x1UL +#define NODE_PARENT(_node) \ +((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) +#define NODE_SET_PARENT(_node, _ptr) \ +((_node)->_parent = (((unsigned long)(_ptr)) | \ + ((_node)->_parent & NODE_TYPE_MASK))) +#define NODE_INIT_PARENT(_node, _type) \ +((_node)->_parent = (_type)) +#define NODE_TYPE(_node) \ +((_node)->_parent & NODE_TYPE_MASK) + +#define IS_TNODE(n) (!(n->_parent & T_LEAF)) +#define IS_LEAF(n) (n->_parent & T_LEAF) + +struct node { + t_key key; + unsigned long _parent; +}; + +struct leaf { + t_key key; + unsigned long _parent; + struct hlist_head list; +}; + +struct leaf_info { + struct hlist_node hlist; + int plen; + struct list_head falh; +}; + +struct tnode { + t_key key; + unsigned long _parent; + unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ + unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ + unsigned short full_children; /* KEYLENGTH bits needed */ + unsigned short empty_children; /* KEYLENGTH bits needed */ + struct node *child[0]; +}; + +#ifdef CONFIG_IP_FIB_TRIE_STATS +struct trie_use_stats { + unsigned int gets; + unsigned int backtrack; + unsigned int semantic_match_passed; + unsigned int semantic_match_miss; + unsigned int null_node_hit; +}; +#endif + +struct trie_stat { + unsigned int totdepth; + unsigned int maxdepth; + unsigned int tnodes; + unsigned int leaves; + unsigned int nullpointers; + unsigned int nodesizes[MAX_CHILDS]; +}; + +struct trie { + struct node *trie; +#ifdef CONFIG_IP_FIB_TRIE_STATS + struct trie_use_stats stats; +#endif + int size; + unsigned int revision; +}; + +static int trie_debug = 0; + +static int tnode_full(struct tnode *tn, struct node *n); +static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); +static int tnode_child_length(struct tnode *tn); +static struct node *resize(struct trie *t, struct tnode *tn); +static struct tnode *inflate(struct trie *t, struct tnode *tn); +static struct tnode *halve(struct trie *t, struct tnode *tn); +static void tnode_free(struct tnode *tn); +static void trie_dump_seq(struct seq_file *seq, struct trie *t); +extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); +extern int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, int *dflt); + +extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, + struct nlmsghdr *n, struct netlink_skb_parms *req); + +static kmem_cache_t *fn_alias_kmem; +static struct trie *trie_local = NULL, *trie_main = NULL; + +static void trie_bug(char *err) +{ + printk("Trie Bug: %s\n", err); + BUG(); +} + +static inline struct node *tnode_get_child(struct tnode *tn, int i) +{ + if (i >= 1<bits) + trie_bug("tnode_get_child"); + + return tn->child[i]; +} + +static inline int tnode_child_length(struct tnode *tn) +{ + return 1<bits; +} + +/* + _________________________________________________________________ + | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | + ---------------------------------------------------------------- + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + + _________________________________________________________________ + | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | + ----------------------------------------------------------------- + 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + + tp->pos = 7 + tp->bits = 3 + n->pos = 15 + n->bits=4 + KEYLENGTH=32 +*/ + +static inline t_key tkey_extract_bits(t_key a, int offset, int bits) +{ + if (offset < KEYLENGTH) + return ((t_key)(a << offset)) >> (KEYLENGTH - bits); + else + return 0; +} + +static inline int tkey_equals(t_key a, t_key b) +{ + return a == b; +} + +static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) +{ + if (bits == 0 || offset >= KEYLENGTH) + return 1; + bits = bits > KEYLENGTH ? KEYLENGTH : bits; + return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; +} + +static inline int tkey_mismatch(t_key a, int offset, t_key b) +{ + t_key diff = a ^ b; + int i = offset; + + if(!diff) + return 0; + while((diff << i) >> (KEYLENGTH-1) == 0) + i++; + return i; +} + +/* Candiate for fib_semantics */ + +static void fn_free_alias(struct fib_alias *fa) +{ + fib_release_info(fa->fa_info); + kmem_cache_free(fn_alias_kmem, fa); +} + +/* + To understand this stuff, an understanding of keys and all their bits is + necessary. Every node in the trie has a key associated with it, but not + all of the bits in that key are significant. + + Consider a node 'n' and its parent 'tp'. + + If n is a leaf, every bit in its key is significant. Its presence is + necessitaded by path compression, since during a tree traversal (when + searching for a leaf - unless we are doing an insertion) we will completely + ignore all skipped bits we encounter. Thus we need to verify, at the end of + a potentially successful search, that we have indeed been walking the + correct key path. + + Note that we can never "miss" the correct key in the tree if present by + following the wrong path. Path compression ensures that segments of the key + that are the same for all keys with a given prefix are skipped, but the + skipped part *is* identical for each node in the subtrie below the skipped + bit! trie_insert() in this implementation takes care of that - note the + call to tkey_sub_equals() in trie_insert(). + + if n is an internal node - a 'tnode' here, the various parts of its key + have many different meanings. + + Example: + _________________________________________________________________ + | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | + ----------------------------------------------------------------- + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + + _________________________________________________________________ + | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | + ----------------------------------------------------------------- + 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + + tp->pos = 7 + tp->bits = 3 + n->pos = 15 + n->bits=4 + + First, let's just ignore the bits that come before the parent tp, that is + the bits from 0 to (tp->pos-1). They are *known* but at this point we do + not use them for anything. + + The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the + index into the parent's child array. That is, they will be used to find + 'n' among tp's children. + + The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits + for the node n. + + All the bits we have seen so far are significant to the node n. The rest + of the bits are really not needed or indeed known in n->key. + + The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into + n's child array, and will of course be different for each child. + + The rest of the bits, from (n->pos + n->bits) onward, are completely unknown + at this point. + +*/ + +static void check_tnode(struct tnode *tn) +{ + if(tn && tn->pos+tn->bits > 32) { + printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); + } +} + +static int halve_threshold = 25; +static int inflate_threshold = 50; + +static struct leaf *leaf_new(void) +{ + struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); + if(l) { + NODE_INIT_PARENT(l, T_LEAF); + INIT_HLIST_HEAD(&l->list); + } + return l; +} + +static struct leaf_info *leaf_info_new(int plen) +{ + struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); + li->plen = plen; + INIT_LIST_HEAD(&li->falh); + return li; +} + +static inline void free_leaf(struct leaf *l) +{ + kfree(l); +} + +static inline void free_leaf_info(struct leaf_info *li) +{ + kfree(li); +} + +static struct tnode* tnode_new(t_key key, int pos, int bits) +{ + int nchildren = 1<pos = pos; + tn->bits = bits; + tn->key = key; + tn->full_children = 0; + tn->empty_children = 1< 0) + printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), + (unsigned int) (sizeof(struct node) * 1< 0 ) + printk("FL %p \n", tn); + } + else if(IS_TNODE(tn)) { + kfree(tn); + if(trie_debug > 0 ) + printk("FT %p \n", tn); + } + else { + trie_bug("tnode_free\n"); + } +} + +/* + * Check whether a tnode 'n' is "full", i.e. it is an internal node + * and no bits are skipped. See discussion in dyntree paper p. 6 + */ + +static inline int tnode_full(struct tnode *tn, struct node *n) +{ + if(n == NULL || IS_LEAF(n)) + return 0; + + return ((struct tnode *) n)->pos == tn->pos + tn->bits; +} + +static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) +{ + tnode_put_child_reorg(tn, i, n, -1); +} + + /* + * Add a child at position i overwriting the old value. + * Update the value of full_children and empty_children. + */ + +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) +{ + struct node *chi; + int isfull; + + if(i >= 1<bits) { + printk("bits=%d, i=%d\n", tn->bits, i); + trie_bug("tnode_put_child_reorg bits"); + } + write_lock_bh(&fib_lock); + chi = tn->child[i]; + + /* update emptyChildren */ + if (n == NULL && chi != NULL) + tn->empty_children++; + else if (n != NULL && chi == NULL) + tn->empty_children--; + + /* update fullChildren */ + if (wasfull == -1) + wasfull = tnode_full(tn, chi); + + isfull = tnode_full(tn, n); + if (wasfull && !isfull) + tn->full_children--; + + else if (!wasfull && isfull) + tn->full_children++; + if(n) + NODE_SET_PARENT(n, tn); + + tn->child[i] = n; + write_unlock_bh(&fib_lock); +} + +static struct node *resize(struct trie *t, struct tnode *tn) +{ + int i; + + if (!tn) + return NULL; + + if(trie_debug) + printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); + + /* No children */ + if (tn->empty_children == tnode_child_length(tn)) { + tnode_free(tn); + return NULL; + } + /* One child */ + if (tn->empty_children == tnode_child_length(tn) - 1) + for (i = 0; i < tnode_child_length(tn); i++) { + + write_lock_bh(&fib_lock); + if (tn->child[i] != NULL) { + + /* compress one level */ + struct node *n = tn->child[i]; + if(n) + NODE_INIT_PARENT(n, NODE_TYPE(n)); + + write_unlock_bh(&fib_lock); + tnode_free(tn); + return n; + } + write_unlock_bh(&fib_lock); + } + /* + * Double as long as the resulting node has a number of + * nonempty nodes that are above the threshold. + */ + + /* + * From "Implementing a dynamic compressed trie" by Stefan Nilsson of + * the Helsinki University of Technology and Matti Tikkanen of Nokia + * Telecommunications, page 6: + * "A node is doubled if the ratio of non-empty children to all + * children in the *doubled* node is at least 'high'." + * + * 'high' in this instance is the variable 'inflate_threshold'. It + * is expressed as a percentage, so we multiply it with + * tnode_child_length() and instead of multiplying by 2 (since the + * child array will be doubled by inflate()) and multiplying + * the left-hand side by 100 (to handle the percentage thing) we + * multiply the left-hand side by 50. + * + * The left-hand side may look a bit weird: tnode_child_length(tn) + * - tn->empty_children is of course the number of non-null children + * in the current node. tn->full_children is the number of "full" + * children, that is non-null tnodes with a skip value of 0. + * All of those will be doubled in the resulting inflated tnode, so + * we just count them one extra time here. + * + * A clearer way to write this would be: + * + * to_be_doubled = tn->full_children; + * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * tn->full_children; + * + * new_child_length = tnode_child_length(tn) * 2; + * + * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / + * new_child_length; + * if (new_fill_factor >= inflate_threshold) + * + * ...and so on, tho it would mess up the while() loop. + * + * anyway, + * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= + * inflate_threshold + * + * avoid a division: + * 100 * (not_to_be_doubled + 2*to_be_doubled) >= + * inflate_threshold * new_child_length + * + * expand not_to_be_doubled and to_be_doubled, and shorten: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children ) >= inflate_threshold * new_child_length + * + * expand new_child_length: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children ) >= + * inflate_threshold * tnode_child_length(tn) * 2 + * + * shorten again: + * 50 * (tn->full_children + tnode_child_length(tn) - + * tn->empty_children ) >= inflate_threshold * + * tnode_child_length(tn) + * + */ + + check_tnode(tn); + + while ((tn->full_children > 0 && + 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= + inflate_threshold * tnode_child_length(tn))) { + + tn = inflate(t, tn); + } + + check_tnode(tn); + + /* + * Halve as long as the number of empty children in this + * node is above threshold. + */ + while (tn->bits > 1 && + 100 * (tnode_child_length(tn) - tn->empty_children) < + halve_threshold * tnode_child_length(tn)) + + tn = halve(t, tn); + + /* Only one child remains */ + + if (tn->empty_children == tnode_child_length(tn) - 1) + for (i = 0; i < tnode_child_length(tn); i++) { + + write_lock_bh(&fib_lock); + if (tn->child[i] != NULL) { + /* compress one level */ + struct node *n = tn->child[i]; + + if(n) + NODE_INIT_PARENT(n, NODE_TYPE(n)); + + write_unlock_bh(&fib_lock); + tnode_free(tn); + return n; + } + write_unlock_bh(&fib_lock); + } + + return (struct node *) tn; +} + +static struct tnode *inflate(struct trie *t, struct tnode *tn) +{ + struct tnode *inode; + struct tnode *oldtnode = tn; + int olen = tnode_child_length(tn); + int i; + + if(trie_debug) + printk("In inflate\n"); + + tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); + + if (!tn) + trie_bug("tnode_new failed"); + + for(i = 0; i < olen; i++) { + struct node *node = tnode_get_child(oldtnode, i); + + /* An empty child */ + if (node == NULL) + continue; + + /* A leaf or an internal node with skipped bits */ + + if(IS_LEAF(node) || ((struct tnode *) node)->pos > + tn->pos + tn->bits - 1) { + if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, + 1) == 0) + put_child(t, tn, 2*i, node); + else + put_child(t, tn, 2*i+1, node); + continue; + } + + /* An internal node with two children */ + inode = (struct tnode *) node; + + if (inode->bits == 1) { + put_child(t, tn, 2*i, inode->child[0]); + put_child(t, tn, 2*i+1, inode->child[1]); + + tnode_free(inode); + } + + /* An internal node with more than two children */ + else { + struct tnode *left, *right; + int size, j; + + /* We will replace this node 'inode' with two new + * ones, 'left' and 'right', each with half of the + * original children. The two new nodes will have + * a position one bit further down the key and this + * means that the "significant" part of their keys + * (see the discussion near the top of this file) + * will differ by one bit, which will be "0" in + * left's key and "1" in right's key. Since we are + * moving the key position by one step, the bit that + * we are moving away from - the bit at position + * (inode->pos) - is the one that will differ between + * left and right. So... we synthesize that bit in the + * two new keys. + * The mask 'm' below will be a single "one" bit at + * the position (inode->pos) + */ + + t_key m = TKEY_GET_MASK(inode->pos, 1); + + /* Use the old key, but set the new significant + * bit to zero. + */ + left = tnode_new(inode->key&(~m), inode->pos + 1, + inode->bits - 1); + + if(!left) + trie_bug("tnode_new failed"); + + + /* Use the old key, but set the new significant + * bit to one. + */ + right = tnode_new(inode->key|m, inode->pos + 1, + inode->bits - 1); + + if(!right) + trie_bug("tnode_new failed"); + + size = tnode_child_length(left); + for(j = 0; j < size; j++) { + put_child(t, left, j, inode->child[j]); + put_child(t, right, j, inode->child[j + size]); + } + put_child(t, tn, 2*i, resize(t, left)); + put_child(t, tn, 2*i+1, resize(t, right)); + + tnode_free(inode); + } + } + tnode_free(oldtnode); + return tn; +} + +static struct tnode *halve(struct trie *t, struct tnode *tn) +{ + struct tnode *oldtnode = tn; + struct node *left, *right; + int i; + int olen = tnode_child_length(tn); + + if(trie_debug) printk("In halve\n"); + + tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); + + if(!tn) + trie_bug("tnode_new failed"); + + for(i = 0; i < olen; i += 2) { + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* At least one of the children is empty */ + if (left == NULL) { + if (right == NULL) /* Both are empty */ + continue; + put_child(t, tn, i/2, right); + } else if (right == NULL) + put_child(t, tn, i/2, left); + + /* Two nonempty children */ + else { + struct tnode *newBinNode = + tnode_new(left->key, tn->pos + tn->bits, 1); + + if(!newBinNode) + trie_bug("tnode_new failed"); + + put_child(t, newBinNode, 0, left); + put_child(t, newBinNode, 1, right); + put_child(t, tn, i/2, resize(t, newBinNode)); + } + } + tnode_free(oldtnode); + return tn; +} + +static void *trie_init(struct trie *t) +{ + if(t) { + t->size = 0; + t->trie = NULL; + t->revision = 0; +#ifdef CONFIG_IP_FIB_TRIE_STATS + memset(&t->stats, 0, sizeof(struct trie_use_stats)); +#endif + } + return t; +} + +static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) +{ + struct hlist_node *node; + struct leaf_info *li; + + hlist_for_each_entry(li, node, head, hlist) { + + if ( li->plen == plen ) + return li; + } + return NULL; +} + +static inline struct list_head * get_fa_head(struct leaf *l, int plen) +{ + struct list_head *fa_head=NULL; + struct leaf_info *li = find_leaf_info(&l->list, plen); + + if(li) + fa_head = &li->falh; + + return fa_head; +} + +static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) +{ + struct leaf_info *li=NULL, *last=NULL; + struct hlist_node *node, *tmp; + + write_lock_bh(&fib_lock); + + if(hlist_empty(head)) + hlist_add_head(&new->hlist, head); + else { + hlist_for_each_entry_safe(li, node, tmp, head, hlist) { + + if (new->plen > li->plen) + break; + + last = li; + } + if(last) + hlist_add_after(&last->hlist, &new->hlist); + else + hlist_add_before(&new->hlist, &li->hlist); + } + write_unlock_bh(&fib_lock); +} + +static struct leaf * +fib_find_node(struct trie *t, u32 key) +{ + int pos; + struct tnode *tn; + struct node *n; + + pos = 0; + n=t->trie; + + while (n != NULL && NODE_TYPE(n) == T_TNODE) { + tn = (struct tnode *) n; + + check_tnode(tn); + + if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { + pos=tn->pos + tn->bits; + n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + } + else + break; + } + /* Case we have found a leaf. Compare prefixes */ + + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { + struct leaf *l = (struct leaf *) n; + return l; + } + return NULL; +} + +static struct node *trie_rebalance(struct trie *t, struct tnode *tn) +{ + int i = 0; + int wasfull; + t_key cindex, key; + struct tnode *tp = NULL; + + if(!tn) + BUG(); + + key = tn->key; + i = 0; + + while (tn != NULL && NODE_PARENT(tn) != NULL) { + + if( i > 10 ) { + printk("Rebalance tn=%p \n", tn); + if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); + + printk("Rebalance tp=%p \n", tp); + if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); + } + + if( i > 12 ) BUG(); + i++; + + tp = NODE_PARENT(tn); + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); + tn = (struct tnode *) resize (t, (struct tnode *)tn); + tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); + + if(!NODE_PARENT(tn)) + break; + + tn = NODE_PARENT(tn); + } + /* Handle last (top) tnode */ + if (IS_TNODE(tn)) + tn = (struct tnode*) resize(t, (struct tnode *)tn); + + return (struct node*) tn; +} + +static struct list_head * +fib_insert_node(struct trie *t, u32 key, int plen) +{ + int pos, newpos; + struct tnode *tp = NULL, *tn = NULL; + struct node *n; + struct leaf *l; + int missbit; + struct list_head *fa_head=NULL; + struct leaf_info *li; + t_key cindex; + + pos = 0; + n=t->trie; + + /* If we point to NULL, stop. Either the tree is empty and we should + * just put a new leaf in if, or we have reached an empty child slot, + * and we should just put our new leaf in that. + * If we point to a T_TNODE, check if it matches our key. Note that + * a T_TNODE might be skipping any number of bits - its 'pos' need + * not be the parent's 'pos'+'bits'! + * + * If it does match the current key, get pos/bits from it, extract + * the index from our key, push the T_TNODE and walk the tree. + * + * If it doesn't, we have to replace it with a new T_TNODE. + * + * If we point to a T_LEAF, it might or might not have the same key + * as we do. If it does, just change the value, update the T_LEAF's + * value, and return it. + * If it doesn't, we need to replace it with a T_TNODE. + */ + + while (n != NULL && NODE_TYPE(n) == T_TNODE) { + tn = (struct tnode *) n; + + check_tnode(tn); + + if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { + tp = tn; + pos=tn->pos + tn->bits; + n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + + if(n && NODE_PARENT(n) != tn) { + printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); + BUG(); + } + } + else + break; + } + + /* + * n ----> NULL, LEAF or TNODE + * + * tp is n's (parent) ----> NULL or TNODE + */ + + if(tp && IS_LEAF(tp)) + BUG(); + + t->revision++; + + /* Case 1: n is a leaf. Compare prefixes */ + + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { + struct leaf *l = ( struct leaf *) n; + + li = leaf_info_new(plen); + + if(! li) + BUG(); + + fa_head = &li->falh; + insert_leaf_info(&l->list, li); + goto done; + } + t->size++; + l = leaf_new(); + + if(! l) + BUG(); + + l->key = key; + li = leaf_info_new(plen); + + if(! li) + BUG(); + + fa_head = &li->falh; + insert_leaf_info(&l->list, li); + + /* Case 2: n is NULL, and will just insert a new leaf */ + if (t->trie && n == NULL) { + + NODE_SET_PARENT(l, tp); + + if (!tp) + BUG(); + + else { + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + } + } + /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ + else { + /* + * Add a new tnode here + * first tnode need some special handling + */ + + if (tp) + pos=tp->pos+tp->bits; + else + pos=0; + if(n) { + newpos = tkey_mismatch(key, pos, n->key); + tn = tnode_new(n->key, newpos, 1); + } + else { + newpos = 0; + tn = tnode_new(key, newpos, 1); /* First tnode */ + } + if(!tn) + trie_bug("tnode_pfx_new failed"); + + NODE_SET_PARENT(tn, tp); + + missbit=tkey_extract_bits(key, newpos, 1); + put_child(t, tn, missbit, (struct node *)l); + put_child(t, tn, 1-missbit, n); + + if(tp) { + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); + } + else { + t->trie = (struct node*) tn; /* First tnode */ + tp = tn; + } + } + if(tp && tp->pos+tp->bits > 32) { + printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", + tp, tp->pos, tp->bits, key, plen); + } + /* Rebalance the trie */ + t->trie = trie_rebalance(t, tp); +done:; + return fa_head; +} + +static int +fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) +{ + struct trie *t = (struct trie *) tb->tb_data; + struct fib_alias *fa, *new_fa; + struct list_head *fa_head=NULL; + struct fib_info *fi; + int plen = r->rtm_dst_len; + int type = r->rtm_type; + u8 tos = r->rtm_tos; + u32 key, mask; + int err; + struct leaf *l; + + if (plen > 32) + return -EINVAL; + + key = 0; + if (rta->rta_dst) + memcpy(&key, rta->rta_dst, 4); + + key = ntohl(key); + + if(trie_debug) + printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); + + mask = ntohl( inet_make_mask(plen) ); + + if(key & ~mask) + return -EINVAL; + + key = key & mask; + + if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) + goto err; + + l = fib_find_node(t, key); + fa = NULL; + + if(l) { + fa_head = get_fa_head(l, plen); + fa = fib_find_alias(fa_head, tos, fi->fib_priority); + } + + /* Now fa, if non-NULL, points to the first fib alias + * with the same keys [prefix,tos,priority], if such key already + * exists or to the node before which we will insert new one. + * + * If fa is NULL, we will need to allocate a new one and + * insert to the head of f. + * + * If f is NULL, no fib node matched the destination key + * and we need to allocate a new one of those as well. + */ + + if (fa && + fa->fa_info->fib_priority == fi->fib_priority) { + struct fib_alias *fa_orig; + + err = -EEXIST; + if (nlhdr->nlmsg_flags & NLM_F_EXCL) + goto out; + + if (nlhdr->nlmsg_flags & NLM_F_REPLACE) { + struct fib_info *fi_drop; + u8 state; + + write_lock_bh(&fib_lock); + + fi_drop = fa->fa_info; + fa->fa_info = fi; + fa->fa_type = type; + fa->fa_scope = r->rtm_scope; + state = fa->fa_state; + fa->fa_state &= ~FA_S_ACCESSED; + + write_unlock_bh(&fib_lock); + + fib_release_info(fi_drop); + if (state & FA_S_ACCESSED) + rt_cache_flush(-1); + + goto succeeded; + } + /* Error if we find a perfect match which + * uses the same scope, type, and nexthop + * information. + */ + fa_orig = fa; + list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) { + if (fa->fa_tos != tos) + break; + if (fa->fa_info->fib_priority != fi->fib_priority) + break; + if (fa->fa_type == type && + fa->fa_scope == r->rtm_scope && + fa->fa_info == fi) { + goto out; + } + } + if (!(nlhdr->nlmsg_flags & NLM_F_APPEND)) + fa = fa_orig; + } + err = -ENOENT; + if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) + goto out; + + err = -ENOBUFS; + new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + if (new_fa == NULL) + goto out; + + new_fa->fa_info = fi; + new_fa->fa_tos = tos; + new_fa->fa_type = type; + new_fa->fa_scope = r->rtm_scope; + new_fa->fa_state = 0; +#if 0 + new_fa->dst = NULL; +#endif + /* + * Insert new entry to the list. + */ + + if(!fa_head) + fa_head = fib_insert_node(t, key, plen); + + write_lock_bh(&fib_lock); + + list_add_tail(&new_fa->fa_list, + (fa ? &fa->fa_list : fa_head)); + + write_unlock_bh(&fib_lock); + + rt_cache_flush(-1); + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); +succeeded: + return 0; +out: + fib_release_info(fi); +err:; + return err; +} + +static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, + struct fib_result *res, int *err) +{ + int i; + t_key mask; + struct leaf_info *li; + struct hlist_head *hhead = &l->list; + struct hlist_node *node; + + hlist_for_each_entry(li, node, hhead, hlist) { + + i = li->plen; + mask = ntohl(inet_make_mask(i)); + if (l->key != (key & mask)) + continue; + + if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { + *plen = i; +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_passed++; +#endif + return 1; + } +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_miss++; +#endif + } + return 0; +} + +static int +fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +{ + struct trie *t = (struct trie *) tb->tb_data; + int plen, ret = 0; + struct node *n; + struct tnode *pn; + int pos, bits; + t_key key=ntohl(flp->fl4_dst); + int chopped_off; + t_key cindex = 0; + int current_prefix_length = KEYLENGTH; + n = t->trie; + + read_lock(&fib_lock); + if(!n) + goto failed; + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.gets++; +#endif + + /* Just a leaf? */ + if (IS_LEAF(n)) { + if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) ) + goto found; + goto failed; + } + pn = (struct tnode *) n; + chopped_off = 0; + + while (pn) { + + pos = pn->pos; + bits = pn->bits; + + if(!chopped_off) + cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); + + n = tnode_get_child(pn, cindex); + + if (n == NULL) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.null_node_hit++; +#endif + goto backtrace; + } + + if (IS_TNODE(n)) { +#define HL_OPTIMIZE +#ifdef HL_OPTIMIZE + struct tnode *cn = (struct tnode *)n; + t_key node_prefix, key_prefix, pref_mismatch; + int mp; + + /* + * It's a tnode, and we can do some extra checks here if we + * like, to avoid descending into a dead-end branch. + * This tnode is in the parent's child array at index + * key[p_pos..p_pos+p_bits] but potentially with some bits + * chopped off, so in reality the index may be just a + * subprefix, padded with zero at the end. + * We can also take a look at any skipped bits in this + * tnode - everything up to p_pos is supposed to be ok, + * and the non-chopped bits of the index (se previous + * paragraph) are also guaranteed ok, but the rest is + * considered unknown. + * + * The skipped bits are key[pos+bits..cn->pos]. + */ + + /* If current_prefix_length < pos+bits, we are already doing + * actual prefix matching, which means everything from + * pos+(bits-chopped_off) onward must be zero along some + * branch of this subtree - otherwise there is *no* valid + * prefix present. Here we can only check the skipped + * bits. Remember, since we have already indexed into the + * parent's child array, we know that the bits we chopped of + * *are* zero. + */ + + /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ + + if (current_prefix_length < pos+bits) { + if (tkey_extract_bits(cn->key, current_prefix_length, + cn->pos - current_prefix_length) != 0 || + !(cn->child[0])) + goto backtrace; + } + + /* + * If chopped_off=0, the index is fully validated and we + * only need to look at the skipped bits for this, the new, + * tnode. What we actually want to do is to find out if + * these skipped bits match our key perfectly, or if we will + * have to count on finding a matching prefix further down, + * because if we do, we would like to have some way of + * verifying the existence of such a prefix at this point. + */ + + /* The only thing we can do at this point is to verify that + * any such matching prefix can indeed be a prefix to our + * key, and if the bits in the node we are inspecting that + * do not match our key are not ZERO, this cannot be true. + * Thus, find out where there is a mismatch (before cn->pos) + * and verify that all the mismatching bits are zero in the + * new tnode's key. + */ + + /* Note: We aren't very concerned about the piece of the key + * that precede pn->pos+pn->bits, since these have already been + * checked. The bits after cn->pos aren't checked since these are + * by definition "unknown" at this point. Thus, what we want to + * see is if we are about to enter the "prefix matching" state, + * and in that case verify that the skipped bits that will prevail + * throughout this subtree are zero, as they have to be if we are + * to find a matching prefix. + */ + + node_prefix = MASK_PFX(cn->key, cn->pos); + key_prefix = MASK_PFX(key, cn->pos); + pref_mismatch = key_prefix^node_prefix; + mp = 0; + + /* In short: If skipped bits in this node do not match the search + * key, enter the "prefix matching" state.directly. + */ + if (pref_mismatch) { + while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { + mp++; + pref_mismatch = pref_mismatch <<1; + } + key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); + + if (key_prefix != 0) + goto backtrace; + + if (current_prefix_length >= cn->pos) + current_prefix_length=mp; + } +#endif + pn = (struct tnode *)n; /* Descend */ + chopped_off = 0; + continue; + } + if (IS_LEAF(n)) { + if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) + goto found; + } +backtrace: + chopped_off++; + + /* As zero don't change the child key (cindex) */ + while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { + chopped_off++; + } + + /* Decrease current_... with bits chopped off */ + if (current_prefix_length > pn->pos + pn->bits - chopped_off) + current_prefix_length = pn->pos + pn->bits - chopped_off; + + /* + * Either we do the actual chop off according or if we have + * chopped off all bits in this tnode walk up to our parent. + */ + + if(chopped_off <= pn->bits) + cindex &= ~(1 << (chopped_off-1)); + else { + if( NODE_PARENT(pn) == NULL) + goto failed; + + /* Get Child's index */ + cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); + pn = NODE_PARENT(pn); + chopped_off = 0; + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.backtrack++; +#endif + goto backtrace; + } + } +failed: + ret = 1; +found: + read_unlock(&fib_lock); + return ret; +} + +static int trie_leaf_remove(struct trie *t, t_key key) +{ + t_key cindex; + struct tnode *tp = NULL; + struct node *n = t->trie; + struct leaf *l; + + if(trie_debug) + printk("entering trie_leaf_remove(%p)\n", n); + + /* Note that in the case skipped bits, those bits are *not* checked! + * When we finish this, we will have NULL or a T_LEAF, and the + * T_LEAF may or may not match our key. + */ + + while (n != NULL && IS_TNODE(n)) { + struct tnode *tn = (struct tnode *) n; + check_tnode(tn); + n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); + + if(n && NODE_PARENT(n) != tn) { + printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); + BUG(); + } + } + l = (struct leaf *) n; + + if(!n || !tkey_equals(l->key, key)) + return 0; + + /* + * Key found. + * Remove the leaf and rebalance the tree + */ + + t->revision++; + t->size--; + + tp = NODE_PARENT(n); + tnode_free((struct tnode *) n); + + if(tp) { + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, NULL); + t->trie = trie_rebalance(t, tp); + } + else + t->trie = NULL; + + return 1; +} + +static int +fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) +{ + struct trie *t = (struct trie *) tb->tb_data; + u32 key, mask; + int plen = r->rtm_dst_len; + u8 tos = r->rtm_tos; + struct fib_alias *fa, *fa_to_delete; + struct list_head *fa_head; + struct leaf *l; + + if (plen > 32) + return -EINVAL; + + key = 0; + if (rta->rta_dst) + memcpy(&key, rta->rta_dst, 4); + + key = ntohl(key); + mask = ntohl( inet_make_mask(plen) ); + + if(key & ~mask) + return -EINVAL; + + key = key & mask; + l = fib_find_node(t, key); + + if(!l) + return -ESRCH; + + fa_head = get_fa_head(l, plen); + fa = fib_find_alias(fa_head, tos, 0); + + if (!fa) + return -ESRCH; + + if (trie_debug) + printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); + + fa_to_delete = NULL; + fa_head = fa->fa_list.prev; + list_for_each_entry(fa, fa_head, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fa->fa_tos != tos) + break; + + if ((!r->rtm_type || + fa->fa_type == r->rtm_type) && + (r->rtm_scope == RT_SCOPE_NOWHERE || + fa->fa_scope == r->rtm_scope) && + (!r->rtm_protocol || + fi->fib_protocol == r->rtm_protocol) && + fib_nh_match(r, nlhdr, rta, fi) == 0) { + fa_to_delete = fa; + break; + } + } + + if (fa_to_delete) { + int kill_li = 0; + struct leaf_info *li; + + fa = fa_to_delete; + rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); + + l = fib_find_node(t, key); + li = find_leaf_info(&l->list, plen); + + write_lock_bh(&fib_lock); + + list_del(&fa->fa_list); + + if(list_empty(fa_head)) { + hlist_del(&li->hlist); + kill_li = 1; + } + write_unlock_bh(&fib_lock); + + if(kill_li) + free_leaf_info(li); + + if(hlist_empty(&l->list)) + trie_leaf_remove(t, key); + + if (fa->fa_state & FA_S_ACCESSED) + rt_cache_flush(-1); + + fn_free_alias(fa); + return 0; + } + return -ESRCH; +} + +static int trie_flush_list(struct trie *t, struct list_head *head) +{ + struct fib_alias *fa, *fa_node; + int found = 0; + + list_for_each_entry_safe(fa, fa_node, head, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fi && (fi->fib_flags&RTNH_F_DEAD)) { + + write_lock_bh(&fib_lock); + list_del(&fa->fa_list); + write_unlock_bh(&fib_lock); + + fn_free_alias(fa); + found++; + } + } + return found; +} + +static int trie_flush_leaf(struct trie *t, struct leaf *l) +{ + int found = 0; + struct hlist_head *lih = &l->list; + struct hlist_node *node, *tmp; + struct leaf_info *li = NULL; + + hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { + + found += trie_flush_list(t, &li->falh); + + if (list_empty(&li->falh)) { + + write_lock_bh(&fib_lock); + hlist_del(&li->hlist); + write_unlock_bh(&fib_lock); + + free_leaf_info(li); + } + } + return found; +} + +static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) +{ + struct node *c = (struct node *) thisleaf; + struct tnode *p; + int idx; + + if(c == NULL) { + if(t->trie == NULL) + return NULL; + + if (IS_LEAF(t->trie)) /* trie w. just a leaf */ + return (struct leaf *) t->trie; + + p = (struct tnode*) t->trie; /* Start */ + } + else + p = (struct tnode *) NODE_PARENT(c); + while (p) { + int pos, last; + + /* Find the next child of the parent */ + if(c) + pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); + else + pos = 0; + + last = 1 << p->bits; + for(idx = pos; idx < last ; idx++) { + if( p->child[idx]) { + + /* Decend if tnode */ + + while (IS_TNODE(p->child[idx])) { + p = (struct tnode*) p->child[idx]; + idx = 0; + + /* Rightmost non-NULL branch */ + if( p && IS_TNODE(p) ) + while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++; + + /* Done with this tnode? */ + if( idx >= (1 << p->bits) || p->child[idx] == NULL ) + goto up; + } + return (struct leaf*) p->child[idx]; + } + } +up: + /* No more children go up one step */ + c = (struct node*) p; + p = (struct tnode *) NODE_PARENT(p); + } + return NULL; /* Ready. Root of trie */ +} + +static int fn_trie_flush(struct fib_table *tb) +{ + struct trie *t = (struct trie *) tb->tb_data; + struct leaf *ll = NULL, *l = NULL; + int found = 0, h; + + t->revision++; + + for (h=0; (l = nextleaf(t, l)) != NULL; h++) { + found += trie_flush_leaf(t, l); + + if (ll && hlist_empty(&ll->list)) + trie_leaf_remove(t, ll->key); + ll = l; + } + + if (ll && hlist_empty(&ll->list)) + trie_leaf_remove(t, ll->key); + + if(trie_debug) + printk("trie_flush found=%d\n", found); + return found; +} + +static int trie_last_dflt=-1; + +static void +fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +{ + struct trie *t = (struct trie *) tb->tb_data; + int order, last_idx; + struct fib_info *fi = NULL; + struct fib_info *last_resort; + struct fib_alias *fa = NULL; + struct list_head *fa_head; + struct leaf *l; + + last_idx = -1; + last_resort = NULL; + order = -1; + + read_lock(&fib_lock); + + l = fib_find_node(t, 0); + if(!l) + goto out; + + fa_head = get_fa_head(l, 0); + if(!fa_head) + goto out; + + if (list_empty(fa_head)) + goto out; + + list_for_each_entry(fa, fa_head, fa_list) { + struct fib_info *next_fi = fa->fa_info; + + if (fa->fa_scope != res->scope || + fa->fa_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || + next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + fa->fa_state |= FA_S_ACCESSED; + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, + &last_idx, &trie_last_dflt)) { + if (res->fi) + fib_info_put(res->fi); + res->fi = fi; + atomic_inc(&fi->fib_clntref); + trie_last_dflt = order; + goto out; + } + fi = next_fi; + order++; + } + if (order <= 0 || fi == NULL) { + trie_last_dflt = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) { + if (res->fi) + fib_info_put(res->fi); + res->fi = fi; + atomic_inc(&fi->fib_clntref); + trie_last_dflt = order; + goto out; + } + if (last_idx >= 0) { + if (res->fi) + fib_info_put(res->fi); + res->fi = last_resort; + if (last_resort) + atomic_inc(&last_resort->fib_clntref); + } + trie_last_dflt = last_idx; + out:; + read_unlock(&fib_lock); +} + +static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, s_i; + struct fib_alias *fa; + + u32 xkey=htonl(key); + + s_i=cb->args[3]; + i = 0; + + list_for_each_entry(fa, fah, fa_list) { + if (i < s_i) { + i++; + continue; + } + if (fa->fa_info->fib_nh == NULL) { + printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen); + i++; + continue; + } + if (fa->fa_info == NULL) { + printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen); + i++; + continue; + } + + if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->tb_id, + fa->fa_type, + fa->fa_scope, + &xkey, + plen, + fa->fa_tos, + fa->fa_info) < 0) { + cb->args[3] = i; + return -1; + } + i++; + } + cb->args[3]=i; + return skb->len; +} + +static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) +{ + int h, s_h; + struct list_head *fa_head; + struct leaf *l = NULL; + s_h=cb->args[2]; + + for (h=0; (l = nextleaf(t, l)) != NULL; h++) { + + if (h < s_h) + continue; + if (h > s_h) + memset(&cb->args[3], 0, + sizeof(cb->args) - 3*sizeof(cb->args[0])); + + fa_head = get_fa_head(l, plen); + + if(!fa_head) + continue; + + if(list_empty(fa_head)) + continue; + + if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { + cb->args[2]=h; + return -1; + } + } + cb->args[2]=h; + return skb->len; +} + +static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +{ + int m, s_m; + struct trie *t = (struct trie *) tb->tb_data; + + s_m = cb->args[1]; + + read_lock(&fib_lock); + for (m=0; m<=32; m++) { + + if (m < s_m) + continue; + if (m > s_m) + memset(&cb->args[2], 0, + sizeof(cb->args) - 2*sizeof(cb->args[0])); + + if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { + cb->args[1] = m; + goto out; + } + } + read_unlock(&fib_lock); + cb->args[1] = m; + return skb->len; + out: + read_unlock(&fib_lock); + return -1; +} + +/* Fix more generic FIB names for init later */ + +#ifdef CONFIG_IP_MULTIPLE_TABLES +struct fib_table * fib_hash_init(int id) +#else +struct fib_table * __init fib_hash_init(int id) +#endif +{ + struct fib_table *tb; + struct trie *t; + + if (fn_alias_kmem == NULL) + fn_alias_kmem = kmem_cache_create("ip_fib_alias", + sizeof(struct fib_alias), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), + GFP_KERNEL); + if (tb == NULL) + return NULL; + + tb->tb_id = id; + tb->tb_lookup = fn_trie_lookup; + tb->tb_insert = fn_trie_insert; + tb->tb_delete = fn_trie_delete; + tb->tb_flush = fn_trie_flush; + tb->tb_select_default = fn_trie_select_default; + tb->tb_dump = fn_trie_dump; + memset(tb->tb_data, 0, sizeof(struct trie)); + + t = (struct trie *) tb->tb_data; + + trie_init(t); + + if (id == RT_TABLE_LOCAL) + trie_local=t; + else if (id == RT_TABLE_MAIN) + trie_main=t; + + if (id == RT_TABLE_LOCAL) + printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); + + return tb; +} + +/* Trie dump functions */ + +static void putspace_seq(struct seq_file *seq, int n) +{ + while (n--) seq_printf(seq, " "); +} + +static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) +{ + while (bits--) + seq_printf(seq, "%s", (v & (1<: "); + seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); + + if (IS_LEAF(n)) + seq_printf(seq, "key=%d.%d.%d.%d\n", + n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); + else { + int plen=((struct tnode *)n)->pos; + t_key prf=MASK_PFX(n->key, plen); + seq_printf(seq, "key=%d.%d.%d.%d/%d\n", + prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); + } + if (IS_LEAF(n)) { + struct leaf *l=(struct leaf *)n; + struct fib_alias *fa; + int i; + for (i=32; i>=0; i--) + if(find_leaf_info(&l->list, i)) { + + struct list_head *fa_head = get_fa_head(l, i); + + if(!fa_head) + continue; + + if(list_empty(fa_head)) + continue; + + putspace_seq(seq, indent+2); + seq_printf(seq, "{/%d...dumping}\n", i); + + + list_for_each_entry(fa, fa_head, fa_list) { + putspace_seq(seq, indent+2); + if (fa->fa_info->fib_nh == NULL) { + seq_printf(seq, "Error _fib_nh=NULL\n"); + continue; + } + if (fa->fa_info == NULL) { + seq_printf(seq, "Error fa_info=NULL\n"); + continue; + } + + seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", + fa->fa_type, + fa->fa_scope, + fa->fa_tos); + } + } + } + else if (IS_TNODE(n)) { + struct tnode *tn=(struct tnode *)n; + putspace_seq(seq, indent); seq_printf(seq, "| "); + seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); + printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); + seq_printf(seq, "}\n"); + putspace_seq(seq, indent); seq_printf(seq, "| "); + seq_printf(seq, "{pos=%d", tn->pos); + seq_printf(seq, " (skip=%d bits)", tn->pos - pend); + seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits)); + putspace_seq(seq, indent); seq_printf(seq, "| "); + seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children); + } +} + +static void trie_dump_seq(struct seq_file *seq, struct trie *t) +{ + struct node *n=t->trie; + int cindex=0; + int indent=1; + int pend=0; + int depth = 0; + + read_lock(&fib_lock); + + seq_printf(seq, "------ trie_dump of t=%p ------\n", t); + if (n) { + printnode_seq(seq, indent, n, pend, cindex, 0); + if (IS_TNODE(n)) { + struct tnode *tn=(struct tnode *)n; + pend = tn->pos+tn->bits; + putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + indent += 3; + depth++; + + while (tn && cindex < (1 << tn->bits)) { + if (tn->child[cindex]) { + + /* Got a child */ + + printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); + if (IS_LEAF(tn->child[cindex])) { + cindex++; + + } + else { + /* + * New tnode. Decend one level + */ + + depth++; + n=tn->child[cindex]; + tn=(struct tnode *)n; + pend=tn->pos+tn->bits; + putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + indent+=3; + cindex=0; + } + } + else + cindex++; + + /* + * Test if we are done + */ + + while (cindex >= (1 << tn->bits)) { + + /* + * Move upwards and test for root + * pop off all traversed nodes + */ + + if (NODE_PARENT(tn) == NULL) { + tn = NULL; + n = NULL; + break; + } + else { + cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); + tn = NODE_PARENT(tn); + cindex++; + n=(struct node *)tn; + pend=tn->pos+tn->bits; + indent-=3; + depth--; + } + } + } + } + else n = NULL; + } + else seq_printf(seq, "------ trie is empty\n"); + + read_unlock(&fib_lock); +} + +static struct trie_stat *trie_stat_new(void) +{ + struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); + int i; + + if(s) { + s->totdepth = 0; + s->maxdepth = 0; + s->tnodes = 0; + s->leaves = 0; + s->nullpointers = 0; + + for(i=0; i< MAX_CHILDS; i++) + s->nodesizes[i] = 0; + } + return s; +} + +static struct trie_stat *trie_collect_stats(struct trie *t) +{ + struct node *n=t->trie; + struct trie_stat *s = trie_stat_new(); + int cindex = 0; + int indent = 1; + int pend = 0; + int depth = 0; + + read_lock(&fib_lock); + + if (s) { + if (n) { + if (IS_TNODE(n)) { + struct tnode *tn = (struct tnode *)n; + pend=tn->pos+tn->bits; + indent += 3; + s->nodesizes[tn->bits]++; + depth++; + + while (tn && cindex < (1 << tn->bits)) { + if (tn->child[cindex]) { + /* Got a child */ + + if (IS_LEAF(tn->child[cindex])) { + cindex++; + + /* stats */ + if (depth > s->maxdepth) + s->maxdepth = depth; + s->totdepth += depth; + s->leaves++; + } + + else { + /* + * New tnode. Decend one level + */ + + s->tnodes++; + s->nodesizes[tn->bits]++; + depth++; + + n = tn->child[cindex]; + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + + indent += 3; + cindex = 0; + } + } + else { + cindex++; + s->nullpointers++; + } + + /* + * Test if we are done + */ + + while (cindex >= (1 << tn->bits)) { + + /* + * Move upwards and test for root + * pop off all traversed nodes + */ + + + if (NODE_PARENT(tn) == NULL) { + tn = NULL; + n = NULL; + break; + } + else { + cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); + tn = NODE_PARENT(tn); + cindex++; + n = (struct node *)tn; + pend=tn->pos+tn->bits; + indent -= 3; + depth--; + } + } + } + } + else n = NULL; + } + } + + read_unlock(&fib_lock); + return s; +} + +#ifdef CONFIG_PROC_FS + +static struct fib_alias *fib_triestat_get_first(struct seq_file *seq) +{ + return NULL; +} + +static struct fib_alias *fib_triestat_get_next(struct seq_file *seq) +{ + return NULL; +} + +static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) +{ + void *v = NULL; + + if (ip_fib_main_table) + v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; + return v; +} + +static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); +} + +static void fib_triestat_seq_stop(struct seq_file *seq, void *v) +{ + +} + +/* + * This outputs /proc/net/fib_triestats + * + * It always works in backward compatibility mode. + * The format of the file is not supposed to be changed. + */ + +static void collect_and_show(struct trie *t, struct seq_file *seq) +{ + int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ + int i, max, pointers; + struct trie_stat *stat; + int avdepth; + + stat = trie_collect_stats(t); + + bytes=0; + seq_printf(seq, "trie=%p\n", t); + + if (stat) { + if (stat->leaves) + avdepth=stat->totdepth*100 / stat->leaves; + else + avdepth=0; + seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); + seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); + + seq_printf(seq, "Leaves: %d\n", stat->leaves); + bytes += sizeof(struct leaf) * stat->leaves; + seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); + bytes += sizeof(struct tnode) * stat->tnodes; + + max = MAX_CHILDS-1; + + while (max >= 0 && stat->nodesizes[max] == 0) + max--; + pointers = 0; + + for (i = 1; i <= max; i++) + if (stat->nodesizes[i] != 0) { + seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); + pointers += (1<nodesizes[i]; + } + seq_printf(seq, "\n"); + seq_printf(seq, "Pointers: %d\n", pointers); + bytes += sizeof(struct node *) * pointers; + seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers); + seq_printf(seq, "Total size: %d kB\n", bytes / 1024); + + kfree(stat); + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + seq_printf(seq, "Counters:\n---------\n"); + seq_printf(seq,"gets = %d\n", t->stats.gets); + seq_printf(seq,"backtracks = %d\n", t->stats.backtrack); + seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); + seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); + seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); +#ifdef CLEAR_STATS + memset(&(t->stats), 0, sizeof(t->stats)); +#endif +#endif /* CONFIG_IP_FIB_TRIE_STATS */ +} + +static int fib_triestat_seq_show(struct seq_file *seq, void *v) +{ + char bf[128]; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", + sizeof(struct leaf), sizeof(struct tnode)); + if (trie_local) + collect_and_show(trie_local, seq); + + if (trie_main) + collect_and_show(trie_main, seq); + } + else { + snprintf(bf, sizeof(bf), + "*\t%08X\t%08X", 200, 400); + + seq_printf(seq, "%-127s\n", bf); + } + return 0; +} + +static struct seq_operations fib_triestat_seq_ops = { + .start = fib_triestat_seq_start, + .next = fib_triestat_seq_next, + .stop = fib_triestat_seq_stop, + .show = fib_triestat_seq_show, +}; + +static int fib_triestat_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + + rc = seq_open(file, &fib_triestat_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; +out: + return rc; +out_kfree: + goto out; +} + +static struct file_operations fib_triestat_seq_fops = { + .owner = THIS_MODULE, + .open = fib_triestat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init fib_stat_proc_init(void) +{ + if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops)) + return -ENOMEM; + return 0; +} + +void __init fib_stat_proc_exit(void) +{ + proc_net_remove("fib_triestat"); +} + +static struct fib_alias *fib_trie_get_first(struct seq_file *seq) +{ + return NULL; +} + +static struct fib_alias *fib_trie_get_next(struct seq_file *seq) +{ + return NULL; +} + +static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) +{ + void *v = NULL; + + if (ip_fib_main_table) + v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; + return v; +} + +static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq); +} + +static void fib_trie_seq_stop(struct seq_file *seq, void *v) +{ + +} + +/* + * This outputs /proc/net/fib_trie. + * + * It always works in backward compatibility mode. + * The format of the file is not supposed to be changed. + */ + +static int fib_trie_seq_show(struct seq_file *seq, void *v) +{ + char bf[128]; + + if (v == SEQ_START_TOKEN) { + if (trie_local) + trie_dump_seq(seq, trie_local); + + if (trie_main) + trie_dump_seq(seq, trie_main); + } + + else { + snprintf(bf, sizeof(bf), + "*\t%08X\t%08X", 200, 400); + seq_printf(seq, "%-127s\n", bf); + } + + return 0; +} + +static struct seq_operations fib_trie_seq_ops = { + .start = fib_trie_seq_start, + .next = fib_trie_seq_next, + .stop = fib_trie_seq_stop, + .show = fib_trie_seq_show, +}; + +static int fib_trie_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + + rc = seq_open(file, &fib_trie_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; +out: + return rc; +out_kfree: + goto out; +} + +static struct file_operations fib_trie_seq_fops = { + .owner = THIS_MODULE, + .open = fib_trie_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init fib_proc_init(void) +{ + if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops)) + return -ENOMEM; + return 0; +} + +void __init fib_proc_exit(void) +{ + proc_net_remove("fib_trie"); +} + +#endif /* CONFIG_PROC_FS */ -- cgit From 0d51aa80a9b1db43920c0770c3bb842dd823c005 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Tue, 21 Jun 2005 13:51:04 -0700 Subject: [IPV6]: V6 route events reported with wrong netlink PID and seq number Essentially netlink at the moment always reports a pid and sequence of 0 always for v6 route activities. To understand the repurcassions of this look at: http://lists.quagga.net/pipermail/quagga-dev/2005-June/003507.html While fixing this, i took the liberty to resolve the outstanding issue of IPV6 routes inserted via ioctls to have the correct pids as well. This patch tries to behave as close as possible to the v4 routes i.e maintains whatever PID the socket issuing the command owns as opposed to the process. That made the patch a little bulky. I have tested against both netlink derived utility to add/del routes as well as ioctl derived one. The Quagga folks have tested against quagga. This fixes the problem and so far hasnt been detected to introduce any new issues. Signed-off-by: Jamal Hadi Salim Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 14 +++++----- net/ipv6/anycast.c | 4 +-- net/ipv6/ip6_fib.c | 19 ++++++------- net/ipv6/ndisc.c | 4 +-- net/ipv6/route.c | 78 ++++++++++++++++++++++++++++------------------------- 5 files changed, 62 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 47a30c3188e..14f5c53235f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -695,7 +695,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { if (onlink == 0) { - ip6_del_rt(rt, NULL, NULL); + ip6_del_rt(rt, NULL, NULL, NULL); rt = NULL; } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { rt->rt6i_expires = expires; @@ -1340,7 +1340,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) rtmsg.rtmsg_flags |= RTF_NONEXTHOP; - ip6_route_add(&rtmsg, NULL, NULL); + ip6_route_add(&rtmsg, NULL, NULL, NULL); } /* Create "default" multicast route to the interface */ @@ -1357,7 +1357,7 @@ static void addrconf_add_mroute(struct net_device *dev) rtmsg.rtmsg_ifindex = dev->ifindex; rtmsg.rtmsg_flags = RTF_UP; rtmsg.rtmsg_type = RTMSG_NEWROUTE; - ip6_route_add(&rtmsg, NULL, NULL); + ip6_route_add(&rtmsg, NULL, NULL, NULL); } static void sit_route_add(struct net_device *dev) @@ -1374,7 +1374,7 @@ static void sit_route_add(struct net_device *dev) rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; rtmsg.rtmsg_ifindex = dev->ifindex; - ip6_route_add(&rtmsg, NULL, NULL); + ip6_route_add(&rtmsg, NULL, NULL, NULL); } static void addrconf_add_lroute(struct net_device *dev) @@ -1467,7 +1467,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { if (rt->rt6i_flags&RTF_EXPIRES) { if (valid_lft == 0) { - ip6_del_rt(rt, NULL, NULL); + ip6_del_rt(rt, NULL, NULL, NULL); rt = NULL; } else { rt->rt6i_expires = rt_expires; @@ -3094,7 +3094,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) switch (event) { case RTM_NEWADDR: dst_hold(&ifp->rt->u.dst); - if (ip6_ins_rt(ifp->rt, NULL, NULL)) + if (ip6_ins_rt(ifp->rt, NULL, NULL, NULL)) dst_release(&ifp->rt->u.dst); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); @@ -3104,7 +3104,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); dst_hold(&ifp->rt->u.dst); - if (ip6_del_rt(ifp->rt, NULL, NULL)) + if (ip6_del_rt(ifp->rt, NULL, NULL, NULL)) dst_free(&ifp->rt->u.dst); else dst_release(&ifp->rt->u.dst); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 5d22ca3cca2..6b729404723 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -337,7 +337,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) write_unlock_bh(&idev->lock); dst_hold(&rt->u.dst); - if (ip6_ins_rt(rt, NULL, NULL)) + if (ip6_ins_rt(rt, NULL, NULL, NULL)) dst_release(&rt->u.dst); addrconf_join_solict(dev, &aca->aca_addr); @@ -380,7 +380,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) addrconf_leave_solict(idev, &aca->aca_addr); dst_hold(&aca->aca_rt->u.dst); - if (ip6_del_rt(aca->aca_rt, NULL, NULL)) + if (ip6_del_rt(aca->aca_rt, NULL, NULL, NULL)) dst_free(&aca->aca_rt->u.dst); else dst_release(&aca->aca_rt->u.dst); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 405740b75ab..1b354aa9793 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -394,7 +394,7 @@ insert_above: */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, struct netlink_skb_parms *req) { struct rt6_info *iter = NULL; struct rt6_info **ins; @@ -449,7 +449,7 @@ out: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, nlh); + inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req); rt6_stats.fib_rt_entries++; if ((fn->fn_flags & RTN_RTINFO) == 0) { @@ -479,7 +479,8 @@ void fib6_force_start_gc(void) * with source addr info in sub-trees */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +int fib6_add(struct fib6_node *root, struct rt6_info *rt, + struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req) { struct fib6_node *fn; int err = -ENOMEM; @@ -552,7 +553,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, } #endif - err = fib6_add_rt2node(fn, rt, nlh); + err = fib6_add_rt2node(fn, rt, nlh, req); if (err == 0) { fib6_start_gc(rt); @@ -859,7 +860,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) } static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nlmsghdr *nlh, void *_rtattr) + struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req) { struct fib6_walker_t *w; struct rt6_info *rt = *rtp; @@ -915,11 +916,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, if (atomic_read(&rt->rt6i_ref) != 1) BUG(); } - inet6_rt_notify(RTM_DELROUTE, rt, nlh); + inet6_rt_notify(RTM_DELROUTE, rt, nlh, req); rt6_release(rt); } -int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req) { struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; @@ -944,7 +945,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { if (*rtp == rt) { - fib6_del_route(fn, rtp, nlh, _rtattr); + fib6_del_route(fn, rtp, nlh, _rtattr, req); return 0; } } @@ -1073,7 +1074,7 @@ static int fib6_clean_node(struct fib6_walker_t *w) res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; - res = fib6_del(rt, NULL, NULL); + res = fib6_del(rt, NULL, NULL, NULL); if (res) { #if RT6_DEBUG >= 2 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7c291f4e9ed..7ae72d4c9bd 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -955,7 +955,7 @@ static void ndisc_recv_na(struct sk_buff *skb) struct rt6_info *rt; rt = rt6_get_dflt_router(saddr, dev); if (rt) - ip6_del_rt(rt, NULL, NULL); + ip6_del_rt(rt, NULL, NULL, NULL); } out: @@ -1096,7 +1096,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt && lifetime == 0) { neigh_clone(neigh); - ip6_del_rt(rt, NULL, NULL); + ip6_del_rt(rt, NULL, NULL, NULL); rt = NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1f5b226c357..878789b3122 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -384,12 +384,13 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, be destroyed. */ -int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, + void *_rtattr, struct netlink_skb_parms *req) { int err; write_lock_bh(&rt6_lock); - err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr); + err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req); write_unlock_bh(&rt6_lock); return err; @@ -400,7 +401,7 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) */ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, - struct in6_addr *saddr) + struct in6_addr *saddr, struct netlink_skb_parms *req) { int err; struct rt6_info *rt; @@ -432,7 +433,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, dst_hold(&rt->u.dst); - err = ip6_ins_rt(rt, NULL, NULL); + err = ip6_ins_rt(rt, NULL, NULL, req); if (err == 0) return rt; @@ -491,7 +492,8 @@ restart: read_unlock_bh(&rt6_lock); nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr, - &skb->nh.ipv6h->saddr); + &skb->nh.ipv6h->saddr, + &NETLINK_CB(skb)); dst_release(&rt->u.dst); rt = nrt; @@ -551,7 +553,7 @@ restart: dst_hold(&rt->u.dst); read_unlock_bh(&rt6_lock); - nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src); + nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src, NULL); dst_release(&rt->u.dst); rt = nrt; @@ -598,7 +600,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) if (rt) { if (rt->rt6i_flags & RTF_CACHE) - ip6_del_rt(rt, NULL, NULL); + ip6_del_rt(rt, NULL, NULL, NULL); else dst_release(dst); } @@ -787,7 +789,8 @@ int ipv6_get_hoplimit(struct net_device *dev) * */ -int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) +int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, + void *_rtattr, struct netlink_skb_parms *req) { int err; struct rtmsg *r; @@ -974,7 +977,7 @@ install_route: rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); rt->u.dst.dev = dev; rt->rt6i_idev = idev; - return ip6_ins_rt(rt, nlh, _rtattr); + return ip6_ins_rt(rt, nlh, _rtattr, req); out: if (dev) @@ -986,7 +989,7 @@ out: return err; } -int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req) { int err; @@ -994,7 +997,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) rt6_reset_dflt_pointer(NULL); - err = fib6_del(rt, nlh, _rtattr); + err = fib6_del(rt, nlh, _rtattr, req); dst_release(&rt->u.dst); write_unlock_bh(&rt6_lock); @@ -1002,7 +1005,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) return err; } -static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) +static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req) { struct fib6_node *fn; struct rt6_info *rt; @@ -1029,7 +1032,7 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_r dst_hold(&rt->u.dst); read_unlock_bh(&rt6_lock); - return ip6_del_rt(rt, nlh, _rtattr); + return ip6_del_rt(rt, nlh, _rtattr, req); } } read_unlock_bh(&rt6_lock); @@ -1136,11 +1139,11 @@ source_ok: nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst)); - if (ip6_ins_rt(nrt, NULL, NULL)) + if (ip6_ins_rt(nrt, NULL, NULL, NULL)) goto out; if (rt->rt6i_flags&RTF_CACHE) { - ip6_del_rt(rt, NULL, NULL); + ip6_del_rt(rt, NULL, NULL, NULL); return; } @@ -1204,7 +1207,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, 2. It is gatewayed route or NONEXTHOP route. Action: clone it. */ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { - nrt = rt6_cow(rt, daddr, saddr); + nrt = rt6_cow(rt, daddr, saddr, NULL); if (!nrt->u.dst.error) { nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; if (allfrag) @@ -1232,7 +1235,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; if (allfrag) nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; - ip6_ins_rt(nrt, NULL, NULL); + ip6_ins_rt(nrt, NULL, NULL, NULL); } out: @@ -1305,7 +1308,7 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, rtmsg.rtmsg_ifindex = dev->ifindex; - ip6_route_add(&rtmsg, NULL, NULL); + ip6_route_add(&rtmsg, NULL, NULL, NULL); return rt6_get_dflt_router(gwaddr, dev); } @@ -1323,7 +1326,7 @@ restart: read_unlock_bh(&rt6_lock); - ip6_del_rt(rt, NULL, NULL); + ip6_del_rt(rt, NULL, NULL, NULL); goto restart; } @@ -1349,10 +1352,10 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - err = ip6_route_add(&rtmsg, NULL, NULL); + err = ip6_route_add(&rtmsg, NULL, NULL, NULL); break; case SIOCDELRT: - err = ip6_route_del(&rtmsg, NULL, NULL); + err = ip6_route_del(&rtmsg, NULL, NULL, NULL); break; default: err = -EINVAL; @@ -1546,7 +1549,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) return -EINVAL; - return ip6_route_del(&rtmsg, nlh, arg); + return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb)); } int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) @@ -1556,7 +1559,7 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) return -EINVAL; - return ip6_route_add(&rtmsg, nlh, arg); + return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb)); } struct rt6_rtnl_dump_arg @@ -1566,12 +1569,9 @@ struct rt6_rtnl_dump_arg }; static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, - struct in6_addr *dst, - struct in6_addr *src, - int iif, - int type, u32 pid, u32 seq, - struct nlmsghdr *in_nlh, int prefix, - unsigned int flags) + struct in6_addr *dst, struct in6_addr *src, + int iif, int type, u32 pid, u32 seq, + int prefix, unsigned int flags) { struct rtmsg *rtm; struct nlmsghdr *nlh; @@ -1585,10 +1585,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, } } - if (!pid && in_nlh) { - pid = in_nlh->nlmsg_pid; - } - nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags); rtm = NLMSG_DATA(nlh); rtm->rtm_family = AF_INET6; @@ -1675,7 +1671,7 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg) return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, - NULL, prefix, NLM_F_MULTI); + prefix, NLM_F_MULTI); } static int fib6_dump_node(struct fib6_walker_t *w) @@ -1823,7 +1819,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) &fl.fl6_dst, &fl.fl6_src, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).pid, - nlh->nlmsg_seq, nlh, 0, 0); + nlh->nlmsg_seq, 0, 0); if (err < 0) { err = -EMSGSIZE; goto out_free; @@ -1839,17 +1835,25 @@ out_free: goto out; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, + struct netlink_skb_parms *req) { struct sk_buff *skb; int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); + u32 pid = current->pid; + u32 seq = 0; + if (req) + pid = req->pid; + if (nlh) + seq = nlh->nlmsg_seq; + skb = alloc_skb(size, gfp_any()); if (!skb) { netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); return; } - if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0, 0) < 0) { + if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) { kfree_skb(skb); netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); return; -- cgit From c9e3e8b6958e02230079e6817862ea2968509866 Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Tue, 21 Jun 2005 13:58:25 -0700 Subject: [IPV6]: multicast join and misc Here is a simplified version of the patch to fix a bug in IPv6 multicasting. It: 1) adds existence check & EADDRINUSE error for regular joins 2) adds an exception for EADDRINUSE in the source-specific multicast join (where a prior join is ok) 3) adds a missing/needed read_lock on sock_mc_list; would've raced with destroying the socket on interface down without 4) adds a "leave group" in the (INCLUDE, empty) source filter case. This frees unneeded socket buffer memory, but also prevents an inappropriate interaction among the 8 socket options that mess with this. Some would fail as if in the group when you aren't really. Item #4 had a locking bug in the last version of this patch; rather than removing the idev->lock read lock only, I've simplified it to remove all lock state in the path and treat it as a direct "leave group" call for the (INCLUDE,empty) case it covers. Tested on an MP machine. :-) Much thanks to HoerdtMickael who reported the original bug. Signed-off-by: David L Stevens Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 5 +++-- net/ipv6/mcast.c | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 279ab86be66..f3ef4c38d31 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -423,11 +423,12 @@ done: psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, &psin6->sin6_addr); - if (retv) + /* prior join w/ different source is ok */ + if (retv && retv != -EADDRINUSE) break; omode = MCAST_INCLUDE; add = 1; - } else /*IP_DROP_SOURCE_MEMBERSHIP */ { + } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 393b6e6f50a..c0ca92e8230 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -188,6 +188,16 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (!ipv6_addr_is_multicast(addr)) return -EINVAL; + read_lock_bh(&ipv6_sk_mc_lock); + for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) { + if ((ifindex == 0 || mc_lst->ifindex == ifindex) && + ipv6_addr_equal(&mc_lst->addr, addr)) { + read_unlock_bh(&ipv6_sk_mc_lock); + return -EADDRINUSE; + } + } + read_unlock_bh(&ipv6_sk_mc_lock); + mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); if (mc_lst == NULL) @@ -349,6 +359,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; int i, j, rv; + int leavegroup = 0; int err; if (pgsr->gsr_group.ss_family != AF_INET6 || @@ -368,6 +379,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, err = -EADDRNOTAVAIL; + read_lock_bh(&ipv6_sk_mc_lock); for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; @@ -401,6 +413,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (rv) /* source not found */ goto done; + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { + leavegroup = 1; + goto done; + } + /* update the interface filter */ ip6_mc_del_src(idev, group, omode, 1, source, 1); @@ -453,9 +471,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: + read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); in6_dev_put(idev); dev_put(dev); + if (leavegroup) + return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; } -- cgit From e45b1be8bcb3643808975a426fa3e201a2588e87 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:01:30 -0700 Subject: [NETFILTER]: Kill lockhelp.h Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/arp_tables.c | 1 - net/ipv4/netfilter/ip_conntrack_amanda.c | 7 +-- net/ipv4/netfilter/ip_conntrack_core.c | 84 ++++++++++++++-------------- net/ipv4/netfilter/ip_conntrack_ftp.c | 7 +-- net/ipv4/netfilter/ip_conntrack_irc.c | 7 +-- net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 23 ++++---- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 27 +++++---- net/ipv4/netfilter/ip_conntrack_standalone.c | 22 ++++---- net/ipv4/netfilter/ip_nat_core.c | 32 +++++------ net/ipv4/netfilter/ip_nat_helper.c | 10 ++-- net/ipv4/netfilter/ip_nat_rule.c | 4 +- net/ipv4/netfilter/ip_nat_standalone.c | 5 +- net/ipv4/netfilter/ip_tables.c | 1 - net/ipv4/netfilter/ipt_CLUSTERIP.c | 49 ++++++++-------- net/ipv4/netfilter/ipt_MASQUERADE.c | 10 ++-- net/ipv4/netfilter/ipt_ULOG.c | 15 +++-- net/ipv4/netfilter/ipt_hashlimit.c | 17 +++--- net/ipv4/netfilter/ipt_helper.c | 4 +- net/ipv6/netfilter/ip6_tables.c | 1 - 19 files changed, 158 insertions(+), 168 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index df79f5ed6a0..fa163425668 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -60,7 +60,6 @@ static DECLARE_MUTEX(arpt_mutex); #define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) #define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) -#include #include struct arpt_table_info { diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 3dbddd06260..a78a320eee0 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -26,7 +26,6 @@ #include #include -#include #include #include @@ -42,7 +41,7 @@ static char *conns[] = { "DATA ", "MESG ", "INDEX " }; /* This is slow, but it's simple. --RR */ static char amanda_buffer[65536]; -static DECLARE_LOCK(amanda_buffer_lock); +static DEFINE_SPINLOCK(amanda_buffer_lock); unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, @@ -76,7 +75,7 @@ static int help(struct sk_buff **pskb, return NF_ACCEPT; } - LOCK_BH(&amanda_buffer_lock); + spin_lock_bh(&amanda_buffer_lock); skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); data = amanda_buffer; data_limit = amanda_buffer + (*pskb)->len - dataoff; @@ -134,7 +133,7 @@ static int help(struct sk_buff **pskb, } out: - UNLOCK_BH(&amanda_buffer_lock); + spin_unlock_bh(&amanda_buffer_lock); return ret; } diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 09e82462297..a7377a331ad 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -38,10 +38,10 @@ #include #include -/* This rwlock protects the main hash table, protocol/helper/expected +/* ip_conntrack_lock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) #include #include @@ -57,7 +57,7 @@ #define DEBUGP(format, args...) #endif -DECLARE_RWLOCK(ip_conntrack_lock); +DEFINE_RWLOCK(ip_conntrack_lock); /* ip_conntrack_standalone needs this */ atomic_t ip_conntrack_count = ATOMIC_INIT(0); @@ -147,7 +147,7 @@ static void destroy_expect(struct ip_conntrack_expect *exp) static void unlink_expect(struct ip_conntrack_expect *exp) { - MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + ASSERT_WRITE_LOCK(&ip_conntrack_lock); list_del(&exp->list); /* Logically in destroy_expect, but we hold the lock here. */ exp->master->expecting--; @@ -157,9 +157,9 @@ static void expectation_timed_out(unsigned long ul_expect) { struct ip_conntrack_expect *exp = (void *)ul_expect; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); unlink_expect(exp); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); destroy_expect(exp); } @@ -209,7 +209,7 @@ clean_from_lists(struct ip_conntrack *ct) unsigned int ho, hr; DEBUGP("clean_from_lists(%p)\n", ct); - MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + ASSERT_WRITE_LOCK(&ip_conntrack_lock); ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -240,7 +240,7 @@ destroy_conntrack(struct nf_conntrack *nfct) if (ip_conntrack_destroyed) ip_conntrack_destroyed(ct); - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, @@ -254,7 +254,7 @@ destroy_conntrack(struct nf_conntrack *nfct) } CONNTRACK_STAT_INC(delete); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); if (ct->master) ip_conntrack_put(ct->master); @@ -268,12 +268,12 @@ static void death_by_timeout(unsigned long ul_conntrack) { struct ip_conntrack *ct = (void *)ul_conntrack; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ CONNTRACK_STAT_INC(delete_list); clean_from_lists(ct); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); ip_conntrack_put(ct); } @@ -282,7 +282,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { - MUST_BE_READ_LOCKED(&ip_conntrack_lock); + ASSERT_READ_LOCK(&ip_conntrack_lock); return tuplehash_to_ctrack(i) != ignored_conntrack && ip_ct_tuple_equal(tuple, &i->tuple); } @@ -294,7 +294,7 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, struct ip_conntrack_tuple_hash *h; unsigned int hash = hash_conntrack(tuple); - MUST_BE_READ_LOCKED(&ip_conntrack_lock); + ASSERT_READ_LOCK(&ip_conntrack_lock); list_for_each_entry(h, &ip_conntrack_hash[hash], list) { if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { CONNTRACK_STAT_INC(found); @@ -313,11 +313,11 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, { struct ip_conntrack_tuple_hash *h; - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); h = __ip_conntrack_find(tuple, ignored_conntrack); if (h) atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); return h; } @@ -352,7 +352,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb) IP_NF_ASSERT(!is_confirmed(ct)); DEBUGP("Confirming conntrack %p\n", ct); - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're @@ -380,12 +380,12 @@ __ip_conntrack_confirm(struct sk_buff **pskb) atomic_inc(&ct->ct_general.use); set_bit(IPS_CONFIRMED_BIT, &ct->status); CONNTRACK_STAT_INC(insert); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); return NF_ACCEPT; } CONNTRACK_STAT_INC(insert_failed); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); return NF_DROP; } @@ -398,9 +398,9 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, { struct ip_conntrack_tuple_hash *h; - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); h = __ip_conntrack_find(tuple, ignored_conntrack); - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); return h != NULL; } @@ -419,13 +419,13 @@ static int early_drop(struct list_head *chain) struct ip_conntrack *ct = NULL; int dropped = 0; - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); if (h) { ct = tuplehash_to_ctrack(h); atomic_inc(&ct->ct_general.use); } - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); if (!ct) return dropped; @@ -508,7 +508,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); exp = find_expectation(tuple); if (exp) { @@ -532,7 +532,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); atomic_inc(&ip_conntrack_count); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); if (exp) { if (exp->expectfn) @@ -723,17 +723,17 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) { struct ip_conntrack_expect *i; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* choose the the oldest expectation to evict */ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { if (expect_matches(i, exp) && del_timer(&i->timeout)) { unlink_expect(i); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); destroy_expect(i); return; } } - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); } struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) @@ -808,7 +808,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); list_for_each_entry(i, &ip_conntrack_expect_list, list) { if (expect_matches(i, expect)) { /* Refresh timer: if it's dying, ignore.. */ @@ -832,7 +832,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) ip_conntrack_expect_insert(expect); ret = 0; out: - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); return ret; } @@ -841,7 +841,7 @@ out: void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, const struct ip_conntrack_tuple *newreply) { - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* Should be unconfirmed, so not in hash table yet */ IP_NF_ASSERT(!is_confirmed(conntrack)); @@ -851,15 +851,15 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (!conntrack->master && conntrack->expecting == 0) conntrack->helper = ip_ct_find_helper(newreply); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); } int ip_conntrack_helper_register(struct ip_conntrack_helper *me) { BUG_ON(me->timeout == 0); - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); list_prepend(&helpers, me); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); return 0; } @@ -878,7 +878,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) struct ip_conntrack_expect *exp, *tmp; /* Need write lock here, to delete helper. */ - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); LIST_DELETE(&helpers, me); /* Get rid of expectations */ @@ -893,7 +893,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) for (i = 0; i < ip_conntrack_htable_size; i++) LIST_FIND_W(&ip_conntrack_hash[i], unhelp, struct ip_conntrack_tuple_hash *, me); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); /* Someone could be still looking at the helper in a bh. */ synchronize_net(); @@ -925,14 +925,14 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, ct->timeout.expires = extra_jiffies; ct_add_counters(ct, ctinfo, skb); } else { - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); /* Need del_timer for race avoidance (may already be dying). */ if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); } ct_add_counters(ct, ctinfo, skb); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); } } @@ -997,7 +997,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), { struct ip_conntrack_tuple_hash *h = NULL; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, struct ip_conntrack_tuple_hash *, iter, data); @@ -1009,7 +1009,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), struct ip_conntrack_tuple_hash *, iter, data); if (h) atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); return h; } @@ -1201,14 +1201,14 @@ int __init ip_conntrack_init(void) } /* Don't NEED lock here, but good form anyway. */ - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); for (i = 0; i < MAX_IP_CT_PROTO; i++) ip_ct_protos[i] = &ip_conntrack_generic_protocol; /* Sew in builtin protocols. */ ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); for (i = 0; i < ip_conntrack_htable_size; i++) INIT_LIST_HEAD(&ip_conntrack_hash[i]); diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index dd86503aa78..fea6dd2a00b 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include @@ -28,7 +27,7 @@ MODULE_DESCRIPTION("ftp connection tracking helper"); /* This is slow, but it's simple. --RR */ static char ftp_buffer[65536]; -static DECLARE_LOCK(ip_ftp_lock); +static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 static int ports[MAX_PORTS]; @@ -319,7 +318,7 @@ static int help(struct sk_buff **pskb, } datalen = (*pskb)->len - dataoff; - LOCK_BH(&ip_ftp_lock); + spin_lock_bh(&ip_ftp_lock); fb_ptr = skb_header_pointer(*pskb, dataoff, (*pskb)->len - dataoff, ftp_buffer); BUG_ON(fb_ptr == NULL); @@ -442,7 +441,7 @@ out_update_nl: if (ends_in_nl) update_nl_seq(seq, ct_ftp_info,dir); out: - UNLOCK_BH(&ip_ftp_lock); + spin_unlock_bh(&ip_ftp_lock); return ret; } diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 33cc7348b6e..cd98772cc33 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -29,7 +29,6 @@ #include #include -#include #include #include #include @@ -41,7 +40,7 @@ static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; /* This is slow, but it's simple. --RR */ static char irc_buffer[65536]; -static DECLARE_LOCK(irc_buffer_lock); +static DEFINE_SPINLOCK(irc_buffer_lock); unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, @@ -141,7 +140,7 @@ static int help(struct sk_buff **pskb, if (dataoff >= (*pskb)->len) return NF_ACCEPT; - LOCK_BH(&irc_buffer_lock); + spin_lock_bh(&irc_buffer_lock); ib_ptr = skb_header_pointer(*pskb, dataoff, (*pskb)->len - dataoff, irc_buffer); BUG_ON(ib_ptr == NULL); @@ -237,7 +236,7 @@ static int help(struct sk_buff **pskb, } /* while data < ... */ out: - UNLOCK_BH(&irc_buffer_lock); + spin_unlock_bh(&irc_buffer_lock); return ret; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index ff8c34a860f..31d75390bf1 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -26,7 +26,6 @@ #include #include -#include #if 0 #define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) @@ -35,7 +34,7 @@ #endif /* Protects conntrack->proto.sctp */ -static DECLARE_RWLOCK(sctp_lock); +static DEFINE_RWLOCK(sctp_lock); /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR @@ -199,9 +198,9 @@ static int sctp_print_conntrack(struct seq_file *s, DEBUGP(__FUNCTION__); DEBUGP("\n"); - READ_LOCK(&sctp_lock); + read_lock_bh(&sctp_lock); state = conntrack->proto.sctp.state; - READ_UNLOCK(&sctp_lock); + read_unlock_bh(&sctp_lock); return seq_printf(s, "%s ", sctp_conntrack_names[state]); } @@ -343,13 +342,13 @@ static int sctp_packet(struct ip_conntrack *conntrack, oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; for_each_sctp_chunk (skb, sch, _sch, offset, count) { - WRITE_LOCK(&sctp_lock); + write_lock_bh(&sctp_lock); /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* Sec 8.5.1 (A) */ if (sh->vtag != 0) { - WRITE_UNLOCK(&sctp_lock); + write_unlock_bh(&sctp_lock); return -1; } } else if (sch->type == SCTP_CID_ABORT) { @@ -357,7 +356,7 @@ static int sctp_packet(struct ip_conntrack *conntrack, if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) && !(sh->vtag == conntrack->proto.sctp.vtag [1 - CTINFO2DIR(ctinfo)])) { - WRITE_UNLOCK(&sctp_lock); + write_unlock_bh(&sctp_lock); return -1; } } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { @@ -366,13 +365,13 @@ static int sctp_packet(struct ip_conntrack *conntrack, && !(sh->vtag == conntrack->proto.sctp.vtag [1 - CTINFO2DIR(ctinfo)] && (sch->flags & 1))) { - WRITE_UNLOCK(&sctp_lock); + write_unlock_bh(&sctp_lock); return -1; } } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* Sec 8.5.1 (D) */ if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { - WRITE_UNLOCK(&sctp_lock); + write_unlock_bh(&sctp_lock); return -1; } } @@ -384,7 +383,7 @@ static int sctp_packet(struct ip_conntrack *conntrack, if (newconntrack == SCTP_CONNTRACK_MAX) { DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", CTINFO2DIR(ctinfo), sch->type, oldsctpstate); - WRITE_UNLOCK(&sctp_lock); + write_unlock_bh(&sctp_lock); return -1; } @@ -396,7 +395,7 @@ static int sctp_packet(struct ip_conntrack *conntrack, ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), sizeof(_inithdr), &_inithdr); if (ih == NULL) { - WRITE_UNLOCK(&sctp_lock); + write_unlock_bh(&sctp_lock); return -1; } DEBUGP("Setting vtag %x for dir %d\n", @@ -405,7 +404,7 @@ static int sctp_packet(struct ip_conntrack *conntrack, } conntrack->proto.sctp.state = newconntrack; - WRITE_UNLOCK(&sctp_lock); + write_unlock_bh(&sctp_lock); } ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 721ddbf522b..809dfed766d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -36,7 +36,6 @@ #include #include #include -#include #if 0 #define DEBUGP printk @@ -46,7 +45,7 @@ #endif /* Protects conntrack->proto.tcp */ -static DECLARE_RWLOCK(tcp_lock); +static DEFINE_RWLOCK(tcp_lock); /* "Be conservative in what you do, be liberal in what you accept from others." @@ -330,9 +329,9 @@ static int tcp_print_conntrack(struct seq_file *s, { enum tcp_conntrack state; - READ_LOCK(&tcp_lock); + read_lock_bh(&tcp_lock); state = conntrack->proto.tcp.state; - READ_UNLOCK(&tcp_lock); + read_unlock_bh(&tcp_lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } @@ -738,14 +737,14 @@ void ip_conntrack_tcp_update(struct sk_buff *skb, end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); - WRITE_LOCK(&tcp_lock); + write_lock_bh(&tcp_lock); /* * We have to worry for the ack in the reply packet only... */ if (after(end, conntrack->proto.tcp.seen[dir].td_end)) conntrack->proto.tcp.seen[dir].td_end = end; conntrack->proto.tcp.last_end = end; - WRITE_UNLOCK(&tcp_lock); + write_unlock_bh(&tcp_lock); DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -857,7 +856,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); - WRITE_LOCK(&tcp_lock); + write_lock_bh(&tcp_lock); old_state = conntrack->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); @@ -879,7 +878,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, * that the client cannot but retransmit its SYN and * thus initiate a clean new session. */ - WRITE_UNLOCK(&tcp_lock); + write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: killing out of sync session "); @@ -894,7 +893,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, conntrack->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); - WRITE_UNLOCK(&tcp_lock); + write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: invalid packet ignored "); @@ -904,7 +903,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); - WRITE_UNLOCK(&tcp_lock); + write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: invalid state "); @@ -918,13 +917,13 @@ static int tcp_packet(struct ip_conntrack *conntrack, conntrack->proto.tcp.seen[dir].td_end)) { /* Attempt to reopen a closed connection. * Delete this connection and look up again. */ - WRITE_UNLOCK(&tcp_lock); + write_unlock_bh(&tcp_lock); if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long) conntrack); return -NF_REPEAT; } else { - WRITE_UNLOCK(&tcp_lock); + write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, "ip_ct_tcp: invalid SYN"); @@ -949,7 +948,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, if (!tcp_in_window(&conntrack->proto.tcp, dir, index, skb, iph, th)) { - WRITE_UNLOCK(&tcp_lock); + write_unlock_bh(&tcp_lock); return -NF_ACCEPT; } in_window: @@ -972,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; - WRITE_UNLOCK(&tcp_lock); + write_unlock_bh(&tcp_lock); if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { /* If only reply is a RST, we can consider ourselves not to diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index bc59f7b3980..42dc9510287 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -28,8 +28,8 @@ #include #include -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) #include #include @@ -119,7 +119,7 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) static void *ct_seq_start(struct seq_file *seq, loff_t *pos) { - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); return ct_get_idx(seq, *pos); } @@ -131,7 +131,7 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) static void ct_seq_stop(struct seq_file *s, void *v) { - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); } static int ct_seq_show(struct seq_file *s, void *v) @@ -140,7 +140,7 @@ static int ct_seq_show(struct seq_file *s, void *v) const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); struct ip_conntrack_protocol *proto; - MUST_BE_READ_LOCKED(&ip_conntrack_lock); + ASSERT_READ_LOCK(&ip_conntrack_lock); IP_NF_ASSERT(conntrack); /* we only want to print DIR_ORIGINAL */ @@ -239,7 +239,7 @@ static void *exp_seq_start(struct seq_file *s, loff_t *pos) /* strange seq_file api calls stop even if we fail, * thus we need to grab lock since stop unlocks */ - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); if (list_empty(e)) return NULL; @@ -267,7 +267,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) static void exp_seq_stop(struct seq_file *s, void *v) { - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); } static int exp_seq_show(struct seq_file *s, void *v) @@ -921,22 +921,22 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto) { int ret = 0; - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { ret = -EBUSY; goto out; } ip_ct_protos[proto->proto] = proto; out: - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); return ret; } void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) { - WRITE_LOCK(&ip_conntrack_lock); + write_lock_bh(&ip_conntrack_lock); ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; - WRITE_UNLOCK(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); /* Somebody could be still looking at the proto in bh. */ synchronize_net(); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 9fc6f93af0d..739b6dde1c8 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -22,8 +22,8 @@ #include #include -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) #include #include @@ -41,7 +41,7 @@ #define DEBUGP(format, args...) #endif -DECLARE_RWLOCK(ip_nat_lock); +DEFINE_RWLOCK(ip_nat_lock); /* Calculated at init based on memory size */ static unsigned int ip_nat_htable_size; @@ -65,9 +65,9 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn) if (!(conn->status & IPS_NAT_DONE_MASK)) return; - WRITE_LOCK(&ip_nat_lock); + write_lock_bh(&ip_nat_lock); list_del(&conn->nat.info.bysource); - WRITE_UNLOCK(&ip_nat_lock); + write_unlock_bh(&ip_nat_lock); } /* We do checksum mangling, so if they were wrong before they're still @@ -142,7 +142,7 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple, unsigned int h = hash_by_src(tuple); struct ip_conntrack *ct; - READ_LOCK(&ip_nat_lock); + read_lock_bh(&ip_nat_lock); list_for_each_entry(ct, &bysource[h], nat.info.bysource) { if (same_src(ct, tuple)) { /* Copy source part from reply tuple. */ @@ -151,12 +151,12 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple, result->dst = tuple->dst; if (in_range(result, range)) { - READ_UNLOCK(&ip_nat_lock); + read_unlock_bh(&ip_nat_lock); return 1; } } } - READ_UNLOCK(&ip_nat_lock); + read_unlock_bh(&ip_nat_lock); return 0; } @@ -297,9 +297,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack, unsigned int srchash = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] .tuple); - WRITE_LOCK(&ip_nat_lock); + write_lock_bh(&ip_nat_lock); list_add(&info->bysource, &bysource[srchash]); - WRITE_UNLOCK(&ip_nat_lock); + write_unlock_bh(&ip_nat_lock); } /* It's done. */ @@ -474,23 +474,23 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto) { int ret = 0; - WRITE_LOCK(&ip_nat_lock); + write_lock_bh(&ip_nat_lock); if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { ret = -EBUSY; goto out; } ip_nat_protos[proto->protonum] = proto; out: - WRITE_UNLOCK(&ip_nat_lock); + write_unlock_bh(&ip_nat_lock); return ret; } /* Noone stores the protocol anywhere; simply delete it. */ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) { - WRITE_LOCK(&ip_nat_lock); + write_lock_bh(&ip_nat_lock); ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; - WRITE_UNLOCK(&ip_nat_lock); + write_unlock_bh(&ip_nat_lock); /* Someone could be still looking at the proto in a bh. */ synchronize_net(); @@ -509,13 +509,13 @@ int __init ip_nat_init(void) return -ENOMEM; /* Sew in builtin protocols. */ - WRITE_LOCK(&ip_nat_lock); + write_lock_bh(&ip_nat_lock); for (i = 0; i < MAX_IP_NAT_PROTO; i++) ip_nat_protos[i] = &ip_nat_unknown_protocol; ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; - WRITE_UNLOCK(&ip_nat_lock); + write_unlock_bh(&ip_nat_lock); for (i = 0; i < ip_nat_htable_size; i++) { INIT_LIST_HEAD(&bysource[i]); diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index 1637b96d8c0..9cd51f180dc 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -28,8 +28,8 @@ #include #include -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) #include #include @@ -47,7 +47,7 @@ #define DUMP_OFFSET(x) #endif -static DECLARE_LOCK(ip_nat_seqofs_lock); +static DEFINE_SPINLOCK(ip_nat_seqofs_lock); /* Setup TCP sequence correction given this change at this sequence */ static inline void @@ -70,7 +70,7 @@ adjust_tcp_sequence(u32 seq, DEBUGP("ip_nat_resize_packet: Seq_offset before: "); DUMP_OFFSET(this_way); - LOCK_BH(&ip_nat_seqofs_lock); + spin_lock_bh(&ip_nat_seqofs_lock); /* SYN adjust. If it's uninitialized, or this is after last * correction, record it: we don't handle more than one @@ -82,7 +82,7 @@ adjust_tcp_sequence(u32 seq, this_way->offset_before = this_way->offset_after; this_way->offset_after += sizediff; } - UNLOCK_BH(&ip_nat_seqofs_lock); + spin_unlock_bh(&ip_nat_seqofs_lock); DEBUGP("ip_nat_resize_packet: Seq_offset after: "); DUMP_OFFSET(this_way); diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c index 581f097f5a2..60d70fa41a1 100644 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -19,8 +19,8 @@ #include #include -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) #include #include diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 79f56f662b3..bc59d0d6e89 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -31,8 +31,8 @@ #include #include -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) #include #include @@ -373,7 +373,6 @@ static int init_or_cleanup(int init) cleanup_rule_init: ip_nat_rule_cleanup(); cleanup_nothing: - MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock); return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 8a54f92b849..c88dfcd38c5 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -67,7 +67,6 @@ static DECLARE_MUTEX(ipt_mutex); /* Must have mutex */ #define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) #define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) -#include #include #if 0 diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 0f12e3a3dc7..dc4362b57cf 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,7 +29,6 @@ #include #include #include -#include #define CLUSTERIP_VERSION "0.6" @@ -41,6 +40,8 @@ #define DEBUGP #endif +#define ASSERT_READ_LOCK(x) + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("iptables target for CLUSTERIP"); @@ -67,7 +68,7 @@ static LIST_HEAD(clusterip_configs); /* clusterip_lock protects the clusterip_configs list _AND_ the configurable * data within all structurses (num_local_nodes, local_nodes[]) */ -static DECLARE_RWLOCK(clusterip_lock); +static DEFINE_RWLOCK(clusterip_lock); #ifdef CONFIG_PROC_FS static struct file_operations clusterip_proc_fops; @@ -82,9 +83,9 @@ clusterip_config_get(struct clusterip_config *c) { static inline void clusterip_config_put(struct clusterip_config *c) { if (atomic_dec_and_test(&c->refcount)) { - WRITE_LOCK(&clusterip_lock); + write_lock_bh(&clusterip_lock); list_del(&c->list); - WRITE_UNLOCK(&clusterip_lock); + write_unlock_bh(&clusterip_lock); dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); dev_put(c->dev); kfree(c); @@ -97,7 +98,7 @@ __clusterip_config_find(u_int32_t clusterip) { struct list_head *pos; - MUST_BE_READ_LOCKED(&clusterip_lock); + ASSERT_READ_LOCK(&clusterip_lock); list_for_each(pos, &clusterip_configs) { struct clusterip_config *c = list_entry(pos, struct clusterip_config, list); @@ -114,14 +115,14 @@ clusterip_config_find_get(u_int32_t clusterip) { struct clusterip_config *c; - READ_LOCK(&clusterip_lock); + read_lock_bh(&clusterip_lock); c = __clusterip_config_find(clusterip); if (!c) { - READ_UNLOCK(&clusterip_lock); + read_unlock_bh(&clusterip_lock); return NULL; } atomic_inc(&c->refcount); - READ_UNLOCK(&clusterip_lock); + read_unlock_bh(&clusterip_lock); return c; } @@ -160,9 +161,9 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip, c->pde->data = c; #endif - WRITE_LOCK(&clusterip_lock); + write_lock_bh(&clusterip_lock); list_add(&c->list, &clusterip_configs); - WRITE_UNLOCK(&clusterip_lock); + write_unlock_bh(&clusterip_lock); return c; } @@ -172,25 +173,25 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) { int i; - WRITE_LOCK(&clusterip_lock); + write_lock_bh(&clusterip_lock); if (c->num_local_nodes >= CLUSTERIP_MAX_NODES || nodenum > CLUSTERIP_MAX_NODES) { - WRITE_UNLOCK(&clusterip_lock); + write_unlock_bh(&clusterip_lock); return 1; } /* check if we alrady have this number in our array */ for (i = 0; i < c->num_local_nodes; i++) { if (c->local_nodes[i] == nodenum) { - WRITE_UNLOCK(&clusterip_lock); + write_unlock_bh(&clusterip_lock); return 1; } } c->local_nodes[c->num_local_nodes++] = nodenum; - WRITE_UNLOCK(&clusterip_lock); + write_unlock_bh(&clusterip_lock); return 0; } @@ -199,10 +200,10 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) { int i; - WRITE_LOCK(&clusterip_lock); + write_lock_bh(&clusterip_lock); if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { - WRITE_UNLOCK(&clusterip_lock); + write_unlock_bh(&clusterip_lock); return 1; } @@ -211,12 +212,12 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); c->num_local_nodes--; - WRITE_UNLOCK(&clusterip_lock); + write_unlock_bh(&clusterip_lock); return 0; } } - WRITE_UNLOCK(&clusterip_lock); + write_unlock_bh(&clusterip_lock); return 1; } @@ -286,21 +287,21 @@ clusterip_responsible(struct clusterip_config *config, u_int32_t hash) { int i; - READ_LOCK(&clusterip_lock); + read_lock_bh(&clusterip_lock); if (config->num_local_nodes == 0) { - READ_UNLOCK(&clusterip_lock); + read_unlock_bh(&clusterip_lock); return 0; } for (i = 0; i < config->num_local_nodes; i++) { if (config->local_nodes[i] == hash) { - READ_UNLOCK(&clusterip_lock); + read_unlock_bh(&clusterip_lock); return 1; } } - READ_UNLOCK(&clusterip_lock); + read_unlock_bh(&clusterip_lock); return 0; } @@ -578,7 +579,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) struct clusterip_config *c = pde->data; unsigned int *nodeidx; - READ_LOCK(&clusterip_lock); + read_lock_bh(&clusterip_lock); if (*pos >= c->num_local_nodes) return NULL; @@ -608,7 +609,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v) { kfree(v); - READ_UNLOCK(&clusterip_lock); + read_unlock_bh(&clusterip_lock); } static int clusterip_seq_show(struct seq_file *s, void *v) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 57e9f6cf1c3..91e74502c3d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module"); #endif /* Lock protects masq region inside conntrack */ -static DECLARE_RWLOCK(masq_lock); +static DEFINE_RWLOCK(masq_lock); /* FIXME: Multiple targets. --RR */ static int @@ -103,9 +103,9 @@ masquerade_target(struct sk_buff **pskb, return NF_DROP; } - WRITE_LOCK(&masq_lock); + write_lock_bh(&masq_lock); ct->nat.masq_index = out->ifindex; - WRITE_UNLOCK(&masq_lock); + write_unlock_bh(&masq_lock); /* Transfer from original range. */ newrange = ((struct ip_nat_range) @@ -122,9 +122,9 @@ device_cmp(struct ip_conntrack *i, void *ifindex) { int ret; - READ_LOCK(&masq_lock); + read_lock_bh(&masq_lock); ret = (i->nat.masq_index == (int)(long)ifindex); - READ_UNLOCK(&masq_lock); + read_unlock_bh(&masq_lock); return ret; } diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 6f2cefbe16c..52a0076302a 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include @@ -99,8 +98,8 @@ typedef struct { static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ -static struct sock *nflognl; /* our socket */ -static DECLARE_LOCK(ulog_lock); /* spinlock */ +static struct sock *nflognl; /* our socket */ +static DEFINE_SPINLOCK(ulog_lock); /* spinlock */ /* send one ulog_buff_t to userspace */ static void ulog_send(unsigned int nlgroupnum) @@ -135,9 +134,9 @@ static void ulog_timer(unsigned long data) /* lock to protect against somebody modifying our structure * from ipt_ulog_target at the same time */ - LOCK_BH(&ulog_lock); + spin_lock_bh(&ulog_lock); ulog_send(data); - UNLOCK_BH(&ulog_lock); + spin_unlock_bh(&ulog_lock); } static struct sk_buff *ulog_alloc_skb(unsigned int size) @@ -193,7 +192,7 @@ static void ipt_ulog_packet(unsigned int hooknum, ub = &ulog_buffers[groupnum]; - LOCK_BH(&ulog_lock); + spin_lock_bh(&ulog_lock); if (!ub->skb) { if (!(ub->skb = ulog_alloc_skb(size))) @@ -278,7 +277,7 @@ static void ipt_ulog_packet(unsigned int hooknum, ulog_send(groupnum); } - UNLOCK_BH(&ulog_lock); + spin_unlock_bh(&ulog_lock); return; @@ -288,7 +287,7 @@ nlmsg_failure: alloc_failure: PRINTR("ipt_ULOG: Error building netlink message\n"); - UNLOCK_BH(&ulog_lock); + spin_unlock_bh(&ulog_lock); } static unsigned int ipt_ulog_target(struct sk_buff **pskb, diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index f1937190cd7..564b49bfebc 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -37,7 +37,6 @@ #include #include -#include /* FIXME: this is just for IP_NF_ASSERRT */ #include @@ -92,7 +91,7 @@ struct ipt_hashlimit_htable { struct hlist_head hash[0]; /* hashtable itself */ }; -static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ +static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ static HLIST_HEAD(hashlimit_htables); static kmem_cache_t *hashlimit_cachep; @@ -233,9 +232,9 @@ static int htable_create(struct ipt_hashlimit_info *minfo) hinfo->timer.function = htable_gc; add_timer(&hinfo->timer); - LOCK_BH(&hashlimit_lock); + spin_lock_bh(&hashlimit_lock); hlist_add_head(&hinfo->node, &hashlimit_htables); - UNLOCK_BH(&hashlimit_lock); + spin_unlock_bh(&hashlimit_lock); return 0; } @@ -301,15 +300,15 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name) struct ipt_hashlimit_htable *hinfo; struct hlist_node *pos; - LOCK_BH(&hashlimit_lock); + spin_lock_bh(&hashlimit_lock); hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { if (!strcmp(name, hinfo->pde->name)) { atomic_inc(&hinfo->use); - UNLOCK_BH(&hashlimit_lock); + spin_unlock_bh(&hashlimit_lock); return hinfo; } } - UNLOCK_BH(&hashlimit_lock); + spin_unlock_bh(&hashlimit_lock); return NULL; } @@ -317,9 +316,9 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name) static void htable_put(struct ipt_hashlimit_htable *hinfo) { if (atomic_dec_and_test(&hinfo->use)) { - LOCK_BH(&hashlimit_lock); + spin_lock_bh(&hashlimit_lock); hlist_del(&hinfo->node); - UNLOCK_BH(&hashlimit_lock); + spin_unlock_bh(&hashlimit_lock); htable_destroy(hinfo); } } diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c index 33fdf364d3d..3e7dd014de4 100644 --- a/net/ipv4/netfilter/ipt_helper.c +++ b/net/ipv4/netfilter/ipt_helper.c @@ -53,7 +53,7 @@ match(const struct sk_buff *skb, return ret; } - READ_LOCK(&ip_conntrack_lock); + read_lock_bh(&ip_conntrack_lock); if (!ct->master->helper) { DEBUGP("ipt_helper: master ct %p has no helper\n", exp->expectant); @@ -69,7 +69,7 @@ match(const struct sk_buff *skb, ret ^= !strncmp(ct->master->helper->name, info->name, strlen(ct->master->helper->name)); out_unlock: - READ_UNLOCK(&ip_conntrack_lock); + read_unlock_bh(&ip_conntrack_lock); return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c735276fdd5..73034511c8d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -71,7 +71,6 @@ static DECLARE_MUTEX(ip6t_mutex); /* Must have mutex */ #define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) #define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) -#include #include #if 0 -- cgit From 18b8afc771102b1b6af97962808291a7d27f52af Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:01:57 -0700 Subject: [NETFILTER]: Kill nf_debug Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 3 - net/bridge/br_input.c | 4 - net/bridge/br_netfilter.c | 38 --------- net/core/netfilter.c | 138 --------------------------------- net/core/skbuff.c | 6 -- net/ipv4/ip_input.c | 4 - net/ipv4/ip_output.c | 11 --- net/ipv4/ipvs/ip_vs_xmit.c | 1 - net/ipv4/netfilter/ip_conntrack_core.c | 9 --- net/ipv4/netfilter/ip_nat_helper.c | 3 - net/ipv6/ip6_output.c | 3 - 11 files changed, 220 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index ef9f2095f96..069253f830c 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -57,9 +57,6 @@ int br_forward_finish(struct sk_buff *skb) static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { skb->dev = to->dev; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, br_forward_finish); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8f5f2e73099..9a45e6279c5 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -23,11 +23,7 @@ const unsigned char bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; static int br_pass_frame_up_finish(struct sk_buff *skb) { -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif netif_receive_skb(skb); - return 0; } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index be03d3ad264..03ae4edddac 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -102,10 +102,6 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = skb->nf_bridge; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING); -#endif - if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->mask ^= BRNF_PKT_TYPE; @@ -182,10 +178,6 @@ static void __br_dnat_complain(void) * --Bart, 20021007 (updated) */ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) { -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug |= (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_FORWARD); -#endif - if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; skb->nf_bridge->mask |= BRNF_PKT_TYPE; @@ -207,10 +199,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb) struct iphdr *iph = skb->nh.iph; struct nf_bridge_info *nf_bridge = skb->nf_bridge; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING); -#endif - if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->mask ^= BRNF_PKT_TYPE; @@ -382,9 +370,6 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) goto inhdr_error; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug ^= (1 << NF_IP6_PRE_ROUTING); -#endif if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) return NF_DROP; setup_pre_routing(skb); @@ -468,9 +453,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, skb->ip_summed = CHECKSUM_NONE; } -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug ^= (1 << NF_IP_PRE_ROUTING); -#endif if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) return NF_DROP; setup_pre_routing(skb); @@ -517,10 +499,6 @@ static int br_nf_forward_finish(struct sk_buff *skb) struct net_device *in; struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug ^= (1 << NF_BR_FORWARD); -#endif - if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) { in = nf_bridge->physindev; if (nf_bridge->mask & BRNF_PKT_TYPE) { @@ -566,9 +544,6 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb, (*pskb)->nh.raw += VLAN_HLEN; } -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug ^= (1 << NF_BR_FORWARD); -#endif nf_bridge = skb->nf_bridge; if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; @@ -605,10 +580,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb, (*pskb)->nh.raw += VLAN_HLEN; } -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug ^= (1 << NF_BR_FORWARD); -#endif - if (skb->nh.arph->ar_pln != 4) { if (IS_VLAN_ARP) { skb_push(*pskb, VLAN_HLEN); @@ -627,9 +598,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb, /* PF_BRIDGE/LOCAL_OUT ***********************************************/ static int br_nf_local_out_finish(struct sk_buff *skb) { -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug &= ~(1 << NF_BR_LOCAL_OUT); -#endif if (skb->protocol == __constant_htons(ETH_P_8021Q)) { skb_push(skb, VLAN_HLEN); skb->nh.raw -= VLAN_HLEN; @@ -731,10 +699,6 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb, realoutdev, br_nf_local_out_finish, NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1); } else { -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug ^= (1 << NF_IP_LOCAL_OUT); -#endif - NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev, realoutdev, br_nf_local_out_finish, NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1); @@ -779,8 +743,6 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, printk(KERN_CRIT "br_netfilter: skb->dst == NULL."); goto print_error; } - - skb->nf_debug ^= (1 << NF_IP_POST_ROUTING); #endif /* We assume any code from br_dev_queue_push_xmit onwards doesn't care diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 22a8f127c4a..076c156d5ed 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -141,136 +141,6 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg) up(&nf_sockopt_mutex); } -#ifdef CONFIG_NETFILTER_DEBUG -#include -#include -#include - -static void debug_print_hooks_ip(unsigned int nf_debug) -{ - if (nf_debug & (1 << NF_IP_PRE_ROUTING)) { - printk("PRE_ROUTING "); - nf_debug ^= (1 << NF_IP_PRE_ROUTING); - } - if (nf_debug & (1 << NF_IP_LOCAL_IN)) { - printk("LOCAL_IN "); - nf_debug ^= (1 << NF_IP_LOCAL_IN); - } - if (nf_debug & (1 << NF_IP_FORWARD)) { - printk("FORWARD "); - nf_debug ^= (1 << NF_IP_FORWARD); - } - if (nf_debug & (1 << NF_IP_LOCAL_OUT)) { - printk("LOCAL_OUT "); - nf_debug ^= (1 << NF_IP_LOCAL_OUT); - } - if (nf_debug & (1 << NF_IP_POST_ROUTING)) { - printk("POST_ROUTING "); - nf_debug ^= (1 << NF_IP_POST_ROUTING); - } - if (nf_debug) - printk("Crap bits: 0x%04X", nf_debug); - printk("\n"); -} - -static void nf_dump_skb(int pf, struct sk_buff *skb) -{ - printk("skb: pf=%i %s dev=%s len=%u\n", - pf, - skb->sk ? "(owned)" : "(unowned)", - skb->dev ? skb->dev->name : "(no dev)", - skb->len); - switch (pf) { - case PF_INET: { - const struct iphdr *ip = skb->nh.iph; - __u32 *opt = (__u32 *) (ip + 1); - int opti; - __u16 src_port = 0, dst_port = 0; - - if (ip->protocol == IPPROTO_TCP - || ip->protocol == IPPROTO_UDP) { - struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl); - src_port = ntohs(tcp->source); - dst_port = ntohs(tcp->dest); - } - - printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu" - " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", - ip->protocol, NIPQUAD(ip->saddr), - src_port, NIPQUAD(ip->daddr), - dst_port, - ntohs(ip->tot_len), ip->tos, ntohs(ip->id), - ntohs(ip->frag_off), ip->ttl); - - for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) - printk(" O=0x%8.8X", *opt++); - printk("\n"); - } - } -} - -void nf_debug_ip_local_deliver(struct sk_buff *skb) -{ - /* If it's a loopback packet, it must have come through - * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and - * NF_IP_LOCAL_IN. Otherwise, must have gone through - * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */ - if (!skb->dev) { - printk("ip_local_deliver: skb->dev is NULL.\n"); - } else { - if (skb->nf_debug != ((1<nf_debug); - nf_dump_skb(PF_INET, skb); - } - } -} - -void nf_debug_ip_loopback_xmit(struct sk_buff *newskb) -{ - if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT) - | (1 << NF_IP_POST_ROUTING))) { - printk("ip_dev_loopback_xmit: bad owned skb = %p: ", - newskb); - debug_print_hooks_ip(newskb->nf_debug); - nf_dump_skb(PF_INET, newskb); - } -} - -void nf_debug_ip_finish_output2(struct sk_buff *skb) -{ - /* If it's owned, it must have gone through the - * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING. - * Otherwise, must have gone through - * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING. - */ - if (skb->sk) { - if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) - | (1 << NF_IP_POST_ROUTING))) { - printk("ip_finish_output: bad owned skb = %p: ", skb); - debug_print_hooks_ip(skb->nf_debug); - nf_dump_skb(PF_INET, skb); - } - } else { - if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING) - | (1 << NF_IP_FORWARD) - | (1 << NF_IP_POST_ROUTING))) { - /* Fragments, entunnelled packets, TCP RSTs - generated by ipt_REJECT will have no - owners, but still may be local */ - if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) - | (1 << NF_IP_POST_ROUTING))){ - printk("ip_finish_output:" - " bad unowned skb = %p: ",skb); - debug_print_hooks_ip(skb->nf_debug); - nf_dump_skb(PF_INET, skb); - } - } - } -} -#endif /*CONFIG_NETFILTER_DEBUG*/ - /* Call get/setsockopt() */ static int nf_sockopt(struct sock *sk, int pf, int val, char __user *opt, int *len, int get) @@ -488,14 +358,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); -#ifdef CONFIG_NETFILTER_DEBUG - if (unlikely((*pskb)->nf_debug & (1 << hook))) { - printk("nf_hook: hook %i already set.\n", hook); - nf_dump_skb(pf, *pskb); - } - (*pskb)->nf_debug |= (1 << hook); -#endif - elem = &nf_hooks[pf][hook]; next_hook: verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f65b3de590a..6d68c03bc05 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -365,9 +365,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); -#ifdef CONFIG_NETFILTER_DEBUG - C(nf_debug); -#endif #ifdef CONFIG_BRIDGE_NETFILTER C(nf_bridge); nf_bridge_get(skb->nf_bridge); @@ -432,9 +429,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->nfct = old->nfct; nf_conntrack_get(old->nfct); new->nfctinfo = old->nfctinfo; -#ifdef CONFIG_NETFILTER_DEBUG - new->nf_debug = old->nf_debug; -#endif #ifdef CONFIG_BRIDGE_NETFILTER new->nf_bridge = old->nf_bridge; nf_bridge_get(old->nf_bridge); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4e47a2658c7..2b7485e6504 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -200,10 +200,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) { int ihl = skb->nh.iph->ihl*4; -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_local_deliver(skb); -#endif /*CONFIG_NETFILTER_DEBUG*/ - __skb_pull(skb, ihl); /* Free reference early: we don't need it any more, and it may diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 760dc8238d6..ee07aec215a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -107,10 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; BUG_TRAP(newskb->dst); - -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_loopback_xmit(newskb); -#endif nf_reset(newskb); netif_rx(newskb); return 0; @@ -192,10 +188,6 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } -#ifdef CONFIG_NETFILTER_DEBUG - nf_debug_ip_finish_output2(skb); -#endif /*CONFIG_NETFILTER_DEBUG*/ - nf_reset(skb); if (hh) { @@ -415,9 +407,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->nf_bridge = from->nf_bridge; nf_bridge_get(to->nf_bridge); #endif -#ifdef CONFIG_NETFILTER_DEBUG - to->nf_debug = from->nf_debug; -#endif #endif } diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index de21da00057..a8512a3fd08 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -127,7 +127,6 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT(skb, rt) \ do { \ - nf_reset_debug(skb); \ (skb)->nfcache |= NFC_IPVS_PROPERTY; \ (skb)->ip_summed = CHECKSUM_NONE; \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a7377a331ad..ffba0ad3c88 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -940,10 +940,6 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) { -#ifdef CONFIG_NETFILTER_DEBUG - unsigned int olddebug = skb->nf_debug; -#endif - skb_orphan(skb); local_bh_disable(); @@ -953,12 +949,7 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) if (skb) { ip_send_check(skb->nh.iph); skb->nfcache |= NFC_ALTERED; -#ifdef CONFIG_NETFILTER_DEBUG - /* Packet path as if nothing had happened. */ - skb->nf_debug = olddebug; -#endif } - return skb; } diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index 9cd51f180dc..158f34f32c0 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -142,9 +142,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra) /* Transfer socket to new skb. */ if ((*pskb)->sk) skb_set_owner_w(nskb, (*pskb)->sk); -#ifdef CONFIG_NETFILTER_DEBUG - nskb->nf_debug = (*pskb)->nf_debug; -#endif kfree_skb(*pskb); *pskb = nskb; return 1; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b78a5358680..06e7cdaeedc 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -484,9 +484,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->nf_bridge = from->nf_bridge; nf_bridge_get(to->nf_bridge); #endif -#ifdef CONFIG_NETFILTER_DEBUG - to->nf_debug = from->nf_debug; -#endif #endif } -- cgit From e98231858bbfd2aca42f93d55133c2fca6df00f9 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:02:15 -0700 Subject: [NETFILTER]: Restore netfilter assumptions in IPv6 multicast Netfilter assumes that skb->data == skb->nh.ipv6h Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index c0ca92e8230..562fcd14fde 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1301,15 +1301,6 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) return NULL; skb_reserve(skb, LL_RESERVED_SPACE(dev)); - if (dev->hard_header) { - unsigned char ha[MAX_ADDR_LEN]; - - ndisc_mc_map(&mld2_all_mcr, ha, dev, 1); - if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) { - kfree_skb(skb); - return NULL; - } - } if (ipv6_get_lladdr(dev, &addr_buf)) { /* : @@ -1333,6 +1324,30 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) return skb; } +static inline int mld_dev_queue_xmit2(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + if (dev->hard_header) { + unsigned char ha[MAX_ADDR_LEN]; + int err; + + ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1); + err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len); + if (err < 0) { + kfree_skb(skb); + return err; + } + } + return dev_queue_xmit(skb); +} + +static inline int mld_dev_queue_xmit(struct sk_buff *skb) +{ + return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, + mld_dev_queue_xmit2); +} + static void mld_sendpack(struct sk_buff *skb) { struct ipv6hdr *pip6 = skb->nh.ipv6h; @@ -1350,7 +1365,7 @@ static void mld_sendpack(struct sk_buff *skb) pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0)); err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, - dev_queue_xmit); + mld_dev_queue_xmit); if (!err) { ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS); IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); @@ -1656,12 +1671,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) } skb_reserve(skb, LL_RESERVED_SPACE(dev)); - if (dev->hard_header) { - unsigned char ha[MAX_ADDR_LEN]; - ndisc_mc_map(snd_addr, ha, dev, 1); - if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0) - goto out; - } if (ipv6_get_lladdr(dev, &addr_buf)) { /* : @@ -1689,7 +1698,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) idev = in6_dev_get(skb->dev); err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, - dev_queue_xmit); + mld_dev_queue_xmit); if (!err) { if (type == ICMPV6_MGM_REDUCTION) ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS); @@ -1703,10 +1712,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) if (likely(idev != NULL)) in6_dev_put(idev); return; - -out: - IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); } static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, -- cgit From 1d3cdb41f52e299f70b66cbb569bff5216f3643a Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Tue, 21 Jun 2005 14:02:42 -0700 Subject: [NETFILTER]: expectation timeouts are compulsory Since expectation timeouts were made compulsory [1], there is no need to check for them in ip_conntrack_expect_insert. [1] https://lists.netfilter.org/pipermail/netfilter-devel/2005-January/018143.html Signed-off-by: Phil Oester Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index ffba0ad3c88..4b78ebeb663 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -760,15 +760,11 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) exp->master->expecting++; list_add(&exp->list, &ip_conntrack_expect_list); - if (exp->master->helper->timeout) { - init_timer(&exp->timeout); - exp->timeout.data = (unsigned long)exp; - exp->timeout.function = expectation_timed_out; - exp->timeout.expires - = jiffies + exp->master->helper->timeout * HZ; - add_timer(&exp->timeout); - } else - exp->timeout.function = NULL; + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); CONNTRACK_STAT_INC(expect_create); } -- cgit From 97216c799a6496163be8c81c9ceed297d92956c5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:03:01 -0700 Subject: [NETFILTER]: Missing owner-field initialization in ip6table_raw I missed this one when fixing up iptable_raw. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6table_raw.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 71407beaf79..c2982efd14a 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -129,13 +129,15 @@ static struct nf_hook_ops ip6t_ops[] = { .hook = ip6t_hook, .pf = PF_INET6, .hooknum = NF_IP6_PRE_ROUTING, - .priority = NF_IP6_PRI_FIRST + .priority = NF_IP6_PRI_FIRST, + .owner = THIS_MODULE, }, { .hook = ip6t_hook, .pf = PF_INET6, .hooknum = NF_IP6_LOCAL_OUT, - .priority = NF_IP6_PRI_FIRST + .priority = NF_IP6_PRI_FIRST, + .owner = THIS_MODULE, }, }; -- cgit From e3be8ba79294df5de96692411e122506b40c5aa4 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 21 Jun 2005 14:03:23 -0700 Subject: [NETFILTER]: Avoid unncessary checksum validation in UDP connection tracking Signed-off-by: Keir Fraser Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_udp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 5bc28a22462..8c1eaba098d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -120,6 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, * and moreover root might send raw packets. * FIXME: Source route IP option packets --RR */ if (hooknum == NF_IP_PRE_ROUTING + && skb->ip_summed != CHECKSUM_UNNECESSARY && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, udplen, 0))) { -- cgit From 6150bacfec95c7042678667561664efcf10d4508 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:03:46 -0700 Subject: [NETFILTER]: Check TCP checksum in ipt_REJECT Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_REJECT.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 266d6497928..91569644602 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -104,10 +104,12 @@ static inline struct rtable *route_reverse(struct sk_buff *skb, static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; + struct iphdr *iph = oldskb->nh.iph; struct tcphdr _otcph, *oth, *tcph; struct rtable *rt; u_int16_t tmp_port; u_int32_t tmp_addr; + unsigned int tcplen; int needs_ack; int hh_len; @@ -124,7 +126,16 @@ static void send_reset(struct sk_buff *oldskb, int hook) if (oth->rst) return; - /* FIXME: Check checksum --RR */ + /* Check checksum */ + tcplen = oldskb->len - iph->ihl * 4; + if (((hook != NF_IP_LOCAL_IN && oldskb->ip_summed != CHECKSUM_HW) || + (hook == NF_IP_LOCAL_IN && + oldskb->ip_summed != CHECKSUM_UNNECESSARY)) && + csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, + oldskb->ip_summed == CHECKSUM_HW ? oldskb->csum : + skb_checksum(oldskb, iph->ihl * 4, tcplen, 0))) + return; + if ((rt = route_reverse(oldskb, oth, hook)) == NULL) return; -- cgit From 2715bcf9efc34063e05009f188eb896c462ae925 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:06:24 -0700 Subject: [NETFILTER]: Drop conntrack reference in ip_call_ra_chain()/ip_mr_input() Drop reference before handing the packets to raw_rcv() Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 1 + net/ipv4/ipmr.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2b7485e6504..af2ec88bbb2 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -184,6 +184,7 @@ int ip_call_ra_chain(struct sk_buff *skb) raw_rcv(last, skb2); } last = sk; + nf_reset(skb); } } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e21c049ec62..e4f809a93f4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1350,6 +1350,7 @@ int ip_mr_input(struct sk_buff *skb) */ read_lock(&mrt_lock); if (mroute_socket) { + nf_reset(skb); raw_rcv(mroute_socket, skb); read_unlock(&mrt_lock); return 0; -- cgit From 047601bf7c803724bbab9a4fdbad9481cecc12e0 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:07:13 -0700 Subject: [NETFILTER]: Fix ip6t_LOG sit tunnel logging Sit tunnel logging is currently broken: MAC=01:23:45:67:89:ab->01:23:45:47:89:ac TUNNEL=123.123. 0.123-> 12.123. 6.123 Apart from the broken IP address, MAC addresses are printed differently for sit tunnels than for everything else. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6t_LOG.c | 54 +++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index bfc3d0185d1..c44685e391b 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -366,8 +366,6 @@ ip6t_log_packet(unsigned int hooknum, const char *level_string, const char *prefix) { - struct ipv6hdr *ipv6h = skb->nh.ipv6h; - spin_lock_bh(&log_lock); printk(level_string); printk("%sIN=%s OUT=%s ", @@ -377,39 +375,25 @@ ip6t_log_packet(unsigned int hooknum, if (in && !out) { /* MAC logging for input chain only. */ printk("MAC="); - if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) { - if (skb->dev->type != ARPHRD_SIT){ - int i; - unsigned char *p = skb->mac.raw; - for (i = 0; i < skb->dev->hard_header_len; i++,p++) - printk("%02x%c", *p, - i==skb->dev->hard_header_len - 1 - ? ' ':':'); - } else { - int i; - unsigned char *p = skb->mac.raw; - if ( p - (ETH_ALEN*2+2) > skb->head ){ - p -= (ETH_ALEN+2); - for (i = 0; i < (ETH_ALEN); i++,p++) - printk("%02x%s", *p, - i == ETH_ALEN-1 ? "->" : ":"); - p -= (ETH_ALEN*2); - for (i = 0; i < (ETH_ALEN); i++,p++) - printk("%02x%c", *p, - i == ETH_ALEN-1 ? ' ' : ':'); - } - - if ((skb->dev->addr_len == 4) && - skb->dev->hard_header_len > 20){ - printk("TUNNEL="); - p = skb->mac.raw + 12; - for (i = 0; i < 4; i++,p++) - printk("%3d%s", *p, - i == 3 ? "->" : "."); - for (i = 0; i < 4; i++,p++) - printk("%3d%c", *p, - i == 3 ? ' ' : '.'); - } + if (skb->dev && skb->dev->hard_header_len && + skb->mac.raw != skb->nh.raw) { + unsigned char *p = skb->mac.raw; + int i; + + if (skb->dev->type == ARPHRD_SIT && + (p -= ETH_HLEN) < skb->head) + p = NULL; + + if (p != NULL) + for (i = 0; i < skb->dev->hard_header_len; i++) + printk("%02x", p[i]); + printk(" "); + + if (skb->dev->type == ARPHRD_SIT) { + struct iphdr *iph = (struct iphdr *)skb->mac.raw; + printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ", + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); } } else printk(" "); -- cgit From 90f66914c89b0be63548d4387d1211280aa7bc8e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 21 Jun 2005 14:43:28 -0700 Subject: [IPV4]: Fix fib_trie.c's args to fib_dump_info(). Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c0ece94fc63..0671569ee6f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1786,7 +1786,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi &xkey, plen, fa->fa_tos, - fa->fa_info) < 0) { + fa->fa_info, 0) < 0) { cb->args[3] = i; return -1; } -- cgit