From 8d24c0b43125ec26cc80e04588477a9a2afc025c Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 2 Jan 2015 23:00:14 +0100
Subject: rhashtable: Do hashing inside of rhashtable_lookup_compare()

Hash the key inside of rhashtable_lookup_compare() like
rhashtable_lookup() does. This allows to simplify the hashing
functions and keep them private.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: netfilter-devel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 91 +++++++++++++++++++-------------------------------------
 1 file changed, 30 insertions(+), 61 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6c3c723e902b..1ee0eb636ca3 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -42,69 +42,39 @@ static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
 	return (void *) he - ht->p.head_offset;
 }
 
-static u32 __hashfn(const struct rhashtable *ht, const void *key,
-		      u32 len, u32 hsize)
+static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash)
 {
-	u32 h;
-
-	h = ht->p.hashfn(key, len, ht->p.hash_rnd);
-
-	return h & (hsize - 1);
-}
-
-/**
- * rhashtable_hashfn - compute hash for key of given length
- * @ht:		hash table to compute for
- * @key:	pointer to key
- * @len:	length of key
- *
- * Computes the hash value using the hash function provided in the 'hashfn'
- * of struct rhashtable_params. The returned value is guaranteed to be
- * smaller than the number of buckets in the hash table.
- */
-u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len)
-{
-	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
-
-	return __hashfn(ht, key, len, tbl->size);
+	return hash & (tbl->size - 1);
 }
-EXPORT_SYMBOL_GPL(rhashtable_hashfn);
 
-static u32 obj_hashfn(const struct rhashtable *ht, const void *ptr, u32 hsize)
+static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
 {
-	if (unlikely(!ht->p.key_len)) {
-		u32 h;
-
-		h = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
+	u32 hash;
 
-		return h & (hsize - 1);
-	}
+	if (unlikely(!ht->p.key_len))
+		hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
+	else
+		hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len,
+				    ht->p.hash_rnd);
 
-	return __hashfn(ht, ptr + ht->p.key_offset, ht->p.key_len, hsize);
+	return hash;
 }
 
-/**
- * rhashtable_obj_hashfn - compute hash for hashed object
- * @ht:		hash table to compute for
- * @ptr:	pointer to hashed object
- *
- * Computes the hash value using the hash function `hashfn` respectively
- * 'obj_hashfn' depending on whether the hash table is set up to work with
- * a fixed length key. The returned value is guaranteed to be smaller than
- * the number of buckets in the hash table.
- */
-u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr)
+static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len)
 {
 	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+	u32 hash;
+
+	hash = ht->p.hashfn(key, len, ht->p.hash_rnd);
 
-	return obj_hashfn(ht, ptr, tbl->size);
+	return rht_bucket_index(tbl, hash);
 }
-EXPORT_SYMBOL_GPL(rhashtable_obj_hashfn);
 
 static u32 head_hashfn(const struct rhashtable *ht,
-		       const struct rhash_head *he, u32 hsize)
+		       const struct bucket_table *tbl,
+		       const struct rhash_head *he)
 {
-	return obj_hashfn(ht, rht_obj(ht, he), hsize);
+	return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
 }
 
 static struct bucket_table *bucket_table_alloc(size_t nbuckets)
@@ -170,9 +140,9 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 * reaches a node that doesn't hash to the same bucket as the
 	 * previous node p. Call the previous node p;
 	 */
-	h = head_hashfn(ht, p, new_tbl->size);
+	h = head_hashfn(ht, new_tbl, p);
 	rht_for_each(he, p->next, ht) {
-		if (head_hashfn(ht, he, new_tbl->size) != h)
+		if (head_hashfn(ht, new_tbl, he) != h)
 			break;
 		p = he;
 	}
@@ -184,7 +154,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	next = NULL;
 	if (he) {
 		rht_for_each(he, he->next, ht) {
-			if (head_hashfn(ht, he, new_tbl->size) == h) {
+			if (head_hashfn(ht, new_tbl, he) == h) {
 				next = he;
 				break;
 			}
@@ -237,9 +207,9 @@ int rhashtable_expand(struct rhashtable *ht)
 	 * single imprecise chain.
 	 */
 	for (i = 0; i < new_tbl->size; i++) {
-		h = i & (old_tbl->size - 1);
+		h = rht_bucket_index(old_tbl, i);
 		rht_for_each(he, old_tbl->buckets[h], ht) {
-			if (head_hashfn(ht, he, new_tbl->size) == i) {
+			if (head_hashfn(ht, new_tbl, he) == i) {
 				RCU_INIT_POINTER(new_tbl->buckets[i], he);
 				break;
 			}
@@ -353,7 +323,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 
 	ASSERT_RHT_MUTEX(ht);
 
-	hash = head_hashfn(ht, obj, tbl->size);
+	hash = head_hashfn(ht, tbl, obj);
 	RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
 	rcu_assign_pointer(tbl->buckets[hash], obj);
 	ht->nelems++;
@@ -413,7 +383,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 
 	ASSERT_RHT_MUTEX(ht);
 
-	h = head_hashfn(ht, obj, tbl->size);
+	h = head_hashfn(ht, tbl, obj);
 
 	pprev = &tbl->buckets[h];
 	rht_for_each(he, tbl->buckets[h], ht) {
@@ -452,7 +422,7 @@ void *rhashtable_lookup(const struct rhashtable *ht, const void *key)
 
 	BUG_ON(!ht->p.key_len);
 
-	h = __hashfn(ht, key, ht->p.key_len, tbl->size);
+	h = key_hashfn(ht, key, ht->p.key_len);
 	rht_for_each_rcu(he, tbl->buckets[h], ht) {
 		if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
 			   ht->p.key_len))
@@ -467,7 +437,7 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup);
 /**
  * rhashtable_lookup_compare - search hash table with compare function
  * @ht:		hash table
- * @hash:	hash value of desired entry
+ * @key:	the pointer to the key
  * @compare:	compare function, must return true on match
  * @arg:	argument passed on to compare function
  *
@@ -479,15 +449,14 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup);
  *
  * Returns the first entry on which the compare function returned true.
  */
-void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash,
+void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
 				bool (*compare)(void *, void *), void *arg)
 {
 	const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
 	struct rhash_head *he;
+	u32 hash;
 
-	if (unlikely(hash >= tbl->size))
-		return NULL;
-
+	hash = key_hashfn(ht, key, ht->p.key_len);
 	rht_for_each_rcu(he, tbl->buckets[hash], ht) {
 		if (!compare(rht_obj(ht, he), arg))
 			continue;
-- 
cgit v1.2.3


From a4b18cda4c2676a4b4b59622b2e0394dc153e00b Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 2 Jan 2015 23:00:15 +0100
Subject: rhashtable: Use rht_obj() instead of manual offset calculation

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 1ee0eb636ca3..b658245826a1 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -427,7 +427,7 @@ void *rhashtable_lookup(const struct rhashtable *ht, const void *key)
 		if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
 			   ht->p.key_len))
 			continue;
-		return (void *) he - ht->p.head_offset;
+		return rht_obj(ht, he);
 	}
 
 	return NULL;
@@ -460,7 +460,7 @@ void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
 	rht_for_each_rcu(he, tbl->buckets[hash], ht) {
 		if (!compare(rht_obj(ht, he), arg))
 			continue;
-		return (void *) he - ht->p.head_offset;
+		return rht_obj(ht, he);
 	}
 
 	return NULL;
-- 
cgit v1.2.3


From 88d6ed15acff1cb44b1d1f3c0a393b7f7744957a Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 2 Jan 2015 23:00:16 +0100
Subject: rhashtable: Convert bucket iterators to take table and index

This patch is in preparation to introduce per bucket spinlocks. It
extends all iterator macros to take the bucket table and bucket
index. It also introduces a new rht_dereference_bucket() to
handle protected accesses to buckets.

It introduces a barrier() to the RCU iterators to the prevent
the compiler from caching the first element.

The lockdep verifier is introduced as stub which always succeeds
and properly implement in the next patch when the locks are
introduced.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index b658245826a1..ce450d095fdf 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -35,6 +35,12 @@ int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
 	return ht->p.mutex_is_held(ht->p.parent);
 }
 EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
+
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+	return 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
 #endif
 
 static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
@@ -141,7 +147,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 * previous node p. Call the previous node p;
 	 */
 	h = head_hashfn(ht, new_tbl, p);
-	rht_for_each(he, p->next, ht) {
+	rht_for_each_continue(he, p->next, old_tbl, n) {
 		if (head_hashfn(ht, new_tbl, he) != h)
 			break;
 		p = he;
@@ -153,7 +159,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	next = NULL;
 	if (he) {
-		rht_for_each(he, he->next, ht) {
+		rht_for_each_continue(he, he->next, old_tbl, n) {
 			if (head_hashfn(ht, new_tbl, he) == h) {
 				next = he;
 				break;
@@ -208,7 +214,7 @@ int rhashtable_expand(struct rhashtable *ht)
 	 */
 	for (i = 0; i < new_tbl->size; i++) {
 		h = rht_bucket_index(old_tbl, i);
-		rht_for_each(he, old_tbl->buckets[h], ht) {
+		rht_for_each(he, old_tbl, h) {
 			if (head_hashfn(ht, new_tbl, he) == i) {
 				RCU_INIT_POINTER(new_tbl->buckets[i], he);
 				break;
@@ -286,7 +292,7 @@ int rhashtable_shrink(struct rhashtable *ht)
 		 * to the new bucket.
 		 */
 		for (pprev = &ntbl->buckets[i]; *pprev != NULL;
-		     pprev = &rht_dereference(*pprev, ht)->next)
+		     pprev = &rht_dereference_bucket(*pprev, ntbl, i)->next)
 			;
 		RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]);
 	}
@@ -386,7 +392,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 	h = head_hashfn(ht, tbl, obj);
 
 	pprev = &tbl->buckets[h];
-	rht_for_each(he, tbl->buckets[h], ht) {
+	rht_for_each(he, tbl, h) {
 		if (he != obj) {
 			pprev = &he->next;
 			continue;
@@ -423,7 +429,7 @@ void *rhashtable_lookup(const struct rhashtable *ht, const void *key)
 	BUG_ON(!ht->p.key_len);
 
 	h = key_hashfn(ht, key, ht->p.key_len);
-	rht_for_each_rcu(he, tbl->buckets[h], ht) {
+	rht_for_each_rcu(he, tbl, h) {
 		if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
 			   ht->p.key_len))
 			continue;
@@ -457,7 +463,7 @@ void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
 	u32 hash;
 
 	hash = key_hashfn(ht, key, ht->p.key_len);
-	rht_for_each_rcu(he, tbl->buckets[hash], ht) {
+	rht_for_each_rcu(he, tbl, hash) {
 		if (!compare(rht_obj(ht, he), arg))
 			continue;
 		return rht_obj(ht, he);
@@ -625,6 +631,7 @@ static int __init test_rht_lookup(struct rhashtable *ht)
 static void test_bucket_stats(struct rhashtable *ht, bool quiet)
 {
 	unsigned int cnt, rcu_cnt, i, total = 0;
+	struct rhash_head *pos;
 	struct test_obj *obj;
 	struct bucket_table *tbl;
 
@@ -635,14 +642,14 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet)
 		if (!quiet)
 			pr_info(" [%#4x/%zu]", i, tbl->size);
 
-		rht_for_each_entry_rcu(obj, tbl->buckets[i], node) {
+		rht_for_each_entry_rcu(obj, pos, tbl, i, node) {
 			cnt++;
 			total++;
 			if (!quiet)
 				pr_cont(" [%p],", obj);
 		}
 
-		rht_for_each_entry_rcu(obj, tbl->buckets[i], node)
+		rht_for_each_entry_rcu(obj, pos, tbl, i, node)
 			rcu_cnt++;
 
 		if (rcu_cnt != cnt)
@@ -664,7 +671,8 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet)
 static int __init test_rhashtable(struct rhashtable *ht)
 {
 	struct bucket_table *tbl;
-	struct test_obj *obj, *next;
+	struct test_obj *obj;
+	struct rhash_head *pos, *next;
 	int err;
 	unsigned int i;
 
@@ -733,7 +741,7 @@ static int __init test_rhashtable(struct rhashtable *ht)
 error:
 	tbl = rht_dereference_rcu(ht->tbl, ht);
 	for (i = 0; i < tbl->size; i++)
-		rht_for_each_entry_safe(obj, next, tbl->buckets[i], ht, node)
+		rht_for_each_entry_safe(obj, pos, next, tbl, i, node)
 			kfree(obj);
 
 	return err;
-- 
cgit v1.2.3


From b8e1943e9f754219bcfb40bac4a605b5348acb25 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 2 Jan 2015 23:00:17 +0100
Subject: rhashtable: Factor out bucket_tail() function

Subsequent patches will require access to the bucket tail. Access
to the tail is relatively cheap as the automatic resizing of the
table should keep the number of entries per bucket to no more
than 0.75 on average.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index ce450d095fdf..0bd29c178910 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -83,6 +83,18 @@ static u32 head_hashfn(const struct rhashtable *ht,
 	return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
 }
 
+static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
+{
+	struct rhash_head __rcu **pprev;
+
+	for (pprev = &tbl->buckets[n];
+	     rht_dereference_bucket(*pprev, tbl, n);
+	     pprev = &rht_dereference_bucket(*pprev, tbl, n)->next)
+		;
+
+	return pprev;
+}
+
 static struct bucket_table *bucket_table_alloc(size_t nbuckets)
 {
 	struct bucket_table *tbl;
@@ -266,7 +278,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
 int rhashtable_shrink(struct rhashtable *ht)
 {
 	struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht);
-	struct rhash_head __rcu **pprev;
 	unsigned int i;
 
 	ASSERT_RHT_MUTEX(ht);
@@ -286,15 +297,9 @@ int rhashtable_shrink(struct rhashtable *ht)
 	 */
 	for (i = 0; i < ntbl->size; i++) {
 		ntbl->buckets[i] = tbl->buckets[i];
+		RCU_INIT_POINTER(*bucket_tail(ntbl, i),
+				 tbl->buckets[i + ntbl->size]);
 
-		/* Link each bucket in the new table to the first bucket
-		 * in the old table that contains entries which will hash
-		 * to the new bucket.
-		 */
-		for (pprev = &ntbl->buckets[i]; *pprev != NULL;
-		     pprev = &rht_dereference_bucket(*pprev, ntbl, i)->next)
-			;
-		RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]);
 	}
 
 	/* Publish the new, valid hash table */
-- 
cgit v1.2.3


From 897362e446436d245972e72c6bc5b33bd7a5c659 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 2 Jan 2015 23:00:18 +0100
Subject: nft_hash: Remove rhashtable_remove_pprev()

The removal function of nft_hash currently stores a reference to the
previous element during lookup which is used to optimize removal later
on. This was possible because a lock is held throughout calling
rhashtable_lookup() and rhashtable_remove().

With the introdution of deferred table resizing in parallel to lookups
and insertions, the nftables lock will no longer synchronize all
table mutations and the stored pprev may become invalid.

Removing this optimization makes removal slightly more expensive on
average but allows taking the resize cost out of the insert and
remove path.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: netfilter-devel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 34 +++++++---------------------------
 1 file changed, 7 insertions(+), 27 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 0bd29c178910..e6b85c4a5828 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -344,32 +344,6 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 }
 EXPORT_SYMBOL_GPL(rhashtable_insert);
 
-/**
- * rhashtable_remove_pprev - remove object from hash table given previous element
- * @ht:		hash table
- * @obj:	pointer to hash head inside object
- * @pprev:	pointer to previous element
- *
- * Identical to rhashtable_remove() but caller is alreayd aware of the element
- * in front of the element to be deleted. This is in particular useful for
- * deletion when combined with walking or lookup.
- */
-void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
-			     struct rhash_head __rcu **pprev)
-{
-	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
-
-	ASSERT_RHT_MUTEX(ht);
-
-	RCU_INIT_POINTER(*pprev, obj->next);
-	ht->nelems--;
-
-	if (ht->p.shrink_decision &&
-	    ht->p.shrink_decision(ht, tbl->size))
-		rhashtable_shrink(ht);
-}
-EXPORT_SYMBOL_GPL(rhashtable_remove_pprev);
-
 /**
  * rhashtable_remove - remove object from hash table
  * @ht:		hash table
@@ -403,7 +377,13 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 			continue;
 		}
 
-		rhashtable_remove_pprev(ht, he, pprev);
+		RCU_INIT_POINTER(*pprev, he->next);
+		ht->nelems--;
+
+		if (ht->p.shrink_decision &&
+		    ht->p.shrink_decision(ht, tbl->size))
+			rhashtable_shrink(ht);
+
 		return true;
 	}
 
-- 
cgit v1.2.3


From 97defe1ecf868b8127f8e62395499d6a06e4c4b1 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 2 Jan 2015 23:00:20 +0100
Subject: rhashtable: Per bucket locks & deferred expansion/shrinking

Introduces an array of spinlocks to protect bucket mutations. The number
of spinlocks per CPU is configurable and selected based on the hash of
the bucket. This allows for parallel insertions and removals of entries
which do not share a lock.

The patch also defers expansion and shrinking to a worker queue which
allows insertion and removal from atomic context. Insertions and
deletions may occur in parallel to it and are only held up briefly
while the particular bucket is linked or unzipped.

Mutations of the bucket table pointer is protected by a new mutex, read
access is RCU protected.

In the event of an expansion or shrinking, the new bucket table allocated
is exposed as a so called future table as soon as the resize process
starts.  Lookups, deletions, and insertions will briefly use both tables.
The future table becomes the main table after an RCU grace period and
initial linking of the old to the new table was performed. Optimization
of the chains to make use of the new number of buckets follows only the
new table is in use.

The side effect of this is that during that RCU grace period, a bucket
traversal using any rht_for_each() variant on the main table will not see
any insertions performed during the RCU grace period which would at that
point land in the future table. The lookup will see them as it searches
both tables if needed.

Having multiple insertions and removals occur in parallel requires nelems
to become an atomic counter.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 458 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 344 insertions(+), 114 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index e6b85c4a5828..312e3437c7bc 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -26,19 +26,42 @@
 
 #define HASH_DEFAULT_SIZE	64UL
 #define HASH_MIN_SIZE		4UL
+#define BUCKET_LOCKS_PER_CPU   128UL
+
+enum {
+	RHT_LOCK_NORMAL,
+	RHT_LOCK_NESTED,
+	RHT_LOCK_NESTED2,
+};
+
+/* The bucket lock is selected based on the hash and protects mutations
+ * on a group of hash buckets.
+ *
+ * IMPORTANT: When holding the bucket lock of both the old and new table
+ * during expansions and shrinking, the old bucket lock must always be
+ * acquired first.
+ */
+static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
+{
+	return &tbl->locks[hash & tbl->locks_mask];
+}
 
 #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
+#define ASSERT_BUCKET_LOCK(TBL, HASH) \
+	BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH))
 
 #ifdef CONFIG_PROVE_LOCKING
-int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
 {
-	return ht->p.mutex_is_held(ht->p.parent);
+	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
 }
 EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
 
 int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
 {
-	return 1;
+	spinlock_t *lock = bucket_lock(tbl, hash);
+
+	return (debug_locks) ? lockdep_is_held(lock) : 1;
 }
 EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
 #endif
@@ -66,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
 	return hash;
 }
 
-static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len)
+static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
 {
 	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
 	u32 hash;
@@ -95,7 +118,49 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
 	return pprev;
 }
 
-static struct bucket_table *bucket_table_alloc(size_t nbuckets)
+static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
+{
+	unsigned int i, size;
+#if defined(CONFIG_PROVE_LOCKING)
+	unsigned int nr_pcpus = 2;
+#else
+	unsigned int nr_pcpus = num_possible_cpus();
+#endif
+
+	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
+	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+
+	/* Never allocate more than one lock per bucket */
+	size = min_t(unsigned int, size, tbl->size);
+
+	if (sizeof(spinlock_t) != 0) {
+#ifdef CONFIG_NUMA
+		if (size * sizeof(spinlock_t) > PAGE_SIZE)
+			tbl->locks = vmalloc(size * sizeof(spinlock_t));
+		else
+#endif
+		tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
+					   GFP_KERNEL);
+		if (!tbl->locks)
+			return -ENOMEM;
+		for (i = 0; i < size; i++)
+			spin_lock_init(&tbl->locks[i]);
+	}
+	tbl->locks_mask = size - 1;
+
+	return 0;
+}
+
+static void bucket_table_free(const struct bucket_table *tbl)
+{
+	if (tbl)
+		kvfree(tbl->locks);
+
+	kvfree(tbl);
+}
+
+static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
+					       size_t nbuckets)
 {
 	struct bucket_table *tbl;
 	size_t size;
@@ -110,12 +175,12 @@ static struct bucket_table *bucket_table_alloc(size_t nbuckets)
 
 	tbl->size = nbuckets;
 
-	return tbl;
-}
+	if (alloc_bucket_locks(ht, tbl) < 0) {
+		bucket_table_free(tbl);
+		return NULL;
+	}
 
-static void bucket_table_free(const struct bucket_table *tbl)
-{
-	kvfree(tbl);
+	return tbl;
 }
 
 /**
@@ -126,7 +191,7 @@ static void bucket_table_free(const struct bucket_table *tbl)
 bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size)
 {
 	/* Expand table when exceeding 75% load */
-	return ht->nelems > (new_size / 4 * 3);
+	return atomic_read(&ht->nelems) > (new_size / 4 * 3);
 }
 EXPORT_SYMBOL_GPL(rht_grow_above_75);
 
@@ -138,41 +203,59 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75);
 bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
 {
 	/* Shrink table beneath 30% load */
-	return ht->nelems < (new_size * 3 / 10);
+	return atomic_read(&ht->nelems) < (new_size * 3 / 10);
 }
 EXPORT_SYMBOL_GPL(rht_shrink_below_30);
 
 static void hashtable_chain_unzip(const struct rhashtable *ht,
 				  const struct bucket_table *new_tbl,
-				  struct bucket_table *old_tbl, size_t n)
+				  struct bucket_table *old_tbl,
+				  size_t old_hash)
 {
 	struct rhash_head *he, *p, *next;
-	unsigned int h;
+	spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
+	unsigned int new_hash, new_hash2;
+
+	ASSERT_BUCKET_LOCK(old_tbl, old_hash);
 
 	/* Old bucket empty, no work needed. */
-	p = rht_dereference(old_tbl->buckets[n], ht);
+	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
+				   old_hash);
 	if (!p)
 		return;
 
+	new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
+	new_bucket_lock = bucket_lock(new_tbl, new_hash);
+
 	/* Advance the old bucket pointer one or more times until it
 	 * reaches a node that doesn't hash to the same bucket as the
 	 * previous node p. Call the previous node p;
 	 */
-	h = head_hashfn(ht, new_tbl, p);
-	rht_for_each_continue(he, p->next, old_tbl, n) {
-		if (head_hashfn(ht, new_tbl, he) != h)
+	rht_for_each_continue(he, p->next, old_tbl, old_hash) {
+		new_hash2 = head_hashfn(ht, new_tbl, he);
+		if (new_hash != new_hash2)
 			break;
 		p = he;
 	}
-	RCU_INIT_POINTER(old_tbl->buckets[n], p->next);
+	rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
+
+	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+
+	/* If we have encountered an entry that maps to a different bucket in
+	 * the new table, lock down that bucket as well as we might cut off
+	 * the end of the chain.
+	 */
+	new_bucket_lock2 = bucket_lock(new_tbl, new_hash);
+	if (new_bucket_lock != new_bucket_lock2)
+		spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);
 
 	/* Find the subsequent node which does hash to the same
 	 * bucket as node P, or NULL if no such node exists.
 	 */
 	next = NULL;
 	if (he) {
-		rht_for_each_continue(he, he->next, old_tbl, n) {
-			if (head_hashfn(ht, new_tbl, he) == h) {
+		rht_for_each_continue(he, he->next, old_tbl, old_hash) {
+			if (head_hashfn(ht, new_tbl, he) == new_hash) {
 				next = he;
 				break;
 			}
@@ -182,7 +265,23 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	/* Set p's next pointer to that subsequent node pointer,
 	 * bypassing the nodes which do not hash to p's bucket
 	 */
-	RCU_INIT_POINTER(p->next, next);
+	rcu_assign_pointer(p->next, next);
+
+	if (new_bucket_lock != new_bucket_lock2)
+		spin_unlock_bh(new_bucket_lock2);
+	spin_unlock_bh(new_bucket_lock);
+}
+
+static void link_old_to_new(struct bucket_table *new_tbl,
+			    unsigned int new_hash, struct rhash_head *entry)
+{
+	spinlock_t *new_bucket_lock;
+
+	new_bucket_lock = bucket_lock(new_tbl, new_hash);
+
+	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
+	spin_unlock_bh(new_bucket_lock);
 }
 
 /**
@@ -195,43 +294,59 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
  * This function may only be called in a context where it is safe to call
  * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
  *
- * The caller must ensure that no concurrent table mutations take place.
- * It is however valid to have concurrent lookups if they are RCU protected.
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
  */
 int rhashtable_expand(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 	struct rhash_head *he;
-	unsigned int i, h;
-	bool complete;
+	spinlock_t *old_bucket_lock;
+	unsigned int new_hash, old_hash;
+	bool complete = false;
 
 	ASSERT_RHT_MUTEX(ht);
 
 	if (ht->p.max_shift && ht->shift >= ht->p.max_shift)
 		return 0;
 
-	new_tbl = bucket_table_alloc(old_tbl->size * 2);
+	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2);
 	if (new_tbl == NULL)
 		return -ENOMEM;
 
 	ht->shift++;
 
-	/* For each new bucket, search the corresponding old bucket
-	 * for the first entry that hashes to the new bucket, and
-	 * link the new bucket to that entry. Since all the entries
-	 * which will end up in the new bucket appear in the same
-	 * old bucket, this constructs an entirely valid new hash
-	 * table, but with multiple buckets "zipped" together into a
-	 * single imprecise chain.
+	/* Make insertions go into the new, empty table right away. Deletions
+	 * and lookups will be attempted in both tables until we synchronize.
+	 * The synchronize_rcu() guarantees for the new table to be picked up
+	 * so no new additions go into the old table while we relink.
+	 */
+	rcu_assign_pointer(ht->future_tbl, new_tbl);
+	synchronize_rcu();
+
+	/* For each new bucket, search the corresponding old bucket for the
+	 * first entry that hashes to the new bucket, and link the end of
+	 * newly formed bucket chain (containing entries added to future
+	 * table) to that entry. Since all the entries which will end up in
+	 * the new bucket appear in the same old bucket, this constructs an
+	 * entirely valid new hash table, but with multiple buckets
+	 * "zipped" together into a single imprecise chain.
 	 */
-	for (i = 0; i < new_tbl->size; i++) {
-		h = rht_bucket_index(old_tbl, i);
-		rht_for_each(he, old_tbl, h) {
-			if (head_hashfn(ht, new_tbl, he) == i) {
-				RCU_INIT_POINTER(new_tbl->buckets[i], he);
+	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
+		old_hash = rht_bucket_index(old_tbl, new_hash);
+		old_bucket_lock = bucket_lock(old_tbl, old_hash);
+
+		spin_lock_bh(old_bucket_lock);
+		rht_for_each(he, old_tbl, old_hash) {
+			if (head_hashfn(ht, new_tbl, he) == new_hash) {
+				link_old_to_new(new_tbl, new_hash, he);
 				break;
 			}
 		}
+		spin_unlock_bh(old_bucket_lock);
 	}
 
 	/* Publish the new table pointer. Lookups may now traverse
@@ -241,7 +356,7 @@ int rhashtable_expand(struct rhashtable *ht)
 	rcu_assign_pointer(ht->tbl, new_tbl);
 
 	/* Unzip interleaved hash chains */
-	do {
+	while (!complete && !ht->being_destroyed) {
 		/* Wait for readers. All new readers will see the new
 		 * table, and thus no references to the old table will
 		 * remain.
@@ -253,12 +368,17 @@ int rhashtable_expand(struct rhashtable *ht)
 		 * table): ...
 		 */
 		complete = true;
-		for (i = 0; i < old_tbl->size; i++) {
-			hashtable_chain_unzip(ht, new_tbl, old_tbl, i);
-			if (old_tbl->buckets[i] != NULL)
+		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+			old_bucket_lock = bucket_lock(old_tbl, old_hash);
+			spin_lock_bh(old_bucket_lock);
+
+			hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
+			if (old_tbl->buckets[old_hash] != NULL)
 				complete = false;
+
+			spin_unlock_bh(old_bucket_lock);
 		}
-	} while (!complete);
+	}
 
 	bucket_table_free(old_tbl);
 	return 0;
@@ -272,38 +392,65 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
  * This function may only be called in a context where it is safe to call
  * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
  *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
  * The caller must ensure that no concurrent table mutations take place.
  * It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
  */
 int rhashtable_shrink(struct rhashtable *ht)
 {
-	struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht);
-	unsigned int i;
+	struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
+	spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
+	unsigned int new_hash;
 
 	ASSERT_RHT_MUTEX(ht);
 
 	if (ht->shift <= ht->p.min_shift)
 		return 0;
 
-	ntbl = bucket_table_alloc(tbl->size / 2);
-	if (ntbl == NULL)
+	new_tbl = bucket_table_alloc(ht, tbl->size / 2);
+	if (new_tbl == NULL)
 		return -ENOMEM;
 
-	ht->shift--;
+	rcu_assign_pointer(ht->future_tbl, new_tbl);
+	synchronize_rcu();
 
-	/* Link each bucket in the new table to the first bucket
-	 * in the old table that contains entries which will hash
-	 * to the new bucket.
+	/* Link the first entry in the old bucket to the end of the
+	 * bucket in the new table. As entries are concurrently being
+	 * added to the new table, lock down the new bucket. As we
+	 * always divide the size in half when shrinking, each bucket
+	 * in the new table maps to exactly two buckets in the old
+	 * table.
+	 *
+	 * As removals can occur concurrently on the old table, we need
+	 * to lock down both matching buckets in the old table.
 	 */
-	for (i = 0; i < ntbl->size; i++) {
-		ntbl->buckets[i] = tbl->buckets[i];
-		RCU_INIT_POINTER(*bucket_tail(ntbl, i),
-				 tbl->buckets[i + ntbl->size]);
-
+	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
+		old_bucket_lock1 = bucket_lock(tbl, new_hash);
+		old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
+		new_bucket_lock = bucket_lock(new_tbl, new_hash);
+
+		spin_lock_bh(old_bucket_lock1);
+		spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
+		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
+
+		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
+				   tbl->buckets[new_hash]);
+		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
+				   tbl->buckets[new_hash + new_tbl->size]);
+
+		spin_unlock_bh(new_bucket_lock);
+		spin_unlock_bh(old_bucket_lock2);
+		spin_unlock_bh(old_bucket_lock1);
 	}
 
 	/* Publish the new, valid hash table */
-	rcu_assign_pointer(ht->tbl, ntbl);
+	rcu_assign_pointer(ht->tbl, new_tbl);
+	ht->shift--;
 
 	/* Wait for readers. No new readers will have references to the
 	 * old hash table.
@@ -316,31 +463,63 @@ int rhashtable_shrink(struct rhashtable *ht)
 }
 EXPORT_SYMBOL_GPL(rhashtable_shrink);
 
+static void rht_deferred_worker(struct work_struct *work)
+{
+	struct rhashtable *ht;
+	struct bucket_table *tbl;
+
+	ht = container_of(work, struct rhashtable, run_work.work);
+	mutex_lock(&ht->mutex);
+	tbl = rht_dereference(ht->tbl, ht);
+
+	if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
+		rhashtable_expand(ht);
+	else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size))
+		rhashtable_shrink(ht);
+
+	mutex_unlock(&ht->mutex);
+}
+
 /**
  * rhashtable_insert - insert object into hash hash table
  * @ht:		hash table
  * @obj:	pointer to hash head inside object
  *
- * Will automatically grow the table via rhashtable_expand() if the the
- * grow_decision function specified at rhashtable_init() returns true.
+ * Will take a per bucket spinlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket lock.
  *
- * The caller must ensure that no concurrent table mutations occur. It is
- * however valid to have concurrent lookups if they are RCU protected.
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if the size grows
+ * beyond the watermark indicated by grow_decision() which can be passed
+ * to rhashtable_init().
  */
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
-	u32 hash;
+	struct bucket_table *tbl;
+	spinlock_t *lock;
+	unsigned hash;
 
-	ASSERT_RHT_MUTEX(ht);
+	rcu_read_lock();
 
+	tbl = rht_dereference_rcu(ht->future_tbl, ht);
 	hash = head_hashfn(ht, tbl, obj);
+	lock = bucket_lock(tbl, hash);
+
+	spin_lock_bh(lock);
 	RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
 	rcu_assign_pointer(tbl->buckets[hash], obj);
-	ht->nelems++;
+	spin_unlock_bh(lock);
 
-	if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
-		rhashtable_expand(ht);
+	atomic_inc(&ht->nelems);
+
+	/* Only grow the table if no resizing is currently in progress. */
+	if (ht->tbl != ht->future_tbl &&
+	    ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
+		schedule_delayed_work(&ht->run_work, 0);
+
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rhashtable_insert);
 
@@ -361,32 +540,56 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  */
 bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *tbl;
 	struct rhash_head __rcu **pprev;
 	struct rhash_head *he;
-	u32 h;
+	spinlock_t *lock;
+	unsigned int hash;
 
-	ASSERT_RHT_MUTEX(ht);
+	rcu_read_lock();
+	tbl = rht_dereference_rcu(ht->tbl, ht);
+	hash = head_hashfn(ht, tbl, obj);
 
-	h = head_hashfn(ht, tbl, obj);
+	lock = bucket_lock(tbl, hash);
+	spin_lock_bh(lock);
 
-	pprev = &tbl->buckets[h];
-	rht_for_each(he, tbl, h) {
+restart:
+	pprev = &tbl->buckets[hash];
+	rht_for_each(he, tbl, hash) {
 		if (he != obj) {
 			pprev = &he->next;
 			continue;
 		}
 
-		RCU_INIT_POINTER(*pprev, he->next);
-		ht->nelems--;
+		rcu_assign_pointer(*pprev, obj->next);
+		atomic_dec(&ht->nelems);
 
-		if (ht->p.shrink_decision &&
+		spin_unlock_bh(lock);
+
+		if (ht->tbl != ht->future_tbl &&
+		    ht->p.shrink_decision &&
 		    ht->p.shrink_decision(ht, tbl->size))
-			rhashtable_shrink(ht);
+			schedule_delayed_work(&ht->run_work, 0);
+
+		rcu_read_unlock();
 
 		return true;
 	}
 
+	if (tbl != rht_dereference_rcu(ht->tbl, ht)) {
+		spin_unlock_bh(lock);
+
+		tbl = rht_dereference_rcu(ht->tbl, ht);
+		hash = head_hashfn(ht, tbl, obj);
+
+		lock = bucket_lock(tbl, hash);
+		spin_lock_bh(lock);
+		goto restart;
+	}
+
+	spin_unlock_bh(lock);
+	rcu_read_unlock();
+
 	return false;
 }
 EXPORT_SYMBOL_GPL(rhashtable_remove);
@@ -402,25 +605,35 @@ EXPORT_SYMBOL_GPL(rhashtable_remove);
  * This lookup function may only be used for fixed key hash table (key_len
  * paramter set). It will BUG() if used inappropriately.
  *
- * Lookups may occur in parallel with hash mutations as long as the lookup is
- * guarded by rcu_read_lock(). The caller must take care of this.
+ * Lookups may occur in parallel with hashtable mutations and resizing.
  */
-void *rhashtable_lookup(const struct rhashtable *ht, const void *key)
+void *rhashtable_lookup(struct rhashtable *ht, const void *key)
 {
-	const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+	const struct bucket_table *tbl, *old_tbl;
 	struct rhash_head *he;
-	u32 h;
+	u32 hash;
 
 	BUG_ON(!ht->p.key_len);
 
-	h = key_hashfn(ht, key, ht->p.key_len);
-	rht_for_each_rcu(he, tbl, h) {
+	rcu_read_lock();
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	hash = key_hashfn(ht, key, ht->p.key_len);
+restart:
+	rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
 		if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
 			   ht->p.key_len))
 			continue;
+		rcu_read_unlock();
 		return rht_obj(ht, he);
 	}
 
+	if (unlikely(tbl != old_tbl)) {
+		tbl = old_tbl;
+		goto restart;
+	}
+
+	rcu_read_unlock();
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(rhashtable_lookup);
@@ -435,25 +648,36 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup);
  * Traverses the bucket chain behind the provided hash value and calls the
  * specified compare function for each entry.
  *
- * Lookups may occur in parallel with hash mutations as long as the lookup is
- * guarded by rcu_read_lock(). The caller must take care of this.
+ * Lookups may occur in parallel with hashtable mutations and resizing.
  *
  * Returns the first entry on which the compare function returned true.
  */
-void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
+void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
 				bool (*compare)(void *, void *), void *arg)
 {
-	const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+	const struct bucket_table *tbl, *old_tbl;
 	struct rhash_head *he;
 	u32 hash;
 
+	rcu_read_lock();
+
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	tbl = rht_dereference_rcu(ht->future_tbl, ht);
 	hash = key_hashfn(ht, key, ht->p.key_len);
-	rht_for_each_rcu(he, tbl, hash) {
+restart:
+	rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
 		if (!compare(rht_obj(ht, he), arg))
 			continue;
+		rcu_read_unlock();
 		return rht_obj(ht, he);
 	}
 
+	if (unlikely(tbl != old_tbl)) {
+		tbl = old_tbl;
+		goto restart;
+	}
+	rcu_read_unlock();
+
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
@@ -485,9 +709,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
  *	.key_offset = offsetof(struct test_obj, key),
  *	.key_len = sizeof(int),
  *	.hashfn = jhash,
- * #ifdef CONFIG_PROVE_LOCKING
- *	.mutex_is_held = &my_mutex_is_held,
- * #endif
  * };
  *
  * Configuration Example 2: Variable length keys
@@ -507,9 +728,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
  *	.head_offset = offsetof(struct test_obj, node),
  *	.hashfn = jhash,
  *	.obj_hashfn = my_hash_fn,
- * #ifdef CONFIG_PROVE_LOCKING
- *	.mutex_is_held = &my_mutex_is_held,
- * #endif
  * };
  */
 int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
@@ -529,18 +747,29 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
 	if (params->nelem_hint)
 		size = rounded_hashtable_size(params);
 
-	tbl = bucket_table_alloc(size);
+	memset(ht, 0, sizeof(*ht));
+	mutex_init(&ht->mutex);
+	memcpy(&ht->p, params, sizeof(*params));
+
+	if (params->locks_mul)
+		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
+	else
+		ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+
+	tbl = bucket_table_alloc(ht, size);
 	if (tbl == NULL)
 		return -ENOMEM;
 
-	memset(ht, 0, sizeof(*ht));
 	ht->shift = ilog2(tbl->size);
-	memcpy(&ht->p, params, sizeof(*params));
 	RCU_INIT_POINTER(ht->tbl, tbl);
+	RCU_INIT_POINTER(ht->future_tbl, tbl);
 
 	if (!ht->p.hash_rnd)
 		get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
 
+	if (ht->p.grow_decision || ht->p.shrink_decision)
+		INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rhashtable_init);
@@ -553,9 +782,16 @@ EXPORT_SYMBOL_GPL(rhashtable_init);
  * has to make sure that no resizing may happen by unpublishing the hashtable
  * and waiting for the quiescent cycle before releasing the bucket array.
  */
-void rhashtable_destroy(const struct rhashtable *ht)
+void rhashtable_destroy(struct rhashtable *ht)
 {
-	bucket_table_free(ht->tbl);
+	ht->being_destroyed = true;
+
+	mutex_lock(&ht->mutex);
+
+	cancel_delayed_work(&ht->run_work);
+	bucket_table_free(rht_dereference(ht->tbl, ht));
+
+	mutex_unlock(&ht->mutex);
 }
 EXPORT_SYMBOL_GPL(rhashtable_destroy);
 
@@ -570,13 +806,6 @@ EXPORT_SYMBOL_GPL(rhashtable_destroy);
 #define TEST_PTR	((void *) 0xdeadbeef)
 #define TEST_NEXPANDS	4
 
-#ifdef CONFIG_PROVE_LOCKING
-static int test_mutex_is_held(void *parent)
-{
-	return 1;
-}
-#endif
-
 struct test_obj {
 	void			*ptr;
 	int			value;
@@ -646,10 +875,10 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet)
 				i, tbl->buckets[i], cnt);
 	}
 
-	pr_info("  Traversal complete: counted=%u, nelems=%zu, entries=%d\n",
-		total, ht->nelems, TEST_ENTRIES);
+	pr_info("  Traversal complete: counted=%u, nelems=%u, entries=%d\n",
+		total, atomic_read(&ht->nelems), TEST_ENTRIES);
 
-	if (total != ht->nelems || total != TEST_ENTRIES)
+	if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES)
 		pr_warn("Test failed: Total count mismatch ^^^");
 }
 
@@ -688,7 +917,9 @@ static int __init test_rhashtable(struct rhashtable *ht)
 
 	for (i = 0; i < TEST_NEXPANDS; i++) {
 		pr_info("  Table expansion iteration %u...\n", i);
+		mutex_lock(&ht->mutex);
 		rhashtable_expand(ht);
+		mutex_unlock(&ht->mutex);
 
 		rcu_read_lock();
 		pr_info("  Verifying lookups...\n");
@@ -698,7 +929,9 @@ static int __init test_rhashtable(struct rhashtable *ht)
 
 	for (i = 0; i < TEST_NEXPANDS; i++) {
 		pr_info("  Table shrinkage iteration %u...\n", i);
+		mutex_lock(&ht->mutex);
 		rhashtable_shrink(ht);
+		mutex_unlock(&ht->mutex);
 
 		rcu_read_lock();
 		pr_info("  Verifying lookups...\n");
@@ -741,9 +974,6 @@ static int __init test_rht_init(void)
 		.key_offset = offsetof(struct test_obj, value),
 		.key_len = sizeof(int),
 		.hashfn = jhash,
-#ifdef CONFIG_PROVE_LOCKING
-		.mutex_is_held = &test_mutex_is_held,
-#endif
 		.grow_decision = rht_grow_above_75,
 		.shrink_decision = rht_shrink_below_30,
 	};
-- 
cgit v1.2.3


From f89bd6f87a53ce5a7d60662429591ebac2745c10 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 2 Jan 2015 23:00:21 +0100
Subject: rhashtable: Supports for nulls marker

In order to allow for wider usage of rhashtable, use a special nulls
marker to terminate each chain. The reason for not using the existing
nulls_list is that the prev pointer usage would not be valid as entries
can be linked in two different buckets at the same time.

The 4 nulls base bits can be set through the rhashtable_params structure
like this:

struct rhashtable_params params = {
        [...]
        .nulls_base = (1U << RHT_BASE_SHIFT),
};

This reduces the hash length from 32 bits to 27 bits.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 312e3437c7bc..cbad192d3b3d 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -28,6 +28,9 @@
 #define HASH_MIN_SIZE		4UL
 #define BUCKET_LOCKS_PER_CPU   128UL
 
+/* Base bits plus 1 bit for nulls marker */
+#define HASH_RESERVED_SPACE	(RHT_BASE_BITS + 1)
+
 enum {
 	RHT_LOCK_NORMAL,
 	RHT_LOCK_NESTED,
@@ -86,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
 		hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len,
 				    ht->p.hash_rnd);
 
-	return hash;
+	return hash >> HASH_RESERVED_SPACE;
 }
 
 static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
@@ -95,6 +98,7 @@ static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
 	u32 hash;
 
 	hash = ht->p.hashfn(key, len, ht->p.hash_rnd);
+	hash >>= HASH_RESERVED_SPACE;
 
 	return rht_bucket_index(tbl, hash);
 }
@@ -111,7 +115,7 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
 	struct rhash_head __rcu **pprev;
 
 	for (pprev = &tbl->buckets[n];
-	     rht_dereference_bucket(*pprev, tbl, n);
+	     !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n));
 	     pprev = &rht_dereference_bucket(*pprev, tbl, n)->next)
 		;
 
@@ -164,6 +168,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 {
 	struct bucket_table *tbl;
 	size_t size;
+	int i;
 
 	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
 	tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
@@ -180,6 +185,9 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 		return NULL;
 	}
 
+	for (i = 0; i < nbuckets; i++)
+		INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+
 	return tbl;
 }
 
@@ -221,7 +229,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	/* Old bucket empty, no work needed. */
 	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
 				   old_hash);
-	if (!p)
+	if (rht_is_a_nulls(p))
 		return;
 
 	new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
@@ -252,8 +260,8 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	/* Find the subsequent node which does hash to the same
 	 * bucket as node P, or NULL if no such node exists.
 	 */
-	next = NULL;
-	if (he) {
+	INIT_RHT_NULLS_HEAD(next, ht, old_hash);
+	if (!rht_is_a_nulls(he)) {
 		rht_for_each_continue(he, he->next, old_tbl, old_hash) {
 			if (head_hashfn(ht, new_tbl, he) == new_hash) {
 				next = he;
@@ -369,11 +377,15 @@ int rhashtable_expand(struct rhashtable *ht)
 		 */
 		complete = true;
 		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+			struct rhash_head *head;
+
 			old_bucket_lock = bucket_lock(old_tbl, old_hash);
 			spin_lock_bh(old_bucket_lock);
 
 			hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
-			if (old_tbl->buckets[old_hash] != NULL)
+			head = rht_dereference_bucket(old_tbl->buckets[old_hash],
+						      old_tbl, old_hash);
+			if (!rht_is_a_nulls(head))
 				complete = false;
 
 			spin_unlock_bh(old_bucket_lock);
@@ -498,6 +510,7 @@ static void rht_deferred_worker(struct work_struct *work)
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
 	struct bucket_table *tbl;
+	struct rhash_head *head;
 	spinlock_t *lock;
 	unsigned hash;
 
@@ -508,7 +521,12 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 	lock = bucket_lock(tbl, hash);
 
 	spin_lock_bh(lock);
-	RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
+	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+	if (rht_is_a_nulls(head))
+		INIT_RHT_NULLS_HEAD(obj->next, ht, hash);
+	else
+		RCU_INIT_POINTER(obj->next, head);
+
 	rcu_assign_pointer(tbl->buckets[hash], obj);
 	spin_unlock_bh(lock);
 
@@ -709,6 +727,7 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
  *	.key_offset = offsetof(struct test_obj, key),
  *	.key_len = sizeof(int),
  *	.hashfn = jhash,
+ *	.nulls_base = (1U << RHT_BASE_SHIFT),
  * };
  *
  * Configuration Example 2: Variable length keys
@@ -741,6 +760,9 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
 	    (!params->key_len && !params->obj_hashfn))
 		return -EINVAL;
 
+	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
+		return -EINVAL;
+
 	params->min_shift = max_t(size_t, params->min_shift,
 				  ilog2(HASH_MIN_SIZE));
 
@@ -974,6 +996,7 @@ static int __init test_rht_init(void)
 		.key_offset = offsetof(struct test_obj, value),
 		.key_len = sizeof(int),
 		.hashfn = jhash,
+		.nulls_base = (3U << RHT_BASE_SHIFT),
 		.grow_decision = rht_grow_above_75,
 		.shrink_decision = rht_shrink_below_30,
 	};
-- 
cgit v1.2.3


From efb975a67ea7846b966080f999589de570686aa0 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Wed, 7 Jan 2015 13:41:52 +0800
Subject: rhashtable: optimize rhashtable_lookup routine

Define an internal compare function and relevant compare argument,
and then make use of rhashtable_lookup_compare() to lookup key in
hash table, reducing duplicated code between rhashtable_lookup()
and rhashtable_lookup_compare().

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Cc: Thomas Graf <tgraf@suug.ch>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 41 ++++++++++++++++++-----------------------
 1 file changed, 18 insertions(+), 23 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index cbad192d3b3d..f2fdd7a7cb16 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -612,6 +612,19 @@ restart:
 }
 EXPORT_SYMBOL_GPL(rhashtable_remove);
 
+struct rhashtable_compare_arg {
+	struct rhashtable *ht;
+	const void *key;
+};
+
+static bool rhashtable_compare(void *ptr, void *arg)
+{
+	struct rhashtable_compare_arg *x = arg;
+	struct rhashtable *ht = x->ht;
+
+	return !memcmp(ptr + ht->p.key_offset, x->key, ht->p.key_len);
+}
+
 /**
  * rhashtable_lookup - lookup key in hash table
  * @ht:		hash table
@@ -627,32 +640,14 @@ EXPORT_SYMBOL_GPL(rhashtable_remove);
  */
 void *rhashtable_lookup(struct rhashtable *ht, const void *key)
 {
-	const struct bucket_table *tbl, *old_tbl;
-	struct rhash_head *he;
-	u32 hash;
+	struct rhashtable_compare_arg arg = {
+		.ht = ht,
+		.key = key,
+	};
 
 	BUG_ON(!ht->p.key_len);
 
-	rcu_read_lock();
-	old_tbl = rht_dereference_rcu(ht->tbl, ht);
-	tbl = rht_dereference_rcu(ht->future_tbl, ht);
-	hash = key_hashfn(ht, key, ht->p.key_len);
-restart:
-	rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
-		if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
-			   ht->p.key_len))
-			continue;
-		rcu_read_unlock();
-		return rht_obj(ht, he);
-	}
-
-	if (unlikely(tbl != old_tbl)) {
-		tbl = old_tbl;
-		goto restart;
-	}
-
-	rcu_read_unlock();
-	return NULL;
+	return rhashtable_lookup_compare(ht, key, &rhashtable_compare, &arg);
 }
 EXPORT_SYMBOL_GPL(rhashtable_lookup);
 
-- 
cgit v1.2.3


From 54c5b7d311c8e1801f9dcce9f388a7420a25fa90 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Wed, 7 Jan 2015 13:41:53 +0800
Subject: rhashtable: introduce rhashtable_wakeup_worker helper function

Introduce rhashtable_wakeup_worker() helper function to reduce
duplicated code where to wake up worker.

By the way, as long as the both "future_tbl" and "tbl" bucket table
pointers point to the same bucket array, we should try to wake up
the resizing worker thread, otherwise, it indicates the work of
resizing hash table is not finished yet. However, currently we will
wake up the worker thread only when the two pointers point to
different bucket array. Obviously this is wrong. So, the issue is
also fixed as well in the patch.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Cc: Thomas Graf <tgraf@suug.ch>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index f2fdd7a7cb16..20006854fce0 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -492,6 +492,19 @@ static void rht_deferred_worker(struct work_struct *work)
 	mutex_unlock(&ht->mutex);
 }
 
+static void rhashtable_wakeup_worker(struct rhashtable *ht)
+{
+	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+	struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	size_t size = tbl->size;
+
+	/* Only adjust the table if no resizing is currently in progress. */
+	if (tbl == new_tbl &&
+	    ((ht->p.grow_decision && ht->p.grow_decision(ht, size)) ||
+	     (ht->p.shrink_decision && ht->p.shrink_decision(ht, size))))
+		schedule_delayed_work(&ht->run_work, 0);
+}
+
 /**
  * rhashtable_insert - insert object into hash hash table
  * @ht:		hash table
@@ -532,10 +545,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 
 	atomic_inc(&ht->nelems);
 
-	/* Only grow the table if no resizing is currently in progress. */
-	if (ht->tbl != ht->future_tbl &&
-	    ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
-		schedule_delayed_work(&ht->run_work, 0);
+	rhashtable_wakeup_worker(ht);
 
 	rcu_read_unlock();
 }
@@ -584,10 +594,7 @@ restart:
 
 		spin_unlock_bh(lock);
 
-		if (ht->tbl != ht->future_tbl &&
-		    ht->p.shrink_decision &&
-		    ht->p.shrink_decision(ht, tbl->size))
-			schedule_delayed_work(&ht->run_work, 0);
+		rhashtable_wakeup_worker(ht);
 
 		rcu_read_unlock();
 
-- 
cgit v1.2.3


From db30485408326a6f466a843b291b23535f63eda0 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Wed, 7 Jan 2015 13:41:54 +0800
Subject: rhashtable: involve rhashtable_lookup_insert routine

Involve a new function called rhashtable_lookup_insert() which makes
lookup and insertion atomic under bucket lock protection, helping us
avoid to introduce an extra lock when we search and insert an object
into hash table.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 82 insertions(+), 15 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 20006854fce0..4430233c4e11 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -505,8 +505,26 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht)
 		schedule_delayed_work(&ht->run_work, 0);
 }
 
+static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
+				struct bucket_table *tbl, u32 hash)
+{
+	struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash],
+							 tbl, hash);
+
+	if (rht_is_a_nulls(head))
+		INIT_RHT_NULLS_HEAD(obj->next, ht, hash);
+	else
+		RCU_INIT_POINTER(obj->next, head);
+
+	rcu_assign_pointer(tbl->buckets[hash], obj);
+
+	atomic_inc(&ht->nelems);
+
+	rhashtable_wakeup_worker(ht);
+}
+
 /**
- * rhashtable_insert - insert object into hash hash table
+ * rhashtable_insert - insert object into hash table
  * @ht:		hash table
  * @obj:	pointer to hash head inside object
  *
@@ -523,7 +541,6 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht)
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
 	struct bucket_table *tbl;
-	struct rhash_head *head;
 	spinlock_t *lock;
 	unsigned hash;
 
@@ -534,19 +551,9 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 	lock = bucket_lock(tbl, hash);
 
 	spin_lock_bh(lock);
-	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
-	if (rht_is_a_nulls(head))
-		INIT_RHT_NULLS_HEAD(obj->next, ht, hash);
-	else
-		RCU_INIT_POINTER(obj->next, head);
-
-	rcu_assign_pointer(tbl->buckets[hash], obj);
+	__rhashtable_insert(ht, obj, tbl, hash);
 	spin_unlock_bh(lock);
 
-	atomic_inc(&ht->nelems);
-
-	rhashtable_wakeup_worker(ht);
-
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rhashtable_insert);
@@ -560,7 +567,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  * walk the bucket chain upon removal. The removal operation is thus
  * considerable slow if the hash table is not correctly sized.
  *
- * Will automatically shrink the table via rhashtable_expand() if the the
+ * Will automatically shrink the table via rhashtable_expand() if the
  * shrink_decision function specified at rhashtable_init() returns true.
  *
  * The caller must ensure that no concurrent table mutations occur. It is
@@ -641,7 +648,7 @@ static bool rhashtable_compare(void *ptr, void *arg)
  * for a entry with an identical key. The first matching entry is returned.
  *
  * This lookup function may only be used for fixed key hash table (key_len
- * paramter set). It will BUG() if used inappropriately.
+ * parameter set). It will BUG() if used inappropriately.
  *
  * Lookups may occur in parallel with hashtable mutations and resizing.
  */
@@ -702,6 +709,66 @@ restart:
 }
 EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
 
+/**
+ * rhashtable_lookup_insert - lookup and insert object into hash table
+ * @ht:		hash table
+ * @obj:	pointer to hash head inside object
+ *
+ * Locks down the bucket chain in both the old and new table if a resize
+ * is in progress to ensure that writers can't remove from the old table
+ * and can't insert to the new table during the atomic operation of search
+ * and insertion. Searches for duplicates in both the old and new table if
+ * a resize is in progress.
+ *
+ * This lookup function may only be used for fixed key hash table (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if the size grows
+ * beyond the watermark indicated by grow_decision() which can be passed
+ * to rhashtable_init().
+ */
+bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj)
+{
+	struct bucket_table *new_tbl, *old_tbl;
+	spinlock_t *new_bucket_lock, *old_bucket_lock;
+	u32 new_hash, old_hash;
+	bool success = true;
+
+	BUG_ON(!ht->p.key_len);
+
+	rcu_read_lock();
+
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	old_hash = head_hashfn(ht, old_tbl, obj);
+	old_bucket_lock = bucket_lock(old_tbl, old_hash);
+	spin_lock_bh(old_bucket_lock);
+
+	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	new_hash = head_hashfn(ht, new_tbl, obj);
+	new_bucket_lock = bucket_lock(new_tbl, new_hash);
+	if (unlikely(old_tbl != new_tbl))
+		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+
+	if (rhashtable_lookup(ht, rht_obj(ht, obj) + ht->p.key_offset)) {
+		success = false;
+		goto exit;
+	}
+
+	__rhashtable_insert(ht, obj, new_tbl, new_hash);
+
+exit:
+	if (unlikely(old_tbl != new_tbl))
+		spin_unlock_bh(new_bucket_lock);
+	spin_unlock_bh(old_bucket_lock);
+
+	rcu_read_unlock();
+
+	return success;
+}
+EXPORT_SYMBOL_GPL(rhashtable_lookup_insert);
+
 static size_t rounded_hashtable_size(struct rhashtable_params *params)
 {
 	return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
-- 
cgit v1.2.3


From bd6d4db552ceb52fb19890a454836dcda59743ce Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Wed, 7 Jan 2015 13:41:55 +0800
Subject: rhashtable: future table needs to be traversed when remove an object

When remove an object from hash table, we currently only traverse old
bucket table to check whether the object exists. If the object is not
found in it, we will try again. But in the second search loop, we still
search the object from the old table instead of future table. As a
result, the object may be not removed from hash table especially when
resizing is currently in progress and the object is just saved in the
future table.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Cc: Thomas Graf <tgraf@suug.ch>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 4430233c4e11..1aef942976fe 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -608,10 +608,10 @@ restart:
 		return true;
 	}
 
-	if (tbl != rht_dereference_rcu(ht->tbl, ht)) {
+	if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) {
 		spin_unlock_bh(lock);
 
-		tbl = rht_dereference_rcu(ht->tbl, ht);
+		tbl = rht_dereference_rcu(ht->future_tbl, ht);
 		hash = head_hashfn(ht, tbl, obj);
 
 		lock = bucket_lock(tbl, hash);
-- 
cgit v1.2.3


From c0c09bfdc4150b3918526660768585cd477adf35 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Wed, 7 Jan 2015 13:41:56 +0800
Subject: rhashtable: avoid unnecessary wakeup for worker queue

Move condition statements of verifying whether hash table size exceeds
its maximum threshold or reaches its minimum threshold from resizing
functions to resizing decision functions, avoiding unnecessary wakeup
for worker queue thread.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Cc: Thomas Graf <tgraf@suug.ch>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 1aef942976fe..7fb474b18f1b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -199,7 +199,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size)
 {
 	/* Expand table when exceeding 75% load */
-	return atomic_read(&ht->nelems) > (new_size / 4 * 3);
+	return atomic_read(&ht->nelems) > (new_size / 4 * 3) &&
+	       (ht->p.max_shift && atomic_read(&ht->shift) < ht->p.max_shift);
 }
 EXPORT_SYMBOL_GPL(rht_grow_above_75);
 
@@ -211,7 +212,8 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75);
 bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
 {
 	/* Shrink table beneath 30% load */
-	return atomic_read(&ht->nelems) < (new_size * 3 / 10);
+	return atomic_read(&ht->nelems) < (new_size * 3 / 10) &&
+	       (atomic_read(&ht->shift) > ht->p.min_shift);
 }
 EXPORT_SYMBOL_GPL(rht_shrink_below_30);
 
@@ -318,14 +320,11 @@ int rhashtable_expand(struct rhashtable *ht)
 
 	ASSERT_RHT_MUTEX(ht);
 
-	if (ht->p.max_shift && ht->shift >= ht->p.max_shift)
-		return 0;
-
 	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2);
 	if (new_tbl == NULL)
 		return -ENOMEM;
 
-	ht->shift++;
+	atomic_inc(&ht->shift);
 
 	/* Make insertions go into the new, empty table right away. Deletions
 	 * and lookups will be attempted in both tables until we synchronize.
@@ -421,9 +420,6 @@ int rhashtable_shrink(struct rhashtable *ht)
 
 	ASSERT_RHT_MUTEX(ht);
 
-	if (ht->shift <= ht->p.min_shift)
-		return 0;
-
 	new_tbl = bucket_table_alloc(ht, tbl->size / 2);
 	if (new_tbl == NULL)
 		return -ENOMEM;
@@ -462,7 +458,7 @@ int rhashtable_shrink(struct rhashtable *ht)
 
 	/* Publish the new, valid hash table */
 	rcu_assign_pointer(ht->tbl, new_tbl);
-	ht->shift--;
+	atomic_dec(&ht->shift);
 
 	/* Wait for readers. No new readers will have references to the
 	 * old hash table.
@@ -851,7 +847,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
 	if (tbl == NULL)
 		return -ENOMEM;
 
-	ht->shift = ilog2(tbl->size);
+	atomic_set(&ht->shift, ilog2(tbl->size));
 	RCU_INIT_POINTER(ht->tbl, tbl);
 	RCU_INIT_POINTER(ht->future_tbl, tbl);
 
-- 
cgit v1.2.3


From 545a148e43bed67618cc90b66f9864fba0878890 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Wed, 7 Jan 2015 13:41:57 +0800
Subject: rhashtable: initialize atomic nelems variable

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Cc: Thomas Graf <tgraf@suug.ch>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 7fb474b18f1b..8023b554905c 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -847,6 +847,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
 	if (tbl == NULL)
 		return -ENOMEM;
 
+	atomic_set(&ht->nelems, 0);
 	atomic_set(&ht->shift, ilog2(tbl->size));
 	RCU_INIT_POINTER(ht->tbl, tbl);
 	RCU_INIT_POINTER(ht->future_tbl, tbl);
-- 
cgit v1.2.3


From 7a868d1e9ab3c534c5ad44e3e5dc46753a1e5636 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Mon, 12 Jan 2015 14:52:22 +0800
Subject: rhashtable: involve rhashtable_lookup_compare_insert routine

Introduce a new function called rhashtable_lookup_compare_insert()
which is very similar to rhashtable_lookup_insert(). But the former
makes use of users' given compare function to look for an object,
and then inserts it into hash table if found. As the entire process
of search and insertion is under protection of per bucket lock, this
can help users to avoid the involvement of extra lock.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Cc: Thomas Graf <tgraf@suug.ch>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 8023b554905c..ed6ae1ad304c 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -726,6 +726,43 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
  * to rhashtable_init().
  */
 bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj)
+{
+	struct rhashtable_compare_arg arg = {
+		.ht = ht,
+		.key = rht_obj(ht, obj) + ht->p.key_offset,
+	};
+
+	BUG_ON(!ht->p.key_len);
+
+	return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare,
+						&arg);
+}
+EXPORT_SYMBOL_GPL(rhashtable_lookup_insert);
+
+/**
+ * rhashtable_lookup_compare_insert - search and insert object to hash table
+ *                                    with compare function
+ * @ht:		hash table
+ * @obj:	pointer to hash head inside object
+ * @compare:	compare function, must return true on match
+ * @arg:	argument passed on to compare function
+ *
+ * Locks down the bucket chain in both the old and new table if a resize
+ * is in progress to ensure that writers can't remove from the old table
+ * and can't insert to the new table during the atomic operation of search
+ * and insertion. Searches for duplicates in both the old and new table if
+ * a resize is in progress.
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resizing if the size grows
+ * beyond the watermark indicated by grow_decision() which can be passed
+ * to rhashtable_init().
+ */
+bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
+				      struct rhash_head *obj,
+				      bool (*compare)(void *, void *),
+				      void *arg)
 {
 	struct bucket_table *new_tbl, *old_tbl;
 	spinlock_t *new_bucket_lock, *old_bucket_lock;
@@ -747,7 +784,8 @@ bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj)
 	if (unlikely(old_tbl != new_tbl))
 		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
 
-	if (rhashtable_lookup(ht, rht_obj(ht, obj) + ht->p.key_offset)) {
+	if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset,
+				      compare, arg)) {
 		success = false;
 		goto exit;
 	}
@@ -763,7 +801,7 @@ exit:
 
 	return success;
 }
-EXPORT_SYMBOL_GPL(rhashtable_lookup_insert);
+EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert);
 
 static size_t rounded_hashtable_size(struct rhashtable_params *params)
 {
-- 
cgit v1.2.3


From 80ca8c3a84c74a87977558861bb8eef650732912 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Mon, 12 Jan 2015 23:58:21 +0000
Subject: rhashtable: Lower/upper bucket may map to same lock while shrinking

Each per bucket lock covers a configurable number of buckets. While
shrinking, two buckets in the old table contain entries for a single
bucket in the new table. We need to lock down both while linking.
Check if they are protected by different locks to avoid a recursive
lock.

Fixes: 97defe1e ("rhashtable: Per bucket locks & deferred expansion/shrinking")
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index ed6ae1ad304c..aca699813ba9 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -443,8 +443,16 @@ int rhashtable_shrink(struct rhashtable *ht)
 		new_bucket_lock = bucket_lock(new_tbl, new_hash);
 
 		spin_lock_bh(old_bucket_lock1);
-		spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
-		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
+
+		/* Depending on the lock per buckets mapping, the bucket in
+		 * the lower and upper region may map to the same lock.
+		 */
+		if (old_bucket_lock1 != old_bucket_lock2) {
+			spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
+			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
+		} else {
+			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+		}
 
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash]);
@@ -452,7 +460,8 @@ int rhashtable_shrink(struct rhashtable *ht)
 				   tbl->buckets[new_hash + new_tbl->size]);
 
 		spin_unlock_bh(new_bucket_lock);
-		spin_unlock_bh(old_bucket_lock2);
+		if (old_bucket_lock1 != old_bucket_lock2)
+			spin_unlock_bh(old_bucket_lock2);
 		spin_unlock_bh(old_bucket_lock1);
 	}
 
-- 
cgit v1.2.3


From 57699a40b4f2694d3ee63fd5e6465ec8f600b620 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Fri, 16 Jan 2015 11:13:09 +0800
Subject: rhashtable: Fix race in rhashtable_destroy() and use regular
 work_struct

When we put our declared work task in the global workqueue with
schedule_delayed_work(), its delay parameter is always zero.
Therefore, we should define a regular work in rhashtable structure
instead of a delayed work.

By the way, we add a condition to check whether resizing functions
are NULL before cancelling the work, avoiding to cancel an
uninitialized work.

Lastly, while we wait for all work items we submitted before to run
to completion with cancel_delayed_work(), ht->mutex has been taken in
rhashtable_destroy(). Moreover, cancel_delayed_work() doesn't return
until all work items are accomplished, and when work items are
scheduled, the work's function - rht_deferred_worker() will be called.
However, as rht_deferred_worker() also needs to acquire the lock,
deadlock might happen at the moment as the lock is already held before.
So if the cancel work function is moved out of the lock covered scope,
this will avoid the deadlock.

Fixes: 97defe1 ("rhashtable: Per bucket locks & deferred expansion/shrinking")
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Cc: Thomas Graf <tgraf@suug.ch>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index aca699813ba9..84a78e396a56 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -485,7 +485,7 @@ static void rht_deferred_worker(struct work_struct *work)
 	struct rhashtable *ht;
 	struct bucket_table *tbl;
 
-	ht = container_of(work, struct rhashtable, run_work.work);
+	ht = container_of(work, struct rhashtable, run_work);
 	mutex_lock(&ht->mutex);
 	tbl = rht_dereference(ht->tbl, ht);
 
@@ -507,7 +507,7 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht)
 	if (tbl == new_tbl &&
 	    ((ht->p.grow_decision && ht->p.grow_decision(ht, size)) ||
 	     (ht->p.shrink_decision && ht->p.shrink_decision(ht, size))))
-		schedule_delayed_work(&ht->run_work, 0);
+		schedule_work(&ht->run_work);
 }
 
 static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
@@ -903,7 +903,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
 		get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
 
 	if (ht->p.grow_decision || ht->p.shrink_decision)
-		INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker);
+		INIT_WORK(&ht->run_work, rht_deferred_worker);
 
 	return 0;
 }
@@ -921,11 +921,11 @@ void rhashtable_destroy(struct rhashtable *ht)
 {
 	ht->being_destroyed = true;
 
-	mutex_lock(&ht->mutex);
+	if (ht->p.grow_decision || ht->p.shrink_decision)
+		cancel_work_sync(&ht->run_work);
 
-	cancel_delayed_work(&ht->run_work);
+	mutex_lock(&ht->mutex);
 	bucket_table_free(rht_dereference(ht->tbl, ht));
-
 	mutex_unlock(&ht->mutex);
 }
 EXPORT_SYMBOL_GPL(rhashtable_destroy);
-- 
cgit v1.2.3


From fe6a043c535acfec8f8e554536c87923dcb45097 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 21 Jan 2015 11:54:01 +0000
Subject: rhashtable: rhashtable_remove() must unlink in both tbl and
 future_tbl

As removals can occur during resizes, entries may be referred to from
both tbl and future_tbl when the removal is requested. Therefore
rhashtable_remove() must unlink the entry in both tables if this is
the case. The existing code did search both tables but stopped when it
hit the first match.

Failing to unlink in both tables resulted in use after free.

Fixes: 97defe1ecf86 ("rhashtable: Per bucket locks & deferred expansion/shrinking")
Reported-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 84a78e396a56..bc2d0d80d1f9 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -585,6 +585,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 	struct rhash_head *he;
 	spinlock_t *lock;
 	unsigned int hash;
+	bool ret = false;
 
 	rcu_read_lock();
 	tbl = rht_dereference_rcu(ht->tbl, ht);
@@ -602,17 +603,16 @@ restart:
 		}
 
 		rcu_assign_pointer(*pprev, obj->next);
-		atomic_dec(&ht->nelems);
-
-		spin_unlock_bh(lock);
-
-		rhashtable_wakeup_worker(ht);
-
-		rcu_read_unlock();
 
-		return true;
+		ret = true;
+		break;
 	}
 
+	/* The entry may be linked in either 'tbl', 'future_tbl', or both.
+	 * 'future_tbl' only exists for a short period of time during
+	 * resizing. Thus traversing both is fine and the added cost is
+	 * very rare.
+	 */
 	if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) {
 		spin_unlock_bh(lock);
 
@@ -625,9 +625,15 @@ restart:
 	}
 
 	spin_unlock_bh(lock);
+
+	if (ret) {
+		atomic_dec(&ht->nelems);
+		rhashtable_wakeup_worker(ht);
+	}
+
 	rcu_read_unlock();
 
-	return false;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(rhashtable_remove);
 
-- 
cgit v1.2.3


From 9d6dbe1bbaf8bf03804c164fb67a98646bf2f622 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 29 Jan 2015 15:40:25 +0100
Subject: rhashtable: Make selftest modular

Allow the selftest on the resizable hash table to be built modular, just
like all other tests that do not depend on DEBUG_KERNEL.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/Kconfig.debug     |   2 +-
 lib/Makefile          |   1 +
 lib/rhashtable.c      | 205 ---------------------------------------------
 lib/test_rhashtable.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 229 insertions(+), 206 deletions(-)
 create mode 100644 lib/test_rhashtable.c

(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5f2ce616c046..a8f3c9993229 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1586,7 +1586,7 @@ config TEST_KSTRTOX
 	tristate "Test kstrto*() family of functions at runtime"
 
 config TEST_RHASHTABLE
-	bool "Perform selftest on resizable hash table"
+	tristate "Perform selftest on resizable hash table"
 	default n
 	help
 	  Enable this option to test the rhashtable functions at boot.
diff --git a/lib/Makefile b/lib/Makefile
index 3c3b30b9e020..a8cf98d14199 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TEST_LKM) += test_module.o
 obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
+obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index bc2d0d80d1f9..c41e21096373 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -935,208 +935,3 @@ void rhashtable_destroy(struct rhashtable *ht)
 	mutex_unlock(&ht->mutex);
 }
 EXPORT_SYMBOL_GPL(rhashtable_destroy);
-
-/**************************************************************************
- * Self Test
- **************************************************************************/
-
-#ifdef CONFIG_TEST_RHASHTABLE
-
-#define TEST_HT_SIZE	8
-#define TEST_ENTRIES	2048
-#define TEST_PTR	((void *) 0xdeadbeef)
-#define TEST_NEXPANDS	4
-
-struct test_obj {
-	void			*ptr;
-	int			value;
-	struct rhash_head	node;
-};
-
-static int __init test_rht_lookup(struct rhashtable *ht)
-{
-	unsigned int i;
-
-	for (i = 0; i < TEST_ENTRIES * 2; i++) {
-		struct test_obj *obj;
-		bool expected = !(i % 2);
-		u32 key = i;
-
-		obj = rhashtable_lookup(ht, &key);
-
-		if (expected && !obj) {
-			pr_warn("Test failed: Could not find key %u\n", key);
-			return -ENOENT;
-		} else if (!expected && obj) {
-			pr_warn("Test failed: Unexpected entry found for key %u\n",
-				key);
-			return -EEXIST;
-		} else if (expected && obj) {
-			if (obj->ptr != TEST_PTR || obj->value != i) {
-				pr_warn("Test failed: Lookup value mismatch %p!=%p, %u!=%u\n",
-					obj->ptr, TEST_PTR, obj->value, i);
-				return -EINVAL;
-			}
-		}
-	}
-
-	return 0;
-}
-
-static void test_bucket_stats(struct rhashtable *ht, bool quiet)
-{
-	unsigned int cnt, rcu_cnt, i, total = 0;
-	struct rhash_head *pos;
-	struct test_obj *obj;
-	struct bucket_table *tbl;
-
-	tbl = rht_dereference_rcu(ht->tbl, ht);
-	for (i = 0; i < tbl->size; i++) {
-		rcu_cnt = cnt = 0;
-
-		if (!quiet)
-			pr_info(" [%#4x/%zu]", i, tbl->size);
-
-		rht_for_each_entry_rcu(obj, pos, tbl, i, node) {
-			cnt++;
-			total++;
-			if (!quiet)
-				pr_cont(" [%p],", obj);
-		}
-
-		rht_for_each_entry_rcu(obj, pos, tbl, i, node)
-			rcu_cnt++;
-
-		if (rcu_cnt != cnt)
-			pr_warn("Test failed: Chain count mismach %d != %d",
-				cnt, rcu_cnt);
-
-		if (!quiet)
-			pr_cont("\n  [%#x] first element: %p, chain length: %u\n",
-				i, tbl->buckets[i], cnt);
-	}
-
-	pr_info("  Traversal complete: counted=%u, nelems=%u, entries=%d\n",
-		total, atomic_read(&ht->nelems), TEST_ENTRIES);
-
-	if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES)
-		pr_warn("Test failed: Total count mismatch ^^^");
-}
-
-static int __init test_rhashtable(struct rhashtable *ht)
-{
-	struct bucket_table *tbl;
-	struct test_obj *obj;
-	struct rhash_head *pos, *next;
-	int err;
-	unsigned int i;
-
-	/*
-	 * Insertion Test:
-	 * Insert TEST_ENTRIES into table with all keys even numbers
-	 */
-	pr_info("  Adding %d keys\n", TEST_ENTRIES);
-	for (i = 0; i < TEST_ENTRIES; i++) {
-		struct test_obj *obj;
-
-		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
-		if (!obj) {
-			err = -ENOMEM;
-			goto error;
-		}
-
-		obj->ptr = TEST_PTR;
-		obj->value = i * 2;
-
-		rhashtable_insert(ht, &obj->node);
-	}
-
-	rcu_read_lock();
-	test_bucket_stats(ht, true);
-	test_rht_lookup(ht);
-	rcu_read_unlock();
-
-	for (i = 0; i < TEST_NEXPANDS; i++) {
-		pr_info("  Table expansion iteration %u...\n", i);
-		mutex_lock(&ht->mutex);
-		rhashtable_expand(ht);
-		mutex_unlock(&ht->mutex);
-
-		rcu_read_lock();
-		pr_info("  Verifying lookups...\n");
-		test_rht_lookup(ht);
-		rcu_read_unlock();
-	}
-
-	for (i = 0; i < TEST_NEXPANDS; i++) {
-		pr_info("  Table shrinkage iteration %u...\n", i);
-		mutex_lock(&ht->mutex);
-		rhashtable_shrink(ht);
-		mutex_unlock(&ht->mutex);
-
-		rcu_read_lock();
-		pr_info("  Verifying lookups...\n");
-		test_rht_lookup(ht);
-		rcu_read_unlock();
-	}
-
-	rcu_read_lock();
-	test_bucket_stats(ht, true);
-	rcu_read_unlock();
-
-	pr_info("  Deleting %d keys\n", TEST_ENTRIES);
-	for (i = 0; i < TEST_ENTRIES; i++) {
-		u32 key = i * 2;
-
-		obj = rhashtable_lookup(ht, &key);
-		BUG_ON(!obj);
-
-		rhashtable_remove(ht, &obj->node);
-		kfree(obj);
-	}
-
-	return 0;
-
-error:
-	tbl = rht_dereference_rcu(ht->tbl, ht);
-	for (i = 0; i < tbl->size; i++)
-		rht_for_each_entry_safe(obj, pos, next, tbl, i, node)
-			kfree(obj);
-
-	return err;
-}
-
-static int __init test_rht_init(void)
-{
-	struct rhashtable ht;
-	struct rhashtable_params params = {
-		.nelem_hint = TEST_HT_SIZE,
-		.head_offset = offsetof(struct test_obj, node),
-		.key_offset = offsetof(struct test_obj, value),
-		.key_len = sizeof(int),
-		.hashfn = jhash,
-		.nulls_base = (3U << RHT_BASE_SHIFT),
-		.grow_decision = rht_grow_above_75,
-		.shrink_decision = rht_shrink_below_30,
-	};
-	int err;
-
-	pr_info("Running resizable hashtable tests...\n");
-
-	err = rhashtable_init(&ht, &params);
-	if (err < 0) {
-		pr_warn("Test failed: Unable to initialize hashtable: %d\n",
-			err);
-		return err;
-	}
-
-	err = test_rhashtable(&ht);
-
-	rhashtable_destroy(&ht);
-
-	return err;
-}
-
-subsys_initcall(test_rht_init);
-
-#endif /* CONFIG_TEST_RHASHTABLE */
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
new file mode 100644
index 000000000000..1dfeba73fc74
--- /dev/null
+++ b/lib/test_rhashtable.c
@@ -0,0 +1,227 @@
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the following paper:
+ * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf
+ *
+ * Code partially derived from nft_hash
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/**************************************************************************
+ * Self Test
+ **************************************************************************/
+
+#include <linux/init.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/rhashtable.h>
+#include <linux/slab.h>
+
+
+#define TEST_HT_SIZE	8
+#define TEST_ENTRIES	2048
+#define TEST_PTR	((void *) 0xdeadbeef)
+#define TEST_NEXPANDS	4
+
+struct test_obj {
+	void			*ptr;
+	int			value;
+	struct rhash_head	node;
+};
+
+static int __init test_rht_lookup(struct rhashtable *ht)
+{
+	unsigned int i;
+
+	for (i = 0; i < TEST_ENTRIES * 2; i++) {
+		struct test_obj *obj;
+		bool expected = !(i % 2);
+		u32 key = i;
+
+		obj = rhashtable_lookup(ht, &key);
+
+		if (expected && !obj) {
+			pr_warn("Test failed: Could not find key %u\n", key);
+			return -ENOENT;
+		} else if (!expected && obj) {
+			pr_warn("Test failed: Unexpected entry found for key %u\n",
+				key);
+			return -EEXIST;
+		} else if (expected && obj) {
+			if (obj->ptr != TEST_PTR || obj->value != i) {
+				pr_warn("Test failed: Lookup value mismatch %p!=%p, %u!=%u\n",
+					obj->ptr, TEST_PTR, obj->value, i);
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void test_bucket_stats(struct rhashtable *ht, bool quiet)
+{
+	unsigned int cnt, rcu_cnt, i, total = 0;
+	struct rhash_head *pos;
+	struct test_obj *obj;
+	struct bucket_table *tbl;
+
+	tbl = rht_dereference_rcu(ht->tbl, ht);
+	for (i = 0; i < tbl->size; i++) {
+		rcu_cnt = cnt = 0;
+
+		if (!quiet)
+			pr_info(" [%#4x/%zu]", i, tbl->size);
+
+		rht_for_each_entry_rcu(obj, pos, tbl, i, node) {
+			cnt++;
+			total++;
+			if (!quiet)
+				pr_cont(" [%p],", obj);
+		}
+
+		rht_for_each_entry_rcu(obj, pos, tbl, i, node)
+			rcu_cnt++;
+
+		if (rcu_cnt != cnt)
+			pr_warn("Test failed: Chain count mismach %d != %d",
+				cnt, rcu_cnt);
+
+		if (!quiet)
+			pr_cont("\n  [%#x] first element: %p, chain length: %u\n",
+				i, tbl->buckets[i], cnt);
+	}
+
+	pr_info("  Traversal complete: counted=%u, nelems=%u, entries=%d\n",
+		total, atomic_read(&ht->nelems), TEST_ENTRIES);
+
+	if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES)
+		pr_warn("Test failed: Total count mismatch ^^^");
+}
+
+static int __init test_rhashtable(struct rhashtable *ht)
+{
+	struct bucket_table *tbl;
+	struct test_obj *obj;
+	struct rhash_head *pos, *next;
+	int err;
+	unsigned int i;
+
+	/*
+	 * Insertion Test:
+	 * Insert TEST_ENTRIES into table with all keys even numbers
+	 */
+	pr_info("  Adding %d keys\n", TEST_ENTRIES);
+	for (i = 0; i < TEST_ENTRIES; i++) {
+		struct test_obj *obj;
+
+		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+		if (!obj) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		obj->ptr = TEST_PTR;
+		obj->value = i * 2;
+
+		rhashtable_insert(ht, &obj->node);
+	}
+
+	rcu_read_lock();
+	test_bucket_stats(ht, true);
+	test_rht_lookup(ht);
+	rcu_read_unlock();
+
+	for (i = 0; i < TEST_NEXPANDS; i++) {
+		pr_info("  Table expansion iteration %u...\n", i);
+		mutex_lock(&ht->mutex);
+		rhashtable_expand(ht);
+		mutex_unlock(&ht->mutex);
+
+		rcu_read_lock();
+		pr_info("  Verifying lookups...\n");
+		test_rht_lookup(ht);
+		rcu_read_unlock();
+	}
+
+	for (i = 0; i < TEST_NEXPANDS; i++) {
+		pr_info("  Table shrinkage iteration %u...\n", i);
+		mutex_lock(&ht->mutex);
+		rhashtable_shrink(ht);
+		mutex_unlock(&ht->mutex);
+
+		rcu_read_lock();
+		pr_info("  Verifying lookups...\n");
+		test_rht_lookup(ht);
+		rcu_read_unlock();
+	}
+
+	rcu_read_lock();
+	test_bucket_stats(ht, true);
+	rcu_read_unlock();
+
+	pr_info("  Deleting %d keys\n", TEST_ENTRIES);
+	for (i = 0; i < TEST_ENTRIES; i++) {
+		u32 key = i * 2;
+
+		obj = rhashtable_lookup(ht, &key);
+		BUG_ON(!obj);
+
+		rhashtable_remove(ht, &obj->node);
+		kfree(obj);
+	}
+
+	return 0;
+
+error:
+	tbl = rht_dereference_rcu(ht->tbl, ht);
+	for (i = 0; i < tbl->size; i++)
+		rht_for_each_entry_safe(obj, pos, next, tbl, i, node)
+			kfree(obj);
+
+	return err;
+}
+
+static int __init test_rht_init(void)
+{
+	struct rhashtable ht;
+	struct rhashtable_params params = {
+		.nelem_hint = TEST_HT_SIZE,
+		.head_offset = offsetof(struct test_obj, node),
+		.key_offset = offsetof(struct test_obj, value),
+		.key_len = sizeof(int),
+		.hashfn = jhash,
+		.nulls_base = (3U << RHT_BASE_SHIFT),
+		.grow_decision = rht_grow_above_75,
+		.shrink_decision = rht_shrink_below_30,
+	};
+	int err;
+
+	pr_info("Running resizable hashtable tests...\n");
+
+	err = rhashtable_init(&ht, &params);
+	if (err < 0) {
+		pr_warn("Test failed: Unable to initialize hashtable: %d\n",
+			err);
+		return err;
+	}
+
+	err = test_rhashtable(&ht);
+
+	rhashtable_destroy(&ht);
+
+	return err;
+}
+
+module_init(test_rht_init);
+
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From aad9a1cec7dcd1d45809b64643fce37061b17788 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2014 14:49:01 -0500
Subject: vhost: switch vhost get_indirect() to iov_iter, kill
 memcpy_fromiovec()

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 lib/iovec.c | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'lib')

diff --git a/lib/iovec.c b/lib/iovec.c
index 2d99cb4a5006..4a90875c64ae 100644
--- a/lib/iovec.c
+++ b/lib/iovec.c
@@ -2,31 +2,6 @@
 #include <linux/export.h>
 #include <linux/uio.h>
 
-/*
- *	Copy iovec to kernel. Returns -EFAULT on error.
- *
- *	Note: this modifies the original iovec.
- */
-
-int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
-{
-	while (len > 0) {
-		if (iov->iov_len) {
-			int copy = min_t(unsigned int, len, iov->iov_len);
-			if (copy_from_user(kdata, iov->iov_base, copy))
-				return -EFAULT;
-			len -= copy;
-			kdata += copy;
-			iov->iov_base += copy;
-			iov->iov_len -= copy;
-		}
-		iov++;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(memcpy_fromiovec);
-
 /*
  *	Copy kernel to iovec. Returns -EFAULT on error.
  */
-- 
cgit v1.2.3


From ba7438aed924133df54a60e4cd5499d359bcf2a8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2014 15:51:28 -0500
Subject: vhost: don't bother copying iovecs in handle_rx(), kill
 memcpy_toiovecend()

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 lib/iovec.c | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'lib')

diff --git a/lib/iovec.c b/lib/iovec.c
index 4a90875c64ae..d8f17a9b1ccf 100644
--- a/lib/iovec.c
+++ b/lib/iovec.c
@@ -2,32 +2,6 @@
 #include <linux/export.h>
 #include <linux/uio.h>
 
-/*
- *	Copy kernel to iovec. Returns -EFAULT on error.
- */
-
-int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
-		      int offset, int len)
-{
-	int copy;
-	for (; len > 0; ++iov) {
-		/* Skip over the finished iovecs */
-		if (unlikely(offset >= iov->iov_len)) {
-			offset -= iov->iov_len;
-			continue;
-		}
-		copy = min_t(unsigned int, iov->iov_len - offset, len);
-		if (copy_to_user(iov->iov_base + offset, kdata, copy))
-			return -EFAULT;
-		offset = 0;
-		kdata += copy;
-		len -= copy;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(memcpy_toiovecend);
-
 /*
  *	Copy iovec to kernel. Returns -EFAULT on error.
  */
-- 
cgit v1.2.3


From 57dd8a0735aabff4862025cf64ad94da3d80e620 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2014 16:03:43 -0500
Subject: vhost: vhost_scsi_handle_vq() should just use copy_from_user()

it has just verified that it asks no more than the length of the
first segment of iovec.

And with that the last user of stuff in lib/iovec.c is gone.
RIP.

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Nicholas A. Bellinger <nab@linux-iscsi.org>
Cc: kvm@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 lib/Makefile |  2 +-
 lib/iovec.c  | 36 ------------------------------------
 2 files changed, 1 insertion(+), 37 deletions(-)
 delete mode 100644 lib/iovec.c

(limited to 'lib')

diff --git a/lib/Makefile b/lib/Makefile
index 3c3b30b9e020..1071d06398c3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -24,7 +24,7 @@ obj-y	+= lockref.o
 
 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
-	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
+	 gcd.o lcm.o list_sort.o uuid.o flex_array.o clz_ctz.o \
 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
 obj-y += string_helpers.o
diff --git a/lib/iovec.c b/lib/iovec.c
deleted file mode 100644
index d8f17a9b1ccf..000000000000
--- a/lib/iovec.c
+++ /dev/null
@@ -1,36 +0,0 @@
-#include <linux/uaccess.h>
-#include <linux/export.h>
-#include <linux/uio.h>
-
-/*
- *	Copy iovec to kernel. Returns -EFAULT on error.
- */
-
-int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
-			int offset, int len)
-{
-	/* No data? Done! */
-	if (len == 0)
-		return 0;
-
-	/* Skip over the finished iovecs */
-	while (offset >= iov->iov_len) {
-		offset -= iov->iov_len;
-		iov++;
-	}
-
-	while (len > 0) {
-		u8 __user *base = iov->iov_base + offset;
-		int copy = min_t(unsigned int, len, iov->iov_len - offset);
-
-		offset = 0;
-		if (copy_from_user(kdata, base, copy))
-			return -EFAULT;
-		len -= copy;
-		kdata += copy;
-		iov++;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(memcpy_fromiovecend);
-- 
cgit v1.2.3


From 28134a53d624ae7e90fff8500b25b3add4d40b92 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 4 Feb 2015 07:33:22 +1100
Subject: rhashtable: Fix potential crash on destroy in rhashtable_shrink

The current being_destroyed check in rhashtable_expand is not
enough since if we start a shrinking process after freeing all
elements in the table that's also going to crash.

This patch adds a being_destroyed check to the deferred worker
thread so that we bail out as soon as we take the lock.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index c41e21096373..904b419b72f5 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -487,6 +487,9 @@ static void rht_deferred_worker(struct work_struct *work)
 
 	ht = container_of(work, struct rhashtable, run_work);
 	mutex_lock(&ht->mutex);
+	if (ht->being_destroyed)
+		goto unlock;
+
 	tbl = rht_dereference(ht->tbl, ht);
 
 	if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
@@ -494,6 +497,7 @@ static void rht_deferred_worker(struct work_struct *work)
 	else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size))
 		rhashtable_shrink(ht);
 
+unlock:
 	mutex_unlock(&ht->mutex);
 }
 
-- 
cgit v1.2.3


From f2dba9c6ff0d9a515b4c3f1b037cd65c8b2a868c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 4 Feb 2015 07:33:23 +1100
Subject: rhashtable: Introduce rhashtable_walk_*

Some existing rhashtable users get too intimate with it by walking
the buckets directly.  This prevents us from easily changing the
internals of rhashtable.

This patch adds the helpers rhashtable_walk_init/exit/start/next/stop
which will replace these custom walkers.

They are meant to be usable for both procfs seq_file walks as well
as walking by a netlink dump.  The iterator structure should fit
inside a netlink dump cb structure, with at least one element to
spare.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 904b419b72f5..057919164e23 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -484,6 +484,7 @@ static void rht_deferred_worker(struct work_struct *work)
 {
 	struct rhashtable *ht;
 	struct bucket_table *tbl;
+	struct rhashtable_walker *walker;
 
 	ht = container_of(work, struct rhashtable, run_work);
 	mutex_lock(&ht->mutex);
@@ -492,6 +493,9 @@ static void rht_deferred_worker(struct work_struct *work)
 
 	tbl = rht_dereference(ht->tbl, ht);
 
+	list_for_each_entry(walker, &ht->walkers, list)
+		walker->resize = true;
+
 	if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
 		rhashtable_expand(ht);
 	else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size))
@@ -822,6 +826,164 @@ exit:
 }
 EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert);
 
+/**
+ * rhashtable_walk_init - Initialise an iterator
+ * @ht:		Table to walk over
+ * @iter:	Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice.  Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may sleep so you must not call it from interrupt
+ * context or with spin locks held.
+ *
+ * You must call rhashtable_walk_exit if this function returns
+ * successfully.
+ */
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter)
+{
+	iter->ht = ht;
+	iter->p = NULL;
+	iter->slot = 0;
+	iter->skip = 0;
+
+	iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL);
+	if (!iter->walker)
+		return -ENOMEM;
+
+	mutex_lock(&ht->mutex);
+	list_add(&iter->walker->list, &ht->walkers);
+	mutex_unlock(&ht->mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_init);
+
+/**
+ * rhashtable_walk_exit - Free an iterator
+ * @iter:	Hash table Iterator
+ *
+ * This function frees resources allocated by rhashtable_walk_init.
+ */
+void rhashtable_walk_exit(struct rhashtable_iter *iter)
+{
+	mutex_lock(&iter->ht->mutex);
+	list_del(&iter->walker->list);
+	mutex_unlock(&iter->ht->mutex);
+	kfree(iter->walker);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
+
+/**
+ * rhashtable_walk_start - Start a hash table walk
+ * @iter:	Hash table iterator
+ *
+ * Start a hash table walk.  Note that we take the RCU lock in all
+ * cases including when we return an error.  So you must always call
+ * rhashtable_walk_stop to clean up.
+ *
+ * Returns zero if successful.
+ *
+ * Returns -EAGAIN if resize event occured.  Note that the iterator
+ * will rewind back to the beginning and you may use it immediately
+ * by calling rhashtable_walk_next.
+ */
+int rhashtable_walk_start(struct rhashtable_iter *iter)
+{
+	rcu_read_lock();
+
+	if (iter->walker->resize) {
+		iter->slot = 0;
+		iter->skip = 0;
+		iter->walker->resize = false;
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_start);
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter:	Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occured.  Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+	const struct bucket_table *tbl;
+	struct rhashtable *ht = iter->ht;
+	struct rhash_head *p = iter->p;
+	void *obj = NULL;
+
+	tbl = rht_dereference_rcu(ht->tbl, ht);
+
+	if (p) {
+		p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot);
+		goto next;
+	}
+
+	for (; iter->slot < tbl->size; iter->slot++) {
+		int skip = iter->skip;
+
+		rht_for_each_rcu(p, tbl, iter->slot) {
+			if (!skip)
+				break;
+			skip--;
+		}
+
+next:
+		if (!rht_is_a_nulls(p)) {
+			iter->skip++;
+			iter->p = p;
+			obj = rht_obj(ht, p);
+			goto out;
+		}
+
+		iter->skip = 0;
+	}
+
+	iter->p = NULL;
+
+out:
+	if (iter->walker->resize) {
+		iter->p = NULL;
+		iter->slot = 0;
+		iter->skip = 0;
+		iter->walker->resize = false;
+		return ERR_PTR(-EAGAIN);
+	}
+
+	return obj;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+
+/**
+ * rhashtable_walk_stop - Finish a hash table walk
+ * @iter:	Hash table iterator
+ *
+ * Finish a hash table walk.
+ */
+void rhashtable_walk_stop(struct rhashtable_iter *iter)
+{
+	rcu_read_unlock();
+	iter->p = NULL;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
+
 static size_t rounded_hashtable_size(struct rhashtable_params *params)
 {
 	return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
@@ -894,6 +1056,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
 	memset(ht, 0, sizeof(*ht));
 	mutex_init(&ht->mutex);
 	memcpy(&ht->p, params, sizeof(*params));
+	INIT_LIST_HEAD(&ht->walkers);
 
 	if (params->locks_mul)
 		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
-- 
cgit v1.2.3


From c88455ce50ae4224d84960ce2baa53e61580df27 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 5 Feb 2015 02:03:31 +0100
Subject: rhashtable: key_hashfn() must return full hash value

The value computed by key_hashfn() is used by rhashtable_lookup_compare()
to traverse both tables during a resize. key_hashfn() must therefore
return the hash value without the buckets mask applied so it can be
masked to the size of each individual table.

Fixes: 97defe1ecf86 ("rhashtable: Per bucket locks & deferred expansion/shrinking")
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 057919164e23..71fd0dd45ce3 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -94,13 +94,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
 
 static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
 {
-	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
-	u32 hash;
-
-	hash = ht->p.hashfn(key, len, ht->p.hash_rnd);
-	hash >>= HASH_RESERVED_SPACE;
-
-	return rht_bucket_index(tbl, hash);
+	return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE;
 }
 
 static u32 head_hashfn(const struct rhashtable *ht,
-- 
cgit v1.2.3


From a5ec68e3b8f2c95ea1a5d23dd543abbe0c8d0624 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 5 Feb 2015 02:03:32 +0100
Subject: rhashtable: Use a single bucket lock for sibling buckets

rhashtable currently allows to use a bucket lock per bucket. This
requires multiple levels of complicated nested locking because when
resizing, a single bucket of the smaller table will map to two
buckets in the larger table. So far rhashtable has explicitly locked
both buckets in the larger table.

By excluding the highest bit of the hash from the bucket lock map and
thus only allowing locks to buckets in a ratio of 1:2, the locking
can be simplified a lot without losing the benefits of multiple locks.
Larger tables which benefit from multiple locks will not have a single
lock per bucket anyway.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 170 ++++++++++++++++++++++---------------------------------
 1 file changed, 69 insertions(+), 101 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 71fd0dd45ce3..cea4244e032b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -1,7 +1,7 @@
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
  *
  * Based on the following paper:
@@ -34,12 +34,17 @@
 enum {
 	RHT_LOCK_NORMAL,
 	RHT_LOCK_NESTED,
-	RHT_LOCK_NESTED2,
 };
 
 /* The bucket lock is selected based on the hash and protects mutations
  * on a group of hash buckets.
  *
+ * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
+ * a single lock always covers both buckets which may both contains
+ * entries which link to the same bucket of the old table during resizing.
+ * This allows to simplify the locking as locking the bucket in both
+ * tables during resize always guarantee protection.
+ *
  * IMPORTANT: When holding the bucket lock of both the old and new table
  * during expansions and shrinking, the old bucket lock must always be
  * acquired first.
@@ -128,8 +133,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
 	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
 	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
 
-	/* Never allocate more than one lock per bucket */
-	size = min_t(unsigned int, size, tbl->size);
+	/* Never allocate more than 0.5 locks per bucket */
+	size = min_t(unsigned int, size, tbl->size >> 1);
 
 	if (sizeof(spinlock_t) != 0) {
 #ifdef CONFIG_NUMA
@@ -211,13 +216,36 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
 }
 EXPORT_SYMBOL_GPL(rht_shrink_below_30);
 
-static void hashtable_chain_unzip(const struct rhashtable *ht,
+static void lock_buckets(struct bucket_table *new_tbl,
+			 struct bucket_table *old_tbl, unsigned int hash)
+	__acquires(old_bucket_lock)
+{
+	spin_lock_bh(bucket_lock(old_tbl, hash));
+	if (new_tbl != old_tbl)
+		spin_lock_bh_nested(bucket_lock(new_tbl, hash),
+				    RHT_LOCK_NESTED);
+}
+
+static void unlock_buckets(struct bucket_table *new_tbl,
+			   struct bucket_table *old_tbl, unsigned int hash)
+	__releases(old_bucket_lock)
+{
+	if (new_tbl != old_tbl)
+		spin_unlock_bh(bucket_lock(new_tbl, hash));
+	spin_unlock_bh(bucket_lock(old_tbl, hash));
+}
+
+/**
+ * Unlink entries on bucket which hash to different bucket.
+ *
+ * Returns true if no more work needs to be performed on the bucket.
+ */
+static bool hashtable_chain_unzip(const struct rhashtable *ht,
 				  const struct bucket_table *new_tbl,
 				  struct bucket_table *old_tbl,
 				  size_t old_hash)
 {
 	struct rhash_head *he, *p, *next;
-	spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
 	unsigned int new_hash, new_hash2;
 
 	ASSERT_BUCKET_LOCK(old_tbl, old_hash);
@@ -226,10 +254,10 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
 				   old_hash);
 	if (rht_is_a_nulls(p))
-		return;
+		return false;
 
-	new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
+	new_hash = head_hashfn(ht, new_tbl, p);
+	ASSERT_BUCKET_LOCK(new_tbl, new_hash);
 
 	/* Advance the old bucket pointer one or more times until it
 	 * reaches a node that doesn't hash to the same bucket as the
@@ -237,22 +265,14 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rht_for_each_continue(he, p->next, old_tbl, old_hash) {
 		new_hash2 = head_hashfn(ht, new_tbl, he);
+		ASSERT_BUCKET_LOCK(new_tbl, new_hash2);
+
 		if (new_hash != new_hash2)
 			break;
 		p = he;
 	}
 	rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
 
-	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
-
-	/* If we have encountered an entry that maps to a different bucket in
-	 * the new table, lock down that bucket as well as we might cut off
-	 * the end of the chain.
-	 */
-	new_bucket_lock2 = bucket_lock(new_tbl, new_hash);
-	if (new_bucket_lock != new_bucket_lock2)
-		spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);
-
 	/* Find the subsequent node which does hash to the same
 	 * bucket as node P, or NULL if no such node exists.
 	 */
@@ -271,21 +291,16 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rcu_assign_pointer(p->next, next);
 
-	if (new_bucket_lock != new_bucket_lock2)
-		spin_unlock_bh(new_bucket_lock2);
-	spin_unlock_bh(new_bucket_lock);
+	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
+				   old_hash);
+
+	return !rht_is_a_nulls(p);
 }
 
 static void link_old_to_new(struct bucket_table *new_tbl,
 			    unsigned int new_hash, struct rhash_head *entry)
 {
-	spinlock_t *new_bucket_lock;
-
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
-
-	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
 	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
-	spin_unlock_bh(new_bucket_lock);
 }
 
 /**
@@ -308,7 +323,6 @@ int rhashtable_expand(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 	struct rhash_head *he;
-	spinlock_t *old_bucket_lock;
 	unsigned int new_hash, old_hash;
 	bool complete = false;
 
@@ -338,16 +352,14 @@ int rhashtable_expand(struct rhashtable *ht)
 	 */
 	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
 		old_hash = rht_bucket_index(old_tbl, new_hash);
-		old_bucket_lock = bucket_lock(old_tbl, old_hash);
-
-		spin_lock_bh(old_bucket_lock);
+		lock_buckets(new_tbl, old_tbl, new_hash);
 		rht_for_each(he, old_tbl, old_hash) {
 			if (head_hashfn(ht, new_tbl, he) == new_hash) {
 				link_old_to_new(new_tbl, new_hash, he);
 				break;
 			}
 		}
-		spin_unlock_bh(old_bucket_lock);
+		unlock_buckets(new_tbl, old_tbl, new_hash);
 	}
 
 	/* Publish the new table pointer. Lookups may now traverse
@@ -370,18 +382,13 @@ int rhashtable_expand(struct rhashtable *ht)
 		 */
 		complete = true;
 		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
-			struct rhash_head *head;
+			lock_buckets(new_tbl, old_tbl, old_hash);
 
-			old_bucket_lock = bucket_lock(old_tbl, old_hash);
-			spin_lock_bh(old_bucket_lock);
-
-			hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
-			head = rht_dereference_bucket(old_tbl->buckets[old_hash],
-						      old_tbl, old_hash);
-			if (!rht_is_a_nulls(head))
+			if (hashtable_chain_unzip(ht, new_tbl, old_tbl,
+						  old_hash))
 				complete = false;
 
-			spin_unlock_bh(old_bucket_lock);
+			unlock_buckets(new_tbl, old_tbl, old_hash);
 		}
 	}
 
@@ -409,7 +416,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
 int rhashtable_shrink(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
-	spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
 	unsigned int new_hash;
 
 	ASSERT_RHT_MUTEX(ht);
@@ -427,36 +433,16 @@ int rhashtable_shrink(struct rhashtable *ht)
 	 * always divide the size in half when shrinking, each bucket
 	 * in the new table maps to exactly two buckets in the old
 	 * table.
-	 *
-	 * As removals can occur concurrently on the old table, we need
-	 * to lock down both matching buckets in the old table.
 	 */
 	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
-		old_bucket_lock1 = bucket_lock(tbl, new_hash);
-		old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
-		new_bucket_lock = bucket_lock(new_tbl, new_hash);
-
-		spin_lock_bh(old_bucket_lock1);
-
-		/* Depending on the lock per buckets mapping, the bucket in
-		 * the lower and upper region may map to the same lock.
-		 */
-		if (old_bucket_lock1 != old_bucket_lock2) {
-			spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
-			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
-		} else {
-			spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
-		}
+		lock_buckets(new_tbl, tbl, new_hash);
 
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash]);
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash + new_tbl->size]);
 
-		spin_unlock_bh(new_bucket_lock);
-		if (old_bucket_lock1 != old_bucket_lock2)
-			spin_unlock_bh(old_bucket_lock2);
-		spin_unlock_bh(old_bucket_lock1);
+		unlock_buckets(new_tbl, tbl, new_hash);
 	}
 
 	/* Publish the new, valid hash table */
@@ -547,19 +533,18 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
  */
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl;
-	spinlock_t *lock;
+	struct bucket_table *tbl, *old_tbl;
 	unsigned hash;
 
 	rcu_read_lock();
 
 	tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
 	hash = head_hashfn(ht, tbl, obj);
-	lock = bucket_lock(tbl, hash);
 
-	spin_lock_bh(lock);
+	lock_buckets(tbl, old_tbl, hash);
 	__rhashtable_insert(ht, obj, tbl, hash);
-	spin_unlock_bh(lock);
+	unlock_buckets(tbl, old_tbl, hash);
 
 	rcu_read_unlock();
 }
@@ -582,21 +567,20 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  */
 bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 {
-	struct bucket_table *tbl;
+	struct bucket_table *tbl, *new_tbl, *old_tbl;
 	struct rhash_head __rcu **pprev;
 	struct rhash_head *he;
-	spinlock_t *lock;
-	unsigned int hash;
+	unsigned int hash, new_hash;
 	bool ret = false;
 
 	rcu_read_lock();
-	tbl = rht_dereference_rcu(ht->tbl, ht);
-	hash = head_hashfn(ht, tbl, obj);
-
-	lock = bucket_lock(tbl, hash);
-	spin_lock_bh(lock);
+	tbl = old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	new_hash = head_hashfn(ht, new_tbl, obj);
 
+	lock_buckets(new_tbl, old_tbl, new_hash);
 restart:
+	hash = rht_bucket_index(tbl, new_hash);
 	pprev = &tbl->buckets[hash];
 	rht_for_each(he, tbl, hash) {
 		if (he != obj) {
@@ -615,18 +599,12 @@ restart:
 	 * resizing. Thus traversing both is fine and the added cost is
 	 * very rare.
 	 */
-	if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) {
-		spin_unlock_bh(lock);
-
-		tbl = rht_dereference_rcu(ht->future_tbl, ht);
-		hash = head_hashfn(ht, tbl, obj);
-
-		lock = bucket_lock(tbl, hash);
-		spin_lock_bh(lock);
+	if (tbl != new_tbl) {
+		tbl = new_tbl;
 		goto restart;
 	}
 
-	spin_unlock_bh(lock);
+	unlock_buckets(new_tbl, old_tbl, new_hash);
 
 	if (ret) {
 		atomic_dec(&ht->nelems);
@@ -782,24 +760,17 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 				      void *arg)
 {
 	struct bucket_table *new_tbl, *old_tbl;
-	spinlock_t *new_bucket_lock, *old_bucket_lock;
-	u32 new_hash, old_hash;
+	u32 new_hash;
 	bool success = true;
 
 	BUG_ON(!ht->p.key_len);
 
 	rcu_read_lock();
-
 	old_tbl = rht_dereference_rcu(ht->tbl, ht);
-	old_hash = head_hashfn(ht, old_tbl, obj);
-	old_bucket_lock = bucket_lock(old_tbl, old_hash);
-	spin_lock_bh(old_bucket_lock);
-
 	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
 	new_hash = head_hashfn(ht, new_tbl, obj);
-	new_bucket_lock = bucket_lock(new_tbl, new_hash);
-	if (unlikely(old_tbl != new_tbl))
-		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+
+	lock_buckets(new_tbl, old_tbl, new_hash);
 
 	if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset,
 				      compare, arg)) {
@@ -810,10 +781,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 	__rhashtable_insert(ht, obj, new_tbl, new_hash);
 
 exit:
-	if (unlikely(old_tbl != new_tbl))
-		spin_unlock_bh(new_bucket_lock);
-	spin_unlock_bh(old_bucket_lock);
-
+	unlock_buckets(new_tbl, old_tbl, new_hash);
 	rcu_read_unlock();
 
 	return success;
-- 
cgit v1.2.3


From 2af4b52988fd4f7ae525fcada29d4db8680033d6 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 5 Feb 2015 02:03:33 +0100
Subject: rhashtable: Wait for RCU readers after final unzip work

We need to wait for all RCU readers to complete after the last bit of
unzipping has been completed. Otherwise the old table is freed up
prematurely.

Fixes: 7e1e77636e36 ("lib: Resizable, Scalable, Concurrent Hash Table")
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index cea4244e032b..fd1033d518c6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -392,6 +392,8 @@ int rhashtable_expand(struct rhashtable *ht)
 		}
 	}
 
+	synchronize_rcu();
+
 	bucket_table_free(old_tbl);
 	return 0;
 }
-- 
cgit v1.2.3


From a03eaec0df52a0f1fd37ebf7dcb2dc505d891255 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 5 Feb 2015 02:03:34 +0100
Subject: rhashtable: Dump bucket tables on locking violation under
 PROVE_LOCKING

This simplifies debugging of locking violations if compiled with
CONFIG_PROVE_LOCKING.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 99 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 75 insertions(+), 24 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index fd1033d518c6..c2c39495fac6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -54,26 +54,6 @@ static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
 	return &tbl->locks[hash & tbl->locks_mask];
 }
 
-#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
-#define ASSERT_BUCKET_LOCK(TBL, HASH) \
-	BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH))
-
-#ifdef CONFIG_PROVE_LOCKING
-int lockdep_rht_mutex_is_held(struct rhashtable *ht)
-{
-	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
-}
-EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
-
-int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
-{
-	spinlock_t *lock = bucket_lock(tbl, hash);
-
-	return (debug_locks) ? lockdep_is_held(lock) : 1;
-}
-EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
-#endif
-
 static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
 {
 	return (void *) he - ht->p.head_offset;
@@ -109,6 +89,77 @@ static u32 head_hashfn(const struct rhashtable *ht,
 	return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
 }
 
+#ifdef CONFIG_PROVE_LOCKING
+static void debug_dump_buckets(const struct rhashtable *ht,
+			       const struct bucket_table *tbl)
+{
+	struct rhash_head *he;
+	unsigned int i, hash;
+
+	for (i = 0; i < tbl->size; i++) {
+		pr_warn(" [Bucket %d] ", i);
+		rht_for_each_rcu(he, tbl, i) {
+			hash = head_hashfn(ht, tbl, he);
+			pr_cont("[hash = %#x, lock = %p] ",
+				hash, bucket_lock(tbl, hash));
+		}
+		pr_cont("\n");
+	}
+
+}
+
+static void debug_dump_table(struct rhashtable *ht,
+			     const struct bucket_table *tbl,
+			     unsigned int hash)
+{
+	struct bucket_table *old_tbl, *future_tbl;
+
+	pr_emerg("BUG: lock for hash %#x in table %p not held\n",
+		 hash, tbl);
+
+	rcu_read_lock();
+	future_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	if (future_tbl != old_tbl) {
+		pr_warn("Future table %p (size: %zd)\n",
+			future_tbl, future_tbl->size);
+		debug_dump_buckets(ht, future_tbl);
+	}
+
+	pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size);
+	debug_dump_buckets(ht, old_tbl);
+
+	rcu_read_unlock();
+}
+
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
+#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)				\
+	do {								\
+		if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) {	\
+			debug_dump_table(HT, TBL, HASH);		\
+			BUG();						\
+		}							\
+	} while (0)
+
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
+
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+	spinlock_t *lock = bucket_lock(tbl, hash);
+
+	return (debug_locks) ? lockdep_is_held(lock) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)
+#endif
+
+
 static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
 {
 	struct rhash_head __rcu **pprev;
@@ -240,7 +291,7 @@ static void unlock_buckets(struct bucket_table *new_tbl,
  *
  * Returns true if no more work needs to be performed on the bucket.
  */
-static bool hashtable_chain_unzip(const struct rhashtable *ht,
+static bool hashtable_chain_unzip(struct rhashtable *ht,
 				  const struct bucket_table *new_tbl,
 				  struct bucket_table *old_tbl,
 				  size_t old_hash)
@@ -248,7 +299,7 @@ static bool hashtable_chain_unzip(const struct rhashtable *ht,
 	struct rhash_head *he, *p, *next;
 	unsigned int new_hash, new_hash2;
 
-	ASSERT_BUCKET_LOCK(old_tbl, old_hash);
+	ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash);
 
 	/* Old bucket empty, no work needed. */
 	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
@@ -257,7 +308,7 @@ static bool hashtable_chain_unzip(const struct rhashtable *ht,
 		return false;
 
 	new_hash = head_hashfn(ht, new_tbl, p);
-	ASSERT_BUCKET_LOCK(new_tbl, new_hash);
+	ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash);
 
 	/* Advance the old bucket pointer one or more times until it
 	 * reaches a node that doesn't hash to the same bucket as the
@@ -265,7 +316,7 @@ static bool hashtable_chain_unzip(const struct rhashtable *ht,
 	 */
 	rht_for_each_continue(he, p->next, old_tbl, old_hash) {
 		new_hash2 = head_hashfn(ht, new_tbl, he);
-		ASSERT_BUCKET_LOCK(new_tbl, new_hash2);
+		ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2);
 
 		if (new_hash != new_hash2)
 			break;
-- 
cgit v1.2.3


From 7cd10db8de2b6a32ccabef2e0e01c7444faa49d4 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 5 Feb 2015 02:03:35 +0100
Subject: rhashtable: Add more lock verification

Catch hash miscalculations which result in hard to track down race
conditions.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index c2c39495fac6..ef0816b6be82 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -348,9 +348,11 @@ static bool hashtable_chain_unzip(struct rhashtable *ht,
 	return !rht_is_a_nulls(p);
 }
 
-static void link_old_to_new(struct bucket_table *new_tbl,
+static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl,
 			    unsigned int new_hash, struct rhash_head *entry)
 {
+	ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash);
+
 	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
 }
 
@@ -406,7 +408,7 @@ int rhashtable_expand(struct rhashtable *ht)
 		lock_buckets(new_tbl, old_tbl, new_hash);
 		rht_for_each(he, old_tbl, old_hash) {
 			if (head_hashfn(ht, new_tbl, he) == new_hash) {
-				link_old_to_new(new_tbl, new_hash, he);
+				link_old_to_new(ht, new_tbl, new_hash, he);
 				break;
 			}
 		}
@@ -492,6 +494,7 @@ int rhashtable_shrink(struct rhashtable *ht)
 
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash]);
+		ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size);
 		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 				   tbl->buckets[new_hash + new_tbl->size]);
 
@@ -557,6 +560,8 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
 	struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash],
 							 tbl, hash);
 
+	ASSERT_BUCKET_LOCK(ht, tbl, hash);
+
 	if (rht_is_a_nulls(head))
 		INIT_RHT_NULLS_HEAD(obj->next, ht, hash);
 	else
@@ -641,6 +646,7 @@ restart:
 			continue;
 		}
 
+		ASSERT_BUCKET_LOCK(ht, tbl, hash);
 		rcu_assign_pointer(*pprev, obj->next);
 
 		ret = true;
-- 
cgit v1.2.3


From cf52d52f9ccb9966ac019d9f79824195583e3e6c Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 5 Feb 2015 02:03:36 +0100
Subject: rhashtable: Avoid bucket cross reference after removal

During a resize, when two buckets in the larger table map to
a single bucket in the smaller table and the new table has already
been (partially) linked to the old table. Removal of an element
may result the bucket in the larger table to point to entries
which all hash to a different value than the bucket index. Thus
causing two buckets to point to the same sub chain after unzipping.
This is not illegal *during* the resize phase but after it has
completed.

Keep the old table around until all of the unzipping is done to
allow the removal code to only search for matching hashed entries
during this special period.

Reported-by: Ying Xue <ying.xue@windriver.com>
Fixes: 97defe1ecf86 ("rhashtable: Per bucket locks & deferred expansion/shrinking")
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index ef0816b6be82..5919d63f58e4 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -415,12 +415,6 @@ int rhashtable_expand(struct rhashtable *ht)
 		unlock_buckets(new_tbl, old_tbl, new_hash);
 	}
 
-	/* Publish the new table pointer. Lookups may now traverse
-	 * the new table, but they will not benefit from any
-	 * additional efficiency until later steps unzip the buckets.
-	 */
-	rcu_assign_pointer(ht->tbl, new_tbl);
-
 	/* Unzip interleaved hash chains */
 	while (!complete && !ht->being_destroyed) {
 		/* Wait for readers. All new readers will see the new
@@ -445,6 +439,7 @@ int rhashtable_expand(struct rhashtable *ht)
 		}
 	}
 
+	rcu_assign_pointer(ht->tbl, new_tbl);
 	synchronize_rcu();
 
 	bucket_table_free(old_tbl);
@@ -627,14 +622,14 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 {
 	struct bucket_table *tbl, *new_tbl, *old_tbl;
 	struct rhash_head __rcu **pprev;
-	struct rhash_head *he;
+	struct rhash_head *he, *he2;
 	unsigned int hash, new_hash;
 	bool ret = false;
 
 	rcu_read_lock();
 	tbl = old_tbl = rht_dereference_rcu(ht->tbl, ht);
 	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
-	new_hash = head_hashfn(ht, new_tbl, obj);
+	new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj));
 
 	lock_buckets(new_tbl, old_tbl, new_hash);
 restart:
@@ -647,8 +642,21 @@ restart:
 		}
 
 		ASSERT_BUCKET_LOCK(ht, tbl, hash);
-		rcu_assign_pointer(*pprev, obj->next);
 
+		if (unlikely(new_tbl != tbl)) {
+			rht_for_each_continue(he2, he->next, tbl, hash) {
+				if (head_hashfn(ht, tbl, he2) == hash) {
+					rcu_assign_pointer(*pprev, he2);
+					goto found;
+				}
+			}
+
+			INIT_RHT_NULLS_HEAD(*pprev, ht, hash);
+		} else {
+			rcu_assign_pointer(*pprev, obj->next);
+		}
+
+found:
 		ret = true;
 		break;
 	}
-- 
cgit v1.2.3


From 020219a69d40a205dad12b0ea1e6a46153793368 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 6 Feb 2015 16:08:43 +0000
Subject: rhashtable: Fix remove logic to avoid cross references between
 buckets

The remove logic properly searched the remaining chain for a matching
entry with an identical hash but it did this while searching from both
the old and new table. Instead in order to not leave stale references
behind we need to:

 1. When growing and searching from the new table:
    Search remaining chain for entry with same hash to avoid having
    the new table directly point to a entry with a different hash.

 2. When shrinking and searching from the old table:
    Check if the element after the removed would create a cross
    reference and avoid it if so.

These bugs were present from the beginning in nft_hash.

Also, both insert functions calculated the hash based on the mask of
the new table. This worked while growing. Wwhile shrinking, the mask
of the inew table is smaller than the mask of the old table. This lead
to a bit not being taken into account when selecting the bucket lock
and thus caused the wrong bucket to be locked eventually.

Fixes: 7e1e77636e36 ("lib: Resizable, Scalable, Concurrent Hash Table")
Fixes: 97defe1ecf86 ("rhashtable: Per bucket locks & deferred expansion/shrinking")
Reported-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 5919d63f58e4..e96fc00208bc 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -552,8 +552,10 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht)
 static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
 				struct bucket_table *tbl, u32 hash)
 {
-	struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash],
-							 tbl, hash);
+	struct rhash_head *head;
+
+	hash = rht_bucket_index(tbl, hash);
+	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
 
 	ASSERT_BUCKET_LOCK(ht, tbl, hash);
 
@@ -593,7 +595,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 
 	tbl = rht_dereference_rcu(ht->future_tbl, ht);
 	old_tbl = rht_dereference_rcu(ht->tbl, ht);
-	hash = head_hashfn(ht, tbl, obj);
+	hash = obj_raw_hashfn(ht, rht_obj(ht, obj));
 
 	lock_buckets(tbl, old_tbl, hash);
 	__rhashtable_insert(ht, obj, tbl, hash);
@@ -627,8 +629,8 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 	bool ret = false;
 
 	rcu_read_lock();
-	tbl = old_tbl = rht_dereference_rcu(ht->tbl, ht);
-	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
+	old_tbl = rht_dereference_rcu(ht->tbl, ht);
+	tbl = new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
 	new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj));
 
 	lock_buckets(new_tbl, old_tbl, new_hash);
@@ -643,15 +645,19 @@ restart:
 
 		ASSERT_BUCKET_LOCK(ht, tbl, hash);
 
-		if (unlikely(new_tbl != tbl)) {
-			rht_for_each_continue(he2, he->next, tbl, hash) {
+		if (old_tbl->size > new_tbl->size && tbl == old_tbl &&
+		    !rht_is_a_nulls(obj->next) &&
+		    head_hashfn(ht, tbl, obj->next) != hash) {
+			rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash));
+		} else if (unlikely(old_tbl->size < new_tbl->size && tbl == new_tbl)) {
+			rht_for_each_continue(he2, obj->next, tbl, hash) {
 				if (head_hashfn(ht, tbl, he2) == hash) {
 					rcu_assign_pointer(*pprev, he2);
 					goto found;
 				}
 			}
 
-			INIT_RHT_NULLS_HEAD(*pprev, ht, hash);
+			rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash));
 		} else {
 			rcu_assign_pointer(*pprev, obj->next);
 		}
@@ -666,8 +672,8 @@ found:
 	 * resizing. Thus traversing both is fine and the added cost is
 	 * very rare.
 	 */
-	if (tbl != new_tbl) {
-		tbl = new_tbl;
+	if (tbl != old_tbl) {
+		tbl = old_tbl;
 		goto restart;
 	}
 
@@ -835,7 +841,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 	rcu_read_lock();
 	old_tbl = rht_dereference_rcu(ht->tbl, ht);
 	new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
-	new_hash = head_hashfn(ht, new_tbl, obj);
+	new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj));
 
 	lock_buckets(new_tbl, old_tbl, new_hash);
 
-- 
cgit v1.2.3


From 61d7b097738c9e37f7d5dcb1adf54a54d34444f7 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 9 Feb 2015 14:04:03 +1100
Subject: rhashtable: using ERR_PTR requires linux/err.h

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index e96fc00208bc..9cc4c4a90d00 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -23,6 +23,7 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <linux/rhashtable.h>
+#include <linux/err.h>
 
 #define HASH_DEFAULT_SIZE	64UL
 #define HASH_MIN_SIZE		4UL
-- 
cgit v1.2.3