From 6d514b4e7737ad75a7e7e0a3f7dde45d46341691 Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@horizon.com>
Date: Mon, 23 Jun 2014 15:11:54 +0200
Subject: lib: crc32: Greatly shrink CRC combining code

There's no need for a full 32x32 matrix, when rows before the last are
just shifted copies of the rows after them.

There's still room for improvement (especially on X86 processors with
CRC32 and PCLMUL instructions), but this is a large step in the
right direction [which is in particular useful for its current user,
namely SCTP checksumming over multiple skb frags[] entries, i.e. in
IPVS balancing when other CRC32 offloads are not available].

The internal primitive is now called crc32_generic_shift and takes one
less argument; the XOR with crc2 is done in inline wrappers.

Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/crc32.c | 147 +++++++++++++++++++++++++++++-------------------------------
 1 file changed, 70 insertions(+), 77 deletions(-)

(limited to 'lib')

diff --git a/lib/crc32.c b/lib/crc32.c
index 21a7b2135af6..9af30ff334c5 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -50,30 +50,6 @@ MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>");
 MODULE_DESCRIPTION("Various CRC32 calculations");
 MODULE_LICENSE("GPL");
 
-#define GF2_DIM		32
-
-static u32 gf2_matrix_times(u32 *mat, u32 vec)
-{
-	u32 sum = 0;
-
-	while (vec) {
-		if (vec & 1)
-			sum ^= *mat;
-		vec >>= 1;
-		mat++;
-	}
-
-	return sum;
-}
-
-static void gf2_matrix_square(u32 *square, u32 *mat)
-{
-	int i;
-
-	for (i = 0; i < GF2_DIM; i++)
-		square[i] = gf2_matrix_times(mat, mat[i]);
-}
-
 #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8
 
 /* implements slicing-by-4 or slicing-by-8 algorithm */
@@ -155,51 +131,6 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
 }
 #endif
 
-/* For conditions of distribution and use, see copyright notice in zlib.h */
-static u32 crc32_generic_combine(u32 crc1, u32 crc2, size_t len2,
-				 u32 polynomial)
-{
-	u32 even[GF2_DIM]; /* Even-power-of-two zeros operator */
-	u32 odd[GF2_DIM];  /* Odd-power-of-two zeros operator  */
-	u32 row;
-	int i;
-
-	if (len2 <= 0)
-		return crc1;
-
-	/* Put operator for one zero bit in odd */
-	odd[0] = polynomial;
-	row = 1;
-	for (i = 1; i < GF2_DIM; i++) {
-		odd[i] = row;
-		row <<= 1;
-	}
-
-	gf2_matrix_square(even, odd); /* Put operator for two zero bits in even */
-	gf2_matrix_square(odd, even); /* Put operator for four zero bits in odd */
-
-	/* Apply len2 zeros to crc1 (first square will put the operator for one
-	 * zero byte, eight zero bits, in even).
-	 */
-	do {
-		/* Apply zeros operator for this bit of len2 */
-		gf2_matrix_square(even, odd);
-		if (len2 & 1)
-			crc1 = gf2_matrix_times(even, crc1);
-		len2 >>= 1;
-		/* If no more bits set, then done */
-		if (len2 == 0)
-			break;
-		/* Another iteration of the loop with odd and even swapped */
-		gf2_matrix_square(odd, even);
-		if (len2 & 1)
-			crc1 = gf2_matrix_times(odd, crc1);
-		len2 >>= 1;
-	} while (len2 != 0);
-
-	crc1 ^= crc2;
-	return crc1;
-}
 
 /**
  * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II
@@ -271,19 +202,81 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
 			(const u32 (*)[256])crc32ctable_le, CRC32C_POLY_LE);
 }
 #endif
-u32 __pure crc32_le_combine(u32 crc1, u32 crc2, size_t len2)
+EXPORT_SYMBOL(crc32_le);
+EXPORT_SYMBOL(__crc32c_le);
+
+/*
+ * This multiplies the polynomials x and y modulo the given modulus.
+ * This follows the "little-endian" CRC convention that the lsbit
+ * represents the highest power of x, and the msbit represents x^0.
+ */
+static u32 __attribute_const__ gf2_multiply(u32 x, u32 y, u32 modulus)
 {
-	return crc32_generic_combine(crc1, crc2, len2, CRCPOLY_LE);
+	u32 product = x & 1 ? y : 0;
+	int i;
+
+	for (i = 0; i < 31; i++) {
+		product = (product >> 1) ^ (product & 1 ? modulus : 0);
+		x >>= 1;
+		product ^= x & 1 ? y : 0;
+	}
+
+	return product;
 }
 
-u32 __pure __crc32c_le_combine(u32 crc1, u32 crc2, size_t len2)
+/**
+ * crc32_generic_shift - Append len 0 bytes to crc, in logarithmic time
+ * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient)
+ * @len: The number of bytes. @crc is multiplied by x^(8*@len)
+ * @polynomial: The modulus used to reduce the result to 32 bits.
+ *
+ * It's possible to parallelize CRC computations by computing a CRC
+ * over separate ranges of a buffer, then summing them.
+ * This shifts the given CRC by 8*len bits (i.e. produces the same effect
+ * as appending len bytes of zero to the data), in time proportional
+ * to log(len).
+ */
+static u32 __attribute_const__ crc32_generic_shift(u32 crc, size_t len,
+						   u32 polynomial)
 {
-	return crc32_generic_combine(crc1, crc2, len2, CRC32C_POLY_LE);
+	u32 power = polynomial;	/* CRC of x^32 */
+	int i;
+
+	/* Shift up to 32 bits in the simple linear way */
+	for (i = 0; i < 8 * (int)(len & 3); i++)
+		crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0);
+
+	len >>= 2;
+	if (!len)
+		return crc;
+
+	for (;;) {
+		/* "power" is x^(2^i), modulo the polynomial */
+		if (len & 1)
+			crc = gf2_multiply(crc, power, polynomial);
+
+		len >>= 1;
+		if (!len)
+			break;
+
+		/* Square power, advancing to x^(2^(i+1)) */
+		power = gf2_multiply(power, power, polynomial);
+	}
+
+	return crc;
 }
-EXPORT_SYMBOL(crc32_le);
-EXPORT_SYMBOL(crc32_le_combine);
-EXPORT_SYMBOL(__crc32c_le);
-EXPORT_SYMBOL(__crc32c_le_combine);
+
+u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len)
+{
+	return crc32_generic_shift(crc, len, CRCPOLY_LE);
+}
+
+u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len)
+{
+	return crc32_generic_shift(crc, len, CRC32C_POLY_LE);
+}
+EXPORT_SYMBOL(crc32_le_shift);
+EXPORT_SYMBOL(__crc32c_le_shift);
 
 /**
  * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
-- 
cgit v1.2.3


From 4fa8e03b22df9b34f87906fa29de788bfa628bff Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@horizon.com>
Date: Mon, 23 Jun 2014 15:11:55 +0200
Subject: lib: crc32: Mark test data __initconst

So it gets discarded after the selftest.

Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/crc32.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/crc32.c b/lib/crc32.c
index 9af30ff334c5..af938ab12468 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(crc32_be);
 #ifdef CONFIG_CRC32_SELFTEST
 
 /* 4096 random bytes */
-static u8 __attribute__((__aligned__(8))) test_buf[] =
+static u8 const __aligned(8) test_buf[] __initconst =
 {
 	0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30,
 	0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4,
@@ -868,7 +868,7 @@ static struct crc_test {
 	u32 crc_le;	/* expected crc32_le result */
 	u32 crc_be;	/* expected crc32_be result */
 	u32 crc32c_le;	/* expected crc32c_le result */
-} test[] =
+} const test[] __initconst =
 {
 	{0x674bf11d, 0x00000038, 0x00000542, 0x0af6d466, 0xd8b6e4c1, 0xf6e93d6c},
 	{0x35c672c6, 0x0000003a, 0x000001aa, 0xc6d3dfba, 0x28aaf3ad, 0x0fe92aca},
-- 
cgit v1.2.3


From d8f1c4778e957273c3b5b6e045d8d3af38484ca8 Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@horizon.com>
Date: Mon, 23 Jun 2014 15:11:56 +0200
Subject: lib: crc32: Add some additional __pure annotations

In case they help the compiler.

Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/crc32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/crc32.c b/lib/crc32.c
index af938ab12468..9a907d489d95 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -53,7 +53,7 @@ MODULE_LICENSE("GPL");
 #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8
 
 /* implements slicing-by-4 or slicing-by-8 algorithm */
-static inline u32
+static inline u32 __pure
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
-- 
cgit v1.2.3


From a69f5edb8ba20c87c5f7c96ec40581f9f51f2910 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 24 Jun 2014 11:20:48 -0700
Subject: mac_pton: Use bool not int return

Use bool instead of int as the return type.

All uses are tested with !.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/net_utils.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'lib')

diff --git a/lib/net_utils.c b/lib/net_utils.c
index 2e3c52c8d050..148fc6e99ef6 100644
--- a/lib/net_utils.c
+++ b/lib/net_utils.c
@@ -3,24 +3,24 @@
 #include <linux/ctype.h>
 #include <linux/kernel.h>
 
-int mac_pton(const char *s, u8 *mac)
+bool mac_pton(const char *s, u8 *mac)
 {
 	int i;
 
 	/* XX:XX:XX:XX:XX:XX */
 	if (strlen(s) < 3 * ETH_ALEN - 1)
-		return 0;
+		return false;
 
 	/* Don't dirty result unless string is valid MAC. */
 	for (i = 0; i < ETH_ALEN; i++) {
 		if (!isxdigit(s[i * 3]) || !isxdigit(s[i * 3 + 1]))
-			return 0;
+			return false;
 		if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':')
-			return 0;
+			return false;
 	}
 	for (i = 0; i < ETH_ALEN; i++) {
 		mac[i] = (hex_to_bin(s[i * 3]) << 4) | hex_to_bin(s[i * 3 + 1]);
 	}
-	return 1;
+	return true;
 }
 EXPORT_SYMBOL(mac_pton);
-- 
cgit v1.2.3


From ccc7f4968a18b980994e622006b84e0195754390 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico <vfalico@gmail.com>
Date: Thu, 17 Jul 2014 19:46:10 +0200
Subject: net: print net_device reg_state in netdev_* unless it's registered

This way we'll always know in what status the device is, unless it's
running normally (i.e. NETDEV_REGISTERED).

Also, emit a warning once in case of a bad reg_state.

CC: "David S. Miller" <davem@davemloft.net>
CC: Jason Baron <jbaron@akamai.com>
CC: Eric Dumazet <edumazet@google.com>
CC: Vlad Yasevich <vyasevic@redhat.com>
CC: stephen hemminger <stephen@networkplumber.org>
CC: Jerry Chu <hkchu@google.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Joe Perches <joe@perches.com>
Signed-off-by: Veaceslav Falico <vfalico@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/dynamic_debug.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 7288e38e1757..c9afbe2c445a 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -614,13 +614,15 @@ int __dynamic_netdev_dbg(struct _ddebug *descriptor,
 		char buf[PREFIX_SIZE];
 
 		res = dev_printk_emit(7, dev->dev.parent,
-				      "%s%s %s %s: %pV",
+				      "%s%s %s %s%s: %pV",
 				      dynamic_emit_prefix(descriptor, buf),
 				      dev_driver_string(dev->dev.parent),
 				      dev_name(dev->dev.parent),
-				      netdev_name(dev), &vaf);
+				      netdev_name(dev), netdev_reg_state(dev),
+				      &vaf);
 	} else if (dev) {
-		res = printk(KERN_DEBUG "%s: %pV", netdev_name(dev), &vaf);
+		res = printk(KERN_DEBUG "%s%s: %pV", netdev_name(dev),
+			     netdev_reg_state(dev), &vaf);
 	} else {
 		res = printk(KERN_DEBUG "(NULL net_device): %pV", &vaf);
 	}
-- 
cgit v1.2.3


From 2695fb552cbef1029aa025a98acb80cc51d66de5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 24 Jul 2014 16:38:21 -0700
Subject: net: filter: rename 'struct sock_filter_int' into 'struct bpf_insn'

eBPF is used by socket filtering, seccomp and soon by tracing and
exposed to userspace, therefore 'sock_filter_int' name is not accurate.
Rename it to 'bpf_insn'

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/test_bpf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index c579e0f58818..5f48623ee1a7 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -66,7 +66,7 @@ struct bpf_test {
 	const char *descr;
 	union {
 		struct sock_filter insns[MAX_INSNS];
-		struct sock_filter_int insns_int[MAX_INSNS];
+		struct bpf_insn insns_int[MAX_INSNS];
 	} u;
 	__u8 aux;
 	__u8 data[MAX_DATA];
@@ -1807,7 +1807,7 @@ static struct sk_filter *generate_filter(int which, int *err)
 
 		fp->len = flen;
 		memcpy(fp->insnsi, tests[which].u.insns_int,
-		       fp->len * sizeof(struct sock_filter_int));
+		       fp->len * sizeof(struct bpf_insn));
 
 		sk_filter_select_runtime(fp);
 		break;
-- 
cgit v1.2.3


From 4ada97abe937cdb3fc029a871d5b0f21aa661a60 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 28 Jul 2014 14:01:38 +0200
Subject: random32: mix in entropy from core to late initcall

Currently, we have a 3-stage seeding process in prandom():

Phase 1 is from the early actual initialization of prandom()
subsystem which happens during core_initcall() and remains
most likely until the beginning of late_initcall() phase.
Here, the system might not have enough entropy available
for seeding with strong randomness from the random driver.
That means, we currently have a 32bit weak LCG() seeding
the PRNG status register 1 and mixing that successively
into the other 3 registers just to get it up and running.

Phase 2 starts with late_initcall() phase resp. when the
random driver has initialized its non-blocking pool with
enough entropy. At that time, we throw away *all* inner
state from its 4 registers and do a full reseed with strong
randomness.

Phase 3 starts right after that and does a periodic reseed
with random slack of status register 1 by a strong random
source again.

A problem in phase 1 is that during bootup data structures
can be initialized, e.g. at module load time, and thus access
a weakly seeded prandom and are never changed for the rest
of their lifetime, thus carrying along the results from a
weak seed. Let's make sure that current but also future users
access a possibly better early seeded prandom.

This patch therefore improves phase 1 by trying to make it
more 'unpredictable' through mixing in seed from a possible
hardware source. Now, the mix-in xors inner state with the
outcome of either of the two functions arch_get_random_{,seed}_int(),
preferably arch_get_random_seed_int() as it likely represents
a non-deterministic random bit generator in hw rather than
a cryptographically secure PRNG in hw. However, not all might
have the first one, so we use the PRNG as a fallback if
available. As we xor the seed into the current state, the
worst case would be that a hardware source could be unverifiably
compromised or backdoored. In that case nevertheless it
would be as good as our original early seeding function
prandom_seed_very_weak() since we mix through xor which is
entropy preserving.

Joint work with Daniel Borkmann.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/random32.c | 49 ++++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

(limited to 'lib')

diff --git a/lib/random32.c b/lib/random32.c
index fa5da61ce7ad..c9b6bf3afe0c 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -40,6 +40,10 @@
 
 #ifdef CONFIG_RANDOM32_SELFTEST
 static void __init prandom_state_selftest(void);
+#else
+static inline void prandom_state_selftest(void)
+{
+}
 #endif
 
 static DEFINE_PER_CPU(struct rnd_state, net_rand_state);
@@ -53,8 +57,7 @@ static DEFINE_PER_CPU(struct rnd_state, net_rand_state);
  */
 u32 prandom_u32_state(struct rnd_state *state)
 {
-#define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b)
-
+#define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b)
 	state->s1 = TAUSWORTHE(state->s1,  6U, 13U, 4294967294U, 18U);
 	state->s2 = TAUSWORTHE(state->s2,  2U, 27U, 4294967288U,  2U);
 	state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U,  7U);
@@ -147,21 +150,25 @@ static void prandom_warmup(struct rnd_state *state)
 	prandom_u32_state(state);
 }
 
-static void prandom_seed_very_weak(struct rnd_state *state, u32 seed)
+static u32 __extract_hwseed(void)
 {
-	/* Note: This sort of seeding is ONLY used in test cases and
-	 * during boot at the time from core_initcall until late_initcall
-	 * as we don't have a stronger entropy source available yet.
-	 * After late_initcall, we reseed entire state, we have to (!),
-	 * otherwise an attacker just needs to search 32 bit space to
-	 * probe for our internal 128 bit state if he knows a couple
-	 * of prandom32 outputs!
-	 */
-#define LCG(x)	((x) * 69069U)	/* super-duper LCG */
-	state->s1 = __seed(LCG(seed),        2U);
-	state->s2 = __seed(LCG(state->s1),   8U);
-	state->s3 = __seed(LCG(state->s2),  16U);
-	state->s4 = __seed(LCG(state->s3), 128U);
+	u32 val = 0;
+
+	(void)(arch_get_random_seed_int(&val) ||
+	       arch_get_random_int(&val));
+
+	return val;
+}
+
+static void prandom_seed_early(struct rnd_state *state, u32 seed,
+			       bool mix_with_hwseed)
+{
+#define LCG(x)	 ((x) * 69069U)	/* super-duper LCG */
+#define HWSEED() (mix_with_hwseed ? __extract_hwseed() : 0)
+	state->s1 = __seed(HWSEED() ^ LCG(seed),        2U);
+	state->s2 = __seed(HWSEED() ^ LCG(state->s1),   8U);
+	state->s3 = __seed(HWSEED() ^ LCG(state->s2),  16U);
+	state->s4 = __seed(HWSEED() ^ LCG(state->s3), 128U);
 }
 
 /**
@@ -194,14 +201,13 @@ static int __init prandom_init(void)
 {
 	int i;
 
-#ifdef CONFIG_RANDOM32_SELFTEST
 	prandom_state_selftest();
-#endif
 
 	for_each_possible_cpu(i) {
 		struct rnd_state *state = &per_cpu(net_rand_state,i);
+		u32 weak_seed = (i + jiffies) ^ random_get_entropy();
 
-		prandom_seed_very_weak(state, (i + jiffies) ^ random_get_entropy());
+		prandom_seed_early(state, weak_seed, true);
 		prandom_warmup(state);
 	}
 
@@ -210,6 +216,7 @@ static int __init prandom_init(void)
 core_initcall(prandom_init);
 
 static void __prandom_timer(unsigned long dontcare);
+
 static DEFINE_TIMER(seed_timer, __prandom_timer, 0, 0);
 
 static void __prandom_timer(unsigned long dontcare)
@@ -419,7 +426,7 @@ static void __init prandom_state_selftest(void)
 	for (i = 0; i < ARRAY_SIZE(test1); i++) {
 		struct rnd_state state;
 
-		prandom_seed_very_weak(&state, test1[i].seed);
+		prandom_seed_early(&state, test1[i].seed, false);
 		prandom_warmup(&state);
 
 		if (test1[i].result != prandom_u32_state(&state))
@@ -434,7 +441,7 @@ static void __init prandom_state_selftest(void)
 	for (i = 0; i < ARRAY_SIZE(test2); i++) {
 		struct rnd_state state;
 
-		prandom_seed_very_weak(&state, test2[i].seed);
+		prandom_seed_early(&state, test2[i].seed, false);
 		prandom_warmup(&state);
 
 		for (j = 0; j < test2[i].iteration - 1; j++)
-- 
cgit v1.2.3


From 7ae457c1e5b45a1b826fad9d62b32191d2bdcfdb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 30 Jul 2014 20:34:16 -0700
Subject: net: filter: split 'struct sk_filter' into socket and bpf parts

clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix

split 'struct sk_filter' into
struct sk_filter {
	atomic_t        refcnt;
	struct rcu_head rcu;
	struct bpf_prog *prog;
};
and
struct bpf_prog {
        u32                     jited:1,
                                len:31;
        struct sock_fprog_kern  *orig_prog;
        unsigned int            (*bpf_func)(const struct sk_buff *skb,
                                            const struct bpf_insn *filter);
        union {
                struct sock_filter      insns[0];
                struct bpf_insn         insnsi[0];
                struct work_struct      work;
        };
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases

split SK_RUN_FILTER macro into:
    SK_RUN_FILTER to be used with 'struct sk_filter *' and
    BPF_PROG_RUN to be used with 'struct bpf_prog *'

__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function

also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:

sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter

API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet

API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/test_bpf.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'lib')

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 5f48623ee1a7..89e0345733bd 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -1761,9 +1761,9 @@ static int probe_filter_length(struct sock_filter *fp)
 	return len + 1;
 }
 
-static struct sk_filter *generate_filter(int which, int *err)
+static struct bpf_prog *generate_filter(int which, int *err)
 {
-	struct sk_filter *fp;
+	struct bpf_prog *fp;
 	struct sock_fprog_kern fprog;
 	unsigned int flen = probe_filter_length(tests[which].u.insns);
 	__u8 test_type = tests[which].aux & TEST_TYPE_MASK;
@@ -1773,7 +1773,7 @@ static struct sk_filter *generate_filter(int which, int *err)
 		fprog.filter = tests[which].u.insns;
 		fprog.len = flen;
 
-		*err = sk_unattached_filter_create(&fp, &fprog);
+		*err = bpf_prog_create(&fp, &fprog);
 		if (tests[which].aux & FLAG_EXPECTED_FAIL) {
 			if (*err == -EINVAL) {
 				pr_cont("PASS\n");
@@ -1798,7 +1798,7 @@ static struct sk_filter *generate_filter(int which, int *err)
 		break;
 
 	case INTERNAL:
-		fp = kzalloc(sk_filter_size(flen), GFP_KERNEL);
+		fp = kzalloc(bpf_prog_size(flen), GFP_KERNEL);
 		if (fp == NULL) {
 			pr_cont("UNEXPECTED_FAIL no memory left\n");
 			*err = -ENOMEM;
@@ -1809,7 +1809,7 @@ static struct sk_filter *generate_filter(int which, int *err)
 		memcpy(fp->insnsi, tests[which].u.insns_int,
 		       fp->len * sizeof(struct bpf_insn));
 
-		sk_filter_select_runtime(fp);
+		bpf_prog_select_runtime(fp);
 		break;
 	}
 
@@ -1817,21 +1817,21 @@ static struct sk_filter *generate_filter(int which, int *err)
 	return fp;
 }
 
-static void release_filter(struct sk_filter *fp, int which)
+static void release_filter(struct bpf_prog *fp, int which)
 {
 	__u8 test_type = tests[which].aux & TEST_TYPE_MASK;
 
 	switch (test_type) {
 	case CLASSIC:
-		sk_unattached_filter_destroy(fp);
+		bpf_prog_destroy(fp);
 		break;
 	case INTERNAL:
-		sk_filter_free(fp);
+		bpf_prog_free(fp);
 		break;
 	}
 }
 
-static int __run_one(const struct sk_filter *fp, const void *data,
+static int __run_one(const struct bpf_prog *fp, const void *data,
 		     int runs, u64 *duration)
 {
 	u64 start, finish;
@@ -1840,7 +1840,7 @@ static int __run_one(const struct sk_filter *fp, const void *data,
 	start = ktime_to_us(ktime_get());
 
 	for (i = 0; i < runs; i++)
-		ret = SK_RUN_FILTER(fp, data);
+		ret = BPF_PROG_RUN(fp, data);
 
 	finish = ktime_to_us(ktime_get());
 
@@ -1850,7 +1850,7 @@ static int __run_one(const struct sk_filter *fp, const void *data,
 	return ret;
 }
 
-static int run_one(const struct sk_filter *fp, struct bpf_test *test)
+static int run_one(const struct bpf_prog *fp, struct bpf_test *test)
 {
 	int err_cnt = 0, i, runs = MAX_TESTRUNS;
 
@@ -1884,7 +1884,7 @@ static __init int test_bpf(void)
 	int i, err_cnt = 0, pass_cnt = 0;
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
-		struct sk_filter *fp;
+		struct bpf_prog *fp;
 		int err;
 
 		pr_info("#%d %s ", i, tests[i].descr);
-- 
cgit v1.2.3


From 06ebb06d49486676272a3c030bfeef4bd969a8e6 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Thu, 31 Jul 2014 23:00:35 -0400
Subject: iovec: make sure the caller actually wants anything in
 memcpy_fromiovecend

Check for cases when the caller requests 0 bytes instead of running off
and dereferencing potentially invalid iovecs.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/iovec.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'lib')

diff --git a/lib/iovec.c b/lib/iovec.c
index 7a7c2da4cddf..df3abd1eaa4a 100644
--- a/lib/iovec.c
+++ b/lib/iovec.c
@@ -85,6 +85,10 @@ EXPORT_SYMBOL(memcpy_toiovecend);
 int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
 			int offset, int len)
 {
+	/* No data? Done! */
+	if (len == 0)
+		return 0;
+
 	/* Skip over the finished iovecs */
 	while (offset >= iov->iov_len) {
 		offset -= iov->iov_len;
-- 
cgit v1.2.3


From 7e1e77636e36075ebf118298855268468f1028e8 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 2 Aug 2014 11:47:44 +0200
Subject: lib: Resizable, Scalable, Concurrent Hash Table

Generic implementation of a resizable, scalable, concurrent hash table
based on [0]. The implementation supports both fixed size keys specified
via an offset and length, and arbitrary keys via custom hash and compare
functions.

Lookups are lockless and protected as RCU read side critical sections.
Automatic growing/shrinking based on user configurable watermarks is
available while allowing concurrent lookups to take place.

Objects to be hashed must include a struct rhash_head. The reason for not
using the existing struct hlist_head is that the expansion and shrinking
will have two buckets point to a single entry which would lead to obscure
reverse chaining behaviour.

Code includes a boot selftest if CONFIG_TEST_RHASHTABLE is defined.

[0] https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Reviewed-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/Kconfig.debug |   8 +
 lib/Makefile      |   2 +-
 lib/rhashtable.c  | 797 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 806 insertions(+), 1 deletion(-)
 create mode 100644 lib/rhashtable.c

(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7a638aa3545b..f11a2e8f6157 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1550,6 +1550,14 @@ config TEST_STRING_HELPERS
 config TEST_KSTRTOX
 	tristate "Test kstrto*() family of functions at runtime"
 
+config TEST_RHASHTABLE
+	bool "Perform selftest on resizable hash table"
+	default n
+	help
+	  Enable this option to test the rhashtable functions at boot.
+
+	  If unsure, say N.
+
 endmenu # runtime tests
 
 config PROVIDE_OHCI1394_DMA_INIT
diff --git a/lib/Makefile b/lib/Makefile
index ba967a19edba..fd248e4c05ad 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o percpu_ida.o hash.o
+	 percpu-refcount.o percpu_ida.o hash.o rhashtable.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
new file mode 100644
index 000000000000..e6940cf16628
--- /dev/null
+++ b/lib/rhashtable.c
@@ -0,0 +1,797 @@
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the following paper:
+ * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf
+ *
+ * Code partially derived from nft_hash
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+#include <linux/rhashtable.h>
+#include <linux/log2.h>
+
+#define HASH_DEFAULT_SIZE	64UL
+#define HASH_MIN_SIZE		4UL
+
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
+
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
+{
+	return ht->p.mutex_is_held();	/* callback from rhashtable_params */
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
+#endif /* CONFIG_PROVE_LOCKING */
+
+/**
+ * rht_obj - map an embedded hash head back to the object containing it
+ * @ht:		hash table, supplies the offset of the head within the object
+ * @he:		hashed node embedded in the object
+ */
+void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
+{
+	return (void *) he - ht->p.head_offset;
+}
+EXPORT_SYMBOL_GPL(rht_obj);
+
+static u32 __hashfn(const struct rhashtable *ht, const void *key,
+		      u32 len, u32 hsize)
+{
+	/* Reduce the seeded hash to a bucket index by masking with hsize - 1 */
+	const u32 mask = hsize - 1;
+	u32 raw = ht->p.hashfn(key, len, ht->p.hash_rnd);
+
+	return raw & mask;
+}
+
+/**
+ * rhashtable_hashfn - compute hash for key of given length
+ * @ht:		hash table to compute for
+ * @key:	pointer to key
+ * @len:	length of key
+ *
+ * Computes the hash value using the hash function provided in the 'hashfn'
+ * member of struct rhashtable_params. The returned value is guaranteed to
+ * be smaller than the number of buckets in the hash table.
+ */
+u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len)
+{
+	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+
+	return __hashfn(ht, key, len, tbl->size);
+}
+EXPORT_SYMBOL_GPL(rhashtable_hashfn);
+
+static u32 obj_hashfn(const struct rhashtable *ht, const void *ptr, u32 hsize)
+{
+	/* Fixed length key: hash the key bytes stored at key_offset */
+	if (likely(ht->p.key_len))
+		return __hashfn(ht, ptr + ht->p.key_offset,
+				ht->p.key_len, hsize);
+
+	/* No fixed key length: obj_hashfn hashes the object itself and the
+	 * result is masked down to a bucket index.
+	 */
+	return ht->p.obj_hashfn(ptr, ht->p.hash_rnd) & (hsize - 1);
+}
+
+/**
+ * rhashtable_obj_hashfn - compute hash for hashed object
+ * @ht:		hash table to compute for
+ * @ptr:	pointer to hashed object
+ *
+ * Computes the hash value using the hash function 'hashfn' if the hash
+ * table is set up to work with a fixed length key, or 'obj_hashfn'
+ * otherwise. The returned value is guaranteed to be smaller than the
+ * number of buckets in the hash table.
+ */
+u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr)
+{
+	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+
+	return obj_hashfn(ht, ptr, tbl->size);
+}
+EXPORT_SYMBOL_GPL(rhashtable_obj_hashfn);
+
+static u32 head_hashfn(const struct rhashtable *ht,
+		       const struct rhash_head *he, u32 hsize)
+{
+	return obj_hashfn(ht, rht_obj(ht, he), hsize); /* hash he's object */
+}
+
+/* Allocate a zeroed table of @nbuckets buckets; returns NULL on failure. */
+static struct bucket_table *bucket_table_alloc(size_t nbuckets, gfp_t flags)
+{
+	struct bucket_table *tbl;
+	size_t size;
+
+	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
+
+	/* vzalloc() is the fallback, so a kzalloc() failure need not warn */
+	tbl = kzalloc(size, flags | __GFP_NOWARN);
+	if (tbl == NULL)
+		tbl = vzalloc(size);
+	if (tbl != NULL)
+		tbl->size = nbuckets;
+
+	return tbl;
+}
+
+static void bucket_table_free(const struct bucket_table *tbl)
+{
+	kvfree(tbl);	/* table may come from kzalloc() or vzalloc() */
+}
+
+/**
+ * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
+ * @ht:		hash table
+ * @new_size:	table size (number of buckets) to measure the load against
+ */
+bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size)
+{
+	/* Expand table when exceeding 75% load (integer arithmetic) */
+	return ht->nelems > (new_size / 4 * 3);
+}
+EXPORT_SYMBOL_GPL(rht_grow_above_75);
+
+/**
+ * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
+ * @ht:		hash table
+ * @new_size:	table size (number of buckets) to measure the load against
+ */
+bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
+{
+	/* Shrink table beneath 30% load (integer arithmetic) */
+	return ht->nelems < (new_size * 3 / 10);
+}
+EXPORT_SYMBOL_GPL(rht_shrink_below_30);
+
+static void hashtable_chain_unzip(const struct rhashtable *ht,
+				  const struct bucket_table *new_tbl,
+				  struct bucket_table *old_tbl, size_t n)
+{
+	struct rhash_head *he, *p, *next;
+	unsigned int h;
+
+	/* Old bucket empty, no work needed. */
+	p = rht_dereference(old_tbl->buckets[n], ht);
+	if (!p)
+		return;
+
+	/* Advance the old bucket pointer one or more times until it
+	 * reaches a node that doesn't hash to the same new bucket as its
+	 * predecessor. p is left pointing at that predecessor.
+	 */
+	h = head_hashfn(ht, p, new_tbl->size);
+	rht_for_each(he, p->next, ht) {
+		if (head_hashfn(ht, he, new_tbl->size) != h)
+			break;
+		p = he;
+	}
+	RCU_INIT_POINTER(old_tbl->buckets[n], p->next);
+
+	/* Find the subsequent node which does hash to the same
+	 * bucket as node p, or NULL if no such node exists.
+	 */
+	next = NULL;
+	if (he) {
+		rht_for_each(he, he->next, ht) {
+			if (head_hashfn(ht, he, new_tbl->size) == h) {
+				next = he;
+				break;
+			}
+		}
+	}
+
+	/* Set p's next pointer to that subsequent node pointer,
+	 * bypassing the nodes which do not hash to p's bucket.
+	 */
+	RCU_INIT_POINTER(p->next, next);
+}
+
+/**
+ * rhashtable_expand - Expand hash table while allowing concurrent lookups
+ * @ht:		the hash table to expand
+ * @flags:	allocation flags
+ *
+ * A secondary bucket array is allocated and the hash entries are migrated
+ * while keeping them on both lists until the end of the RCU grace period.
+ *
+ * Must be called from a context where synchronize_rcu() is safe, i.e. not
+ * inside an rcu_read_lock() section, and with table mutations excluded by
+ * the caller. Concurrent RCU protected lookups are valid.
+ *
+ * Returns 0 on success or if max_shift is already reached, else -ENOMEM.
+ */
+int rhashtable_expand(struct rhashtable *ht, gfp_t flags)
+{
+	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+	struct rhash_head *he;
+	unsigned int i, h;
+	bool complete;
+
+	ASSERT_RHT_MUTEX(ht);
+
+	if (ht->p.max_shift && ht->shift >= ht->p.max_shift)
+		return 0;
+
+	new_tbl = bucket_table_alloc(old_tbl->size * 2, flags);
+	if (new_tbl == NULL)
+		return -ENOMEM;
+
+	ht->shift++;
+
+	/* For each new bucket, search the corresponding old bucket
+	 * for the first entry that hashes to the new bucket, and
+	 * link the new bucket to that entry. Since all the entries
+	 * which will end up in the new bucket appear in the same
+	 * old bucket, this constructs an entirely valid new hash
+	 * table, but with multiple buckets "zipped" together into a
+	 * single imprecise chain.
+	 */
+	for (i = 0; i < new_tbl->size; i++) {
+		h = i & (old_tbl->size - 1);
+		rht_for_each(he, old_tbl->buckets[h], ht) {
+			if (head_hashfn(ht, he, new_tbl->size) == i) {
+				RCU_INIT_POINTER(new_tbl->buckets[i], he);
+				break;
+			}
+		}
+	}
+
+	/* Publish the new table pointer. Lookups may now traverse
+	 * the new table, but they will not benefit from any
+	 * additional efficiency until later steps unzip the buckets.
+	 */
+	rcu_assign_pointer(ht->tbl, new_tbl);
+
+	/* Unzip interleaved hash chains */
+	do {
+		/* Wait for readers. All new readers will see the new
+		 * table, and thus no references to the old table will
+		 * remain.
+		 */
+		synchronize_rcu();
+
+		/* For each bucket in the old table (each of which
+		 * contains items from multiple buckets of the new
+		 * table): ...
+		 */
+		complete = true;
+		for (i = 0; i < old_tbl->size; i++) {
+			hashtable_chain_unzip(ht, new_tbl, old_tbl, i);
+			if (old_tbl->buckets[i] != NULL)
+				complete = false;
+		}
+	} while (!complete);
+
+	bucket_table_free(old_tbl);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_expand);
+
+/**
+ * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
+ * @ht:		the hash table to shrink
+ * @flags:	allocation flags
+ *
+ * Must be called from a context where synchronize_rcu() is safe, i.e. not
+ * inside an rcu_read_lock() section, and with table mutations excluded by
+ * the caller. Concurrent RCU protected lookups are valid.
+ *
+ * Returns 0 on success or if the table is at minimum size, else -ENOMEM.
+ */
+int rhashtable_shrink(struct rhashtable *ht, gfp_t flags)
+{
+	struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht);
+	struct rhash_head __rcu **pprev;
+	unsigned int i;
+
+	ASSERT_RHT_MUTEX(ht);
+
+	if (tbl->size <= HASH_MIN_SIZE)
+		return 0;
+
+	ntbl = bucket_table_alloc(tbl->size / 2, flags);
+	if (ntbl == NULL)
+		return -ENOMEM;
+
+	ht->shift--;
+
+	/* Link each bucket in the new table to the first bucket
+	 * in the old table that contains entries which will hash
+	 * to the new bucket.
+	 */
+	for (i = 0; i < ntbl->size; i++) {
+		ntbl->buckets[i] = tbl->buckets[i];
+
+		/* Walk to the end of the chain now heading the new bucket
+		 * and append the chain of old bucket i + ntbl->size to it,
+		 * as the entries of both old buckets hash to new bucket i.
+		 */
+		for (pprev = &ntbl->buckets[i]; *pprev != NULL;
+		     pprev = &rht_dereference(*pprev, ht)->next)
+			;
+		RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]);
+	}
+
+	/* Publish the new, valid hash table */
+	rcu_assign_pointer(ht->tbl, ntbl);
+
+	/* Wait for readers. No new readers will have references to the
+	 * old hash table.
+	 */
+	synchronize_rcu();
+
+	bucket_table_free(tbl);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_shrink);
+
+/**
+ * rhashtable_insert - insert object into hash table
+ * @ht:		hash table
+ * @obj:	pointer to hash head inside object
+ * @flags:	allocation flags (table expansion)
+ *
+ * Will automatically grow the table via rhashtable_expand() if the
+ * grow_decision function specified at rhashtable_init() returns true.
+ *
+ * The caller must ensure that no concurrent table mutations occur. It is
+ * however valid to have concurrent lookups if they are RCU protected.
+ */
+void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
+		       gfp_t flags)
+{
+	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
+	u32 hash;
+
+	ASSERT_RHT_MUTEX(ht);
+
+	hash = head_hashfn(ht, obj, tbl->size);
+	RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
+	rcu_assign_pointer(tbl->buckets[hash], obj);
+	ht->nelems++;
+
+	if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
+		rhashtable_expand(ht, flags);
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert);
+
+/**
+ * rhashtable_remove_pprev - remove object from hash table given previous element
+ * @ht:		hash table
+ * @obj:	pointer to hash head inside object
+ * @pprev:	pointer to the next pointer that currently references @obj
+ * @flags:	allocation flags (table shrinking)
+ *
+ * Identical to rhashtable_remove() but caller is already aware of the element
+ * in front of the element to be deleted. This is in particular useful for
+ * deletion when combined with walking or lookup.
+ */
+void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
+			     struct rhash_head **pprev, gfp_t flags)
+{
+	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
+
+	ASSERT_RHT_MUTEX(ht);
+
+	RCU_INIT_POINTER(*pprev, obj->next);
+	ht->nelems--;
+
+	if (ht->p.shrink_decision &&
+	    ht->p.shrink_decision(ht, tbl->size))
+		rhashtable_shrink(ht, flags);
+}
+EXPORT_SYMBOL_GPL(rhashtable_remove_pprev);
+
+/**
+ * rhashtable_remove - remove object from hash table
+ * @ht:		hash table
+ * @obj:	pointer to hash head inside object
+ * @flags:	allocation flags (table shrinking)
+ *
+ * Since the hash chain is single linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slow if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table via rhashtable_shrink() if the
+ * shrink_decision function specified at rhashtable_init() returns true.
+ *
+ * The caller must ensure that no concurrent table mutations occur. It is
+ * however valid to have concurrent lookups if they are RCU protected.
+ */
+bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj,
+		       gfp_t flags)
+{
+	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
+	struct rhash_head __rcu **pprev;
+	struct rhash_head *he;
+	u32 h;
+
+	ASSERT_RHT_MUTEX(ht);
+
+	h = head_hashfn(ht, obj, tbl->size);
+
+	pprev = &tbl->buckets[h];
+	rht_for_each(he, tbl->buckets[h], ht) {
+		if (he != obj) {
+			pprev = &he->next;
+			continue;
+		}
+
+		rhashtable_remove_pprev(ht, he, pprev, flags);
+		return true;
+	}
+
+	return false;	/* @obj was not found in its bucket chain */
+}
+EXPORT_SYMBOL_GPL(rhashtable_remove);
+
+/**
+ * rhashtable_lookup - lookup key in hash table
+ * @ht:		hash table
+ * @key:	pointer to key
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. Returns the first match, else NULL.
+ *
+ * This lookup function may only be used for fixed key hash tables (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * Lookups may occur in parallel with hash mutations as long as the lookup is
+ * guarded by rcu_read_lock(). The caller must take care of this.
+ */
+void *rhashtable_lookup(const struct rhashtable *ht, const void *key)
+{
+	const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+	struct rhash_head *he;
+	u32 h;
+
+	BUG_ON(!ht->p.key_len);
+
+	h = __hashfn(ht, key, ht->p.key_len, tbl->size);
+	rht_for_each_rcu(he, tbl->buckets[h], ht) {
+		void *obj = rht_obj(ht, he);
+
+		if (!memcmp(obj + ht->p.key_offset, key, ht->p.key_len))
+			return obj;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rhashtable_lookup);
+
+/**
+ * rhashtable_lookup_compare - search hash table with compare function
+ * @ht:		hash table
+ * @hash:	hash value of desired entry
+ * @compare:	compare function, must return true on match
+ * @arg:	argument passed on to compare function
+ *
+ * Traverses the bucket chain behind the provided hash value and calls the
+ * specified compare function for each entry. Returns the first entry on
+ * which it returned true, or NULL if there is none or @hash is out of range.
+ *
+ * Lookups may occur in parallel with hash mutations as long as the lookup is
+ * guarded by rcu_read_lock(). The caller must take care of this.
+ */
+void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash,
+				bool (*compare)(void *, void *), void *arg)
+{
+	const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+	struct rhash_head *he;
+
+	if (unlikely(hash >= tbl->size))
+		return NULL;
+
+	rht_for_each_rcu(he, tbl->buckets[hash], ht) {
+		void *obj = rht_obj(ht, he);
+
+		if (compare(obj, arg))
+			return obj;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
+
+static size_t rounded_hashtable_size(unsigned int nelem)
+{
+	return max(roundup_pow_of_two(nelem * 4 / 3), HASH_MIN_SIZE); /* 4/3 of hint */
+}
+
+/**
+ * rhashtable_init - initialize a new hash table
+ * @ht:		hash table to be initialized
+ * @params:	configuration parameters
+ *
+ * Initializes a new hash table based on @params, configured with either a
+ * fixed or a variable length key (examples below). Returns 0 on success,
+ * -EINVAL if the needed hash function is missing, or -ENOMEM.
+ *
+ * Configuration Example 1: Fixed length keys
+ * struct test_obj {
+ *	int			key;
+ *	void *			my_member;
+ *	struct rhash_head	node;
+ * };
+ *
+ * struct rhashtable_params params = {
+ *	.head_offset = offsetof(struct test_obj, node),
+ *	.key_offset = offsetof(struct test_obj, key),
+ *	.key_len = sizeof(int),
+ *	.hashfn = arch_fast_hash,
+ *	.mutex_is_held = &my_mutex_is_held,
+ * };
+ *
+ * Configuration Example 2: Variable length keys
+ * struct test_obj {
+ *	[...]
+ *	struct rhash_head	node;
+ * };
+ *
+ * u32 my_hash_fn(const void *data, u32 seed)
+ * {
+ *	const struct test_obj *obj = data;
+ *
+ *	return [... hash ...];
+ * }
+ *
+ * struct rhashtable_params params = {
+ *	.head_offset = offsetof(struct test_obj, node),
+ *	.hashfn = arch_fast_hash,
+ *	.obj_hashfn = my_hash_fn,
+ *	.mutex_is_held = &my_mutex_is_held,
+ * };
+ */
+int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
+{
+	struct bucket_table *tbl;
+	size_t size;
+
+	size = HASH_DEFAULT_SIZE;
+
+	if ((params->key_len && !params->hashfn) ||
+	    (!params->key_len && !params->obj_hashfn))
+		return -EINVAL;
+
+	if (params->nelem_hint)
+		size = rounded_hashtable_size(params->nelem_hint);
+
+	tbl = bucket_table_alloc(size, GFP_KERNEL);
+	if (tbl == NULL)
+		return -ENOMEM;
+
+	memset(ht, 0, sizeof(*ht));
+	ht->shift = ilog2(tbl->size);
+	memcpy(&ht->p, params, sizeof(*params));
+	RCU_INIT_POINTER(ht->tbl, tbl);
+
+	if (!ht->p.hash_rnd)
+		get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_init);
+
+/**
+ * rhashtable_destroy - destroy hash table
+ * @ht:		the hash table to destroy
+ *
+ * Frees the bucket array; the hashed objects themselves are not freed.
+ */
+void rhashtable_destroy(const struct rhashtable *ht)
+{
+	const struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
+
+	bucket_table_free(tbl);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+/**************************************************************************
+ * Self Test
+ **************************************************************************/
+
+#ifdef CONFIG_TEST_RHASHTABLE
+
+#define TEST_HT_SIZE	8
+#define TEST_ENTRIES	2048
+#define TEST_PTR	((void *) 0xdeadbeef)
+#define TEST_NEXPANDS	4
+
+static int test_mutex_is_held(void)
+{
+	return 1;	/* unconditionally report the mutex as held */
+}
+
+struct test_obj {
+	void			*ptr;	/* set to TEST_PTR on insert */
+	int			value;	/* hash key (see key_offset) */
+	struct rhash_head	node;	/* hash chain linkage */
+};
+
+static int __init test_rht_lookup(struct rhashtable *ht)
+{
+	unsigned int i;
+
+	for (i = 0; i < TEST_ENTRIES * 2; i++) {
+		bool expected = !(i % 2);	/* only even keys are inserted */
+		u32 key = i;
+		struct test_obj *obj = rhashtable_lookup(ht, &key);
+
+		if (!obj) {
+			if (!expected)
+				continue;
+			pr_warn("Test failed: Could not find key %u\n", key);
+			return -ENOENT;
+		}
+		if (!expected) {
+			pr_warn("Test failed: Unexpected entry found for key %u\n",
+				key);
+			return -EEXIST;
+		}
+		if (obj->ptr != TEST_PTR || obj->value != i) {
+			pr_warn("Test failed: Lookup value mismatch %p!=%p, %u!=%u\n",
+				obj->ptr, TEST_PTR, obj->value, i);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static void test_bucket_stats(struct rhashtable *ht,
+				     struct bucket_table *tbl,
+				     bool quiet)
+{
+	unsigned int bucket, chain_len, counted = 0;
+	struct test_obj *obj;
+
+	/* Walk every chain, tallying (and unless @quiet, printing) entries */
+	for (bucket = 0; bucket < tbl->size; bucket++) {
+		chain_len = 0;
+		if (!quiet)
+			pr_info(" [%#4x/%zu]", bucket, tbl->size);
+
+		rht_for_each_entry_rcu(obj, tbl->buckets[bucket], node) {
+			chain_len++;
+			counted++;
+			if (!quiet)
+				pr_cont(" [%p],", obj);
+		}
+
+		if (!quiet)
+			pr_cont("\n  [%#x] first element: %p, chain length: %u\n",
+				bucket, tbl->buckets[bucket], chain_len);
+	}
+
+	pr_info("  Traversal complete: counted=%u, nelems=%zu, entries=%d\n",
+		counted, ht->nelems, TEST_ENTRIES);
+}
+
+/* Exercise insert, lookup, expand, shrink and remove; returns 0 or -errno */
+static int __init test_rhashtable(struct rhashtable *ht)
+{
+	struct bucket_table *tbl;
+	struct test_obj *obj, *next;
+	int err;
+	unsigned int i;
+
+	/* Insertion test: add TEST_ENTRIES objects, all with even keys */
+	pr_info("  Adding %d keys\n", TEST_ENTRIES);
+	for (i = 0; i < TEST_ENTRIES; i++) {
+		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+		if (!obj) {
+			err = -ENOMEM;
+			goto error;
+		}
+		obj->ptr = TEST_PTR;
+		obj->value = i * 2;
+		rhashtable_insert(ht, &obj->node, GFP_KERNEL);
+	}
+
+	rcu_read_lock();
+	tbl = rht_dereference_rcu(ht->tbl, ht);
+	test_bucket_stats(ht, tbl, true);
+	err = test_rht_lookup(ht);
+	rcu_read_unlock();
+	if (err)
+		goto error;	/* don't run into the BUG_ON() further down */
+
+	for (i = 0; i < TEST_NEXPANDS; i++) {
+		pr_info("  Table expansion iteration %u...\n", i);
+		rhashtable_expand(ht, GFP_KERNEL);
+
+		rcu_read_lock();
+		pr_info("  Verifying lookups...\n");
+		err = test_rht_lookup(ht);
+		rcu_read_unlock();
+		if (err)
+			goto error;
+	}
+
+	for (i = 0; i < TEST_NEXPANDS; i++) {
+		pr_info("  Table shrinkage iteration %u...\n", i);
+		rhashtable_shrink(ht, GFP_KERNEL);
+
+		rcu_read_lock();
+		pr_info("  Verifying lookups...\n");
+		err = test_rht_lookup(ht);
+		rcu_read_unlock();
+		if (err)
+			goto error;
+	}
+
+	pr_info("  Deleting %d keys\n", TEST_ENTRIES);
+	for (i = 0; i < TEST_ENTRIES; i++) {
+		u32 key = i * 2;
+
+		obj = rhashtable_lookup(ht, &key);
+		BUG_ON(!obj);
+		rhashtable_remove(ht, &obj->node, GFP_KERNEL);
+		kfree(obj);
+	}
+
+	return 0;
+
+error:
+	/* Free every object that is still linked into the table */
+	tbl = rht_dereference_rcu(ht->tbl, ht);
+	for (i = 0; i < tbl->size; i++)
+		rht_for_each_entry_safe(obj, next, tbl->buckets[i], ht, node)
+			kfree(obj);
+
+	return err;
+}
+
+static int __init test_rht_init(void)
+{
+	struct rhashtable ht;
+	struct rhashtable_params params = {
+		.nelem_hint = TEST_HT_SIZE,
+		.head_offset = offsetof(struct test_obj, node),
+		.key_offset = offsetof(struct test_obj, value),
+		.key_len = sizeof(int),	/* fixed length key: test_obj.value */
+		.hashfn = arch_fast_hash,
+		.mutex_is_held = &test_mutex_is_held,
+		.grow_decision = rht_grow_above_75,
+		.shrink_decision = rht_shrink_below_30,
+	};
+	int err;
+
+	pr_info("Running resizable hashtable tests...\n");
+
+	err = rhashtable_init(&ht, &params);
+	if (err < 0) {
+		pr_warn("Test failed: Unable to initialize hashtable: %d\n",
+			err);
+		return err;
+	}
+
+	err = test_rhashtable(&ht);
+
+	rhashtable_destroy(&ht);	/* frees the bucket array only */
+
+	return err;
+}
+
+subsys_initcall(test_rht_init);
+
+#endif /* CONFIG_TEST_RHASHTABLE */
-- 
cgit v1.2.3