diff options
Diffstat (limited to 'lib')
60 files changed, 3681 insertions, 1148 deletions
diff --git a/lib/Kconfig b/lib/Kconfig index 3cca1222578e..d79909dc01ec 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -362,6 +362,9 @@ config INTERVAL_TREE for more information. +config RADIX_TREE_MULTIORDER + bool + config ASSOCIATIVE_ARRAY bool help @@ -523,6 +526,13 @@ config SG_SPLIT a scatterlist. This should be selected by a driver or an API which whishes to split a scatterlist amongst multiple DMA channels. +config SG_POOL + def_bool n + help + Provides a helper to allocate chained scatterlists. This should be + selected by a driver or an API which whishes to allocate chained + scatterlist. + # # sg chaining option # diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c0ae0fc125db..39d07e754822 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -244,6 +244,7 @@ config PAGE_OWNER depends on DEBUG_KERNEL && STACKTRACE_SUPPORT select DEBUG_FS select STACKTRACE + select STACKDEPOT select PAGE_EXTENSION help This keeps track of what call chain is the owner of a page, may @@ -257,6 +258,7 @@ config PAGE_OWNER config DEBUG_FS bool "Debug Filesystem" + select SRCU help debugfs is a virtual file system that kernel developers use to put debugging files into. Enable this option to be able to read and @@ -707,6 +709,8 @@ config KCOV bool "Code coverage for fuzzing" depends on ARCH_HAS_KCOV select DEBUG_FS + select GCC_PLUGINS if !COMPILE_TEST + select GCC_PLUGIN_SANCOV if !COMPILE_TEST help KCOV exposes kernel code coverage information in a form suitable for coverage-guided fuzzing (randomized testing). @@ -717,6 +721,17 @@ config KCOV For more details, see Documentation/kcov.txt. +config KCOV_INSTRUMENT_ALL + bool "Instrument all code by default" + depends on KCOV + default y if KCOV + help + If you are doing generic system call fuzzing (like e.g. syzkaller), + then you will want to instrument the whole kernel and you should + say y here. If you are doing more targeted fuzzing (like e.g. + filesystem fuzzing with AFL) then you will want to enable coverage + for more specific subsets of files, and should say n here. + config DEBUG_SHIRQ bool "Debug shared IRQ handlers" depends on DEBUG_KERNEL @@ -806,7 +821,7 @@ config DETECT_HUNG_TASK help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in - uninterruptible "D" state indefinitiley. + uninterruptible "D" state indefinitely. When a hung task is detected, the kernel will print the current stack trace (which you should report), but the @@ -1306,22 +1321,6 @@ config RCU_PERF_TEST Say M if you want the RCU performance tests to build as a module. Say N if you are unsure. -config RCU_PERF_TEST_RUNNABLE - bool "performance tests for RCU runnable by default" - depends on RCU_PERF_TEST = y - default n - help - This option provides a way to build the RCU performance tests - directly into the kernel without them starting up at boot time. - You can use /sys/module to manually override this setting. - This /proc file is available only when the RCU performance - tests have been built into the kernel. - - Say Y here if you want the RCU performance tests to start during - boot (you probably don't). - Say N here if you want the RCU performance tests to start only - after being manually enabled via /sys/module. - config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL @@ -1339,23 +1338,6 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_TORTURE_TEST_RUNNABLE - bool "torture tests for RCU runnable by default" - depends on RCU_TORTURE_TEST = y - default n - help - This option provides a way to build the RCU torture tests - directly into the kernel without them starting up at boot - time. You can use /proc/sys/kernel/rcutorture_runnable - to manually override this setting. This /proc file is - available only when the RCU torture tests have been built - into the kernel. - - Say Y here if you want the RCU torture tests to start during - boot (you probably don't). - Say N here if you want the RCU torture tests to start only - after being manually enabled via /proc. - config RCU_TORTURE_TEST_SLOW_PREINIT bool "Slow down RCU grace-period pre-initialization to expose races" depends on RCU_TORTURE_TEST @@ -1704,24 +1686,6 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. -config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - bool - -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict user copy size checks" - depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - help - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, say N. - source kernel/trace/Kconfig menu "Runtime Testing" @@ -1840,6 +1804,9 @@ config TEST_BITMAP If unsure, say N. +config TEST_UUID + tristate "Test functions located in the uuid module at runtime" + config TEST_RHASHTABLE tristate "Perform selftest on resizable hash table" default n @@ -1848,6 +1815,17 @@ config TEST_RHASHTABLE If unsure, say N. +config TEST_HASH + tristate "Perform selftest on hash functions" + default n + help + Enable this option to test the kernel's integer (<linux/hash,h>) + and string (<linux/stringhash.h>) hash functions on boot + (or module load). + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. + endmenu # runtime tests config PROVIDE_OHCI1394_DMA_INIT diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 67d8c6838ba9..bd38aab05929 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" - depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) + depends on SLUB || (SLAB && !DEBUG_SLAB) select CONSTRUCTORS - select STACKDEPOT if SLAB + select STACKDEPOT help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index c635a107a7de..533f912638ed 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -22,7 +22,7 @@ config KGDB_SERIAL_CONSOLE tristate "KGDB: use kgdb over the serial console" select CONSOLE_POLL select MAGIC_SYSRQ - depends on TTY + depends on TTY && HW_CONSOLE default y help Share a serial console with kgdb. Sysrq-g must be used diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 39494af9a84a..bc6e651df68c 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -1,6 +1,9 @@ config ARCH_HAS_UBSAN_SANITIZE_ALL bool +config ARCH_WANTS_UBSAN_NO_NULL + def_bool n + config UBSAN bool "Undefined behaviour sanity checker" help @@ -34,3 +37,11 @@ config UBSAN_ALIGNMENT This option enables detection of unaligned memory accesses. Enabling this option on architectures that support unaligned accesses may produce a lot of false positives. + +config UBSAN_NULL + bool "Enable checking of null pointers" + depends on UBSAN + default y if !ARCH_WANTS_UBSAN_NO_NULL + help + This option enables detection of memory accesses via a + null pointer. diff --git a/lib/Makefile b/lib/Makefile index a65e9a861535..df747e5eeb7a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,19 +15,15 @@ KCOV_INSTRUMENT_rbtree.o := n KCOV_INSTRUMENT_list_debug.o := n KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n -# Kernel does not boot if we instrument this file as it uses custom calling -# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS). -KCOV_INSTRUMENT_hweight.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ - sha1.o md5.o irq_regs.o argv_split.o \ + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o -obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o lib-$(CONFIG_HAS_DMA) += dma-noop.o @@ -48,6 +44,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +obj-$(CONFIG_TEST_HASH) += test_hash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LKM) += test_module.o @@ -57,6 +54,7 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o +obj-$(CONFIG_TEST_UUID) += test_uuid.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@ -72,8 +70,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -GCOV_PROFILE_hweight.o := n -CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o @@ -178,6 +174,7 @@ obj-$(CONFIG_GENERIC_STRNLEN_USER) += strnlen_user.o obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o obj-$(CONFIG_SG_SPLIT) += sg_split.o +obj-$(CONFIG_SG_POOL) += sg_pool.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o obj-$(CONFIG_IRQ_POLL) += irq_poll.o diff --git a/lib/atomic64.c b/lib/atomic64.c index 2886ebac6567..53c2d5edc826 100644 --- a/lib/atomic64.c +++ b/lib/atomic64.c @@ -96,17 +96,41 @@ long long atomic64_##op##_return(long long a, atomic64_t *v) \ } \ EXPORT_SYMBOL(atomic64_##op##_return); +#define ATOMIC64_FETCH_OP(op, c_op) \ +long long atomic64_fetch_##op(long long a, atomic64_t *v) \ +{ \ + unsigned long flags; \ + raw_spinlock_t *lock = lock_addr(v); \ + long long val; \ + \ + raw_spin_lock_irqsave(lock, flags); \ + val = v->counter; \ + v->counter c_op a; \ + raw_spin_unlock_irqrestore(lock, flags); \ + return val; \ +} \ +EXPORT_SYMBOL(atomic64_fetch_##op); + #define ATOMIC64_OPS(op, c_op) \ ATOMIC64_OP(op, c_op) \ - ATOMIC64_OP_RETURN(op, c_op) + ATOMIC64_OP_RETURN(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) ATOMIC64_OPS(add, +=) ATOMIC64_OPS(sub, -=) -ATOMIC64_OP(and, &=) -ATOMIC64_OP(or, |=) -ATOMIC64_OP(xor, ^=) #undef ATOMIC64_OPS +#define ATOMIC64_OPS(op, c_op) \ + ATOMIC64_OP(op, c_op) \ + ATOMIC64_OP_RETURN(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) + +ATOMIC64_OPS(and, &=) +ATOMIC64_OPS(or, |=) +ATOMIC64_OPS(xor, ^=) + +#undef ATOMIC64_OPS +#undef ATOMIC64_FETCH_OP #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 123481814320..dbb369145dda 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -53,11 +53,25 @@ do { \ BUG_ON(atomic##bit##_read(&v) != r); \ } while (0) +#define TEST_FETCH(bit, op, c_op, val) \ +do { \ + atomic##bit##_set(&v, v0); \ + r = v0; \ + r c_op val; \ + BUG_ON(atomic##bit##_##op(val, &v) != v0); \ + BUG_ON(atomic##bit##_read(&v) != r); \ +} while (0) + #define RETURN_FAMILY_TEST(bit, op, c_op, val) \ do { \ FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \ } while (0) +#define FETCH_FAMILY_TEST(bit, op, c_op, val) \ +do { \ + FAMILY_TEST(TEST_FETCH, bit, op, c_op, val); \ +} while (0) + #define TEST_ARGS(bit, op, init, ret, expect, args...) \ do { \ atomic##bit##_set(&v, init); \ @@ -114,6 +128,16 @@ static __init void test_atomic(void) RETURN_FAMILY_TEST(, sub_return, -=, onestwos); RETURN_FAMILY_TEST(, sub_return, -=, -one); + FETCH_FAMILY_TEST(, fetch_add, +=, onestwos); + FETCH_FAMILY_TEST(, fetch_add, +=, -one); + FETCH_FAMILY_TEST(, fetch_sub, -=, onestwos); + FETCH_FAMILY_TEST(, fetch_sub, -=, -one); + + FETCH_FAMILY_TEST(, fetch_or, |=, v1); + FETCH_FAMILY_TEST(, fetch_and, &=, v1); + FETCH_FAMILY_TEST(, fetch_andnot, &= ~, v1); + FETCH_FAMILY_TEST(, fetch_xor, ^=, v1); + INC_RETURN_FAMILY_TEST(, v0); DEC_RETURN_FAMILY_TEST(, v0); @@ -154,6 +178,16 @@ static __init void test_atomic64(void) RETURN_FAMILY_TEST(64, sub_return, -=, onestwos); RETURN_FAMILY_TEST(64, sub_return, -=, -one); + FETCH_FAMILY_TEST(64, fetch_add, +=, onestwos); + FETCH_FAMILY_TEST(64, fetch_add, +=, -one); + FETCH_FAMILY_TEST(64, fetch_sub, -=, onestwos); + FETCH_FAMILY_TEST(64, fetch_sub, -=, -one); + + FETCH_FAMILY_TEST(64, fetch_or, |=, v1); + FETCH_FAMILY_TEST(64, fetch_and, &=, v1); + FETCH_FAMILY_TEST(64, fetch_andnot, &= ~, v1); + FETCH_FAMILY_TEST(64, fetch_xor, ^=, v1); + INIT(v0); atomic64_inc(&v); r += one; diff --git a/lib/bitmap.c b/lib/bitmap.c index c66da508cbf7..eca88087fa8a 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -14,9 +14,9 @@ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/string.h> +#include <linux/uaccess.h> #include <asm/page.h> -#include <asm/uaccess.h> /* * bitmaps provide an array of bits, implemented using an an diff --git a/lib/chacha20.c b/lib/chacha20.c new file mode 100644 index 000000000000..250ceed9ec9a --- /dev/null +++ b/lib/chacha20.c @@ -0,0 +1,79 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/bitops.h> +#include <linux/cryptohash.h> +#include <asm/unaligned.h> +#include <crypto/chacha20.h> + +static inline u32 rotl32(u32 v, u8 n) +{ + return (v << n) | (v >> (sizeof(v) * 8 - n)); +} + +extern void chacha20_block(u32 *state, void *stream) +{ + u32 x[16], *out = stream; + int i; + + for (i = 0; i < ARRAY_SIZE(x); i++) + x[i] = state[i]; + + for (i = 0; i < 20; i += 2) { + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); + + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); + + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); + + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); + + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); + + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); + + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); + + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); + } + + for (i = 0; i < ARRAY_SIZE(x); i++) + out[i] = cpu_to_le32(x[i] + state[i]); + + state[12]++; +} +EXPORT_SYMBOL(chacha20_block); diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c index 707ca24f7b18..0e2c9a1e958a 100644 --- a/lib/cpu-notifier-error-inject.c +++ b/lib/cpu-notifier-error-inject.c @@ -8,16 +8,47 @@ static int priority; module_param(priority, int, 0); MODULE_PARM_DESC(priority, "specify cpu notifier priority"); +#define UP_PREPARE 0 +#define UP_PREPARE_FROZEN 0 +#define DOWN_PREPARE 0 +#define DOWN_PREPARE_FROZEN 0 + static struct notifier_err_inject cpu_notifier_err_inject = { .actions = { - { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE_FROZEN) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE_FROZEN) }, + { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE) }, + { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE_FROZEN) }, + { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE) }, + { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE_FROZEN) }, {} } }; +static int notf_err_handle(struct notifier_err_inject_action *action) +{ + int ret; + + ret = action->error; + if (ret) + pr_info("Injecting error (%d) to %s\n", ret, action->name); + return ret; +} + +static int notf_err_inj_up_prepare(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) + return notf_err_handle(&cpu_notifier_err_inject.actions[0]); + else + return notf_err_handle(&cpu_notifier_err_inject.actions[1]); +} + +static int notf_err_inj_dead(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) + return notf_err_handle(&cpu_notifier_err_inject.actions[2]); + else + return notf_err_handle(&cpu_notifier_err_inject.actions[3]); +} + static struct dentry *dir; static int err_inject_init(void) @@ -29,7 +60,10 @@ static int err_inject_init(void) if (IS_ERR(dir)) return PTR_ERR(dir); - err = register_hotcpu_notifier(&cpu_notifier_err_inject.nb); + err = cpuhp_setup_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE, + "cpu-err-notif:prepare", + notf_err_inj_up_prepare, + notf_err_inj_dead); if (err) debugfs_remove_recursive(dir); @@ -38,7 +72,7 @@ static int err_inject_init(void) static void err_inject_exit(void) { - unregister_hotcpu_notifier(&cpu_notifier_err_inject.nb); + cpuhp_remove_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE); debugfs_remove_recursive(dir); } diff --git a/lib/crc32.c b/lib/crc32.c index 9a907d489d95..7fbd1a112b9d 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -979,7 +979,6 @@ static int __init crc32c_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -999,20 +998,17 @@ static int __init crc32c_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc32c_le != __crc32c_le(test[i].crc, test_buf + test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32c: CRC_LE_BITS = %d\n", CRC_LE_BITS); if (errors) @@ -1065,7 +1061,6 @@ static int __init crc32_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -1088,7 +1083,7 @@ static int __init crc32_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc_le != crc32_le(test[i].crc, test_buf + test[i].start, test[i].length)) @@ -1098,14 +1093,11 @@ static int __init crc32_test(void) test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32: CRC_LE_BITS = %d, CRC_BE BITS = %d\n", CRC_LE_BITS, CRC_BE_BITS); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 519b5a10fd70..a8e12601eb37 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg) * Try to repair the damage, so we have a better chance to get useful * debug output. */ -static int -debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), +static bool +debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state), void * addr, enum debug_obj_state state) { - int fixed = 0; - - if (fixup) - fixed = fixup(addr, state); - debug_objects_fixups += fixed; - return fixed; + if (fixup && fixup(addr, state)) { + debug_objects_fixups++; + return true; + } + return false; } static void debug_object_is_on_stack(void *addr, int onstack) @@ -416,7 +415,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); ret = debug_object_fixup(descr->fixup_activate, addr, state); - return ret ? -EINVAL : 0; + return ret ? 0 : -EINVAL; case ODEBUG_STATE_DESTROYED: debug_print_object(obj, "activate"); @@ -432,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * This happens when a static object is activated. We - * let the type specific code decide whether this is - * true or not. + * We are here when a static object is activated. We + * let the type specific code confirm whether this is + * true or not. if true, we just make sure that the + * static object is tracked in the object tracker. If + * not, this must be a bug, so we try to fix it up. */ - if (debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE)) { + if (descr->is_static_object && descr->is_static_object(addr)) { + /* track this static object */ + debug_object_init(addr, descr); + debug_object_activate(addr, descr); + } else { debug_print_object(&o, "activate"); - return -EINVAL; + ret = debug_object_fixup(descr->fixup_activate, addr, + ODEBUG_STATE_NOTAVAILABLE); + return ret ? 0 : -EINVAL; } return 0; } @@ -603,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * Maybe the object is static. Let the type specific - * code decide what to do. + * Maybe the object is static, and we let the type specific + * code confirm. Track this static object if true, else invoke + * fixup. */ - if (debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE)) + if (descr->is_static_object && descr->is_static_object(addr)) { + /* Track this static object */ + debug_object_init(addr, descr); + } else { debug_print_object(&o, "assert_init"); + debug_object_fixup(descr->fixup_assert_init, addr, + ODEBUG_STATE_NOTAVAILABLE); + } return; } @@ -793,11 +805,18 @@ struct self_test { static __initdata struct debug_obj_descr descr_type_test; +static bool __init is_static_object(void *addr) +{ + struct self_test *obj = addr; + + return obj->static_init; +} + /* * fixup_init is called when: * - an active object is initialized */ -static int __init fixup_init(void *addr, enum debug_obj_state state) +static bool __init fixup_init(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -805,37 +824,31 @@ static int __init fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_init(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ -static int __init fixup_activate(void *addr, enum debug_obj_state state) +static bool __init fixup_activate(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (obj->static_init == 1) { - debug_object_init(obj, &descr_type_test); - debug_object_activate(obj, &descr_type_test); - return 0; - } - return 1; - + return true; case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -843,7 +856,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state) * fixup_destroy is called when: * - an active object is destroyed */ -static int __init fixup_destroy(void *addr, enum debug_obj_state state) +static bool __init fixup_destroy(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -851,9 +864,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_destroy(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -861,7 +874,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int __init fixup_free(void *addr, enum debug_obj_state state) +static bool __init fixup_free(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -869,9 +882,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_free(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -917,6 +930,7 @@ out: static __initdata struct debug_obj_descr descr_type_test = { .name = "selftest", + .is_static_object = is_static_object, .fixup_init = fixup_init, .fixup_activate = fixup_activate, .fixup_destroy = fixup_destroy, diff --git a/lib/digsig.c b/lib/digsig.c index 07be6c1ef4e2..55b8b2f41a9e 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -104,21 +104,25 @@ static int digsig_verify_rsa(struct key *key, datap = pkh->mpi; endp = ukp->data + ukp->datalen; - err = -ENOMEM; - for (i = 0; i < pkh->nmpi; i++) { unsigned int remaining = endp - datap; pkey[i] = mpi_read_from_buffer(datap, &remaining); - if (!pkey[i]) + if (IS_ERR(pkey[i])) { + err = PTR_ERR(pkey[i]); goto err; + } datap += remaining; } mblen = mpi_get_nbits(pkey[0]); mlen = DIV_ROUND_UP(mblen, 8); - if (mlen == 0) + if (mlen == 0) { + err = -EINVAL; goto err; + } + + err = -ENOMEM; out1 = kzalloc(mlen, GFP_KERNEL); if (!out1) @@ -126,8 +130,10 @@ static int digsig_verify_rsa(struct key *key, nret = siglen; in = mpi_read_from_buffer(sig, &nret); - if (!in) + if (IS_ERR(in)) { + err = PTR_ERR(in); goto err; + } res = mpi_alloc(mpi_get_nlimbs(in) * 2); if (!res) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 4a1515f4b452..8971370bfb16 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -22,6 +22,7 @@ #include <linux/stacktrace.h> #include <linux/dma-debug.h> #include <linux/spinlock.h> +#include <linux/vmalloc.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -43,6 +44,7 @@ enum { dma_debug_page, dma_debug_sg, dma_debug_coherent, + dma_debug_resource, }; enum map_err_types { @@ -150,8 +152,9 @@ static const char *const maperr2str[] = { [MAP_ERR_CHECKED] = "dma map error checked", }; -static const char *type2name[4] = { "single", "page", - "scather-gather", "coherent" }; +static const char *type2name[5] = { "single", "page", + "scather-gather", "coherent", + "resource" }; static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; @@ -253,6 +256,7 @@ static int hash_fn(struct dma_debug_entry *entry) */ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, unsigned long *flags) + __acquires(&dma_entry_hash[idx].lock) { int idx = hash_fn(entry); unsigned long __flags; @@ -267,6 +271,7 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, */ static void put_hash_bucket(struct hash_bucket *bucket, unsigned long *flags) + __releases(&bucket->lock) { unsigned long __flags = *flags; @@ -397,6 +402,9 @@ static void hash_bucket_del(struct dma_debug_entry *entry) static unsigned long long phys_addr(struct dma_debug_entry *entry) { + if (entry->type == dma_debug_resource) + return __pfn_to_phys(entry->pfn) + entry->offset; + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; } @@ -657,9 +665,9 @@ static struct dma_debug_entry *dma_entry_alloc(void) spin_lock_irqsave(&free_entries_lock, flags); if (list_empty(&free_entries)) { - pr_err("DMA-API: debugging out of memory - disabling\n"); global_disable = true; spin_unlock_irqrestore(&free_entries_lock, flags); + pr_err("DMA-API: debugging out of memory - disabling\n"); return NULL; } @@ -1162,11 +1170,32 @@ static void check_unmap(struct dma_debug_entry *ref) put_hash_bucket(bucket, &flags); } -static void check_for_stack(struct device *dev, void *addr) +static void check_for_stack(struct device *dev, + struct page *page, size_t offset) { - if (object_is_on_stack(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from " - "stack [addr=%p]\n", addr); + void *addr; + struct vm_struct *stack_vm_area = task_stack_vm_area(current); + + if (!stack_vm_area) { + /* Stack is direct-mapped. */ + if (PageHighMem(page)) + return; + addr = page_address(page) + offset; + if (object_is_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + } else { + /* Stack is vmalloced. */ + int i; + + for (i = 0; i < stack_vm_area->nr_pages; i++) { + if (page != stack_vm_area->pages[i]) + continue; + + addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + break; + } + } } static inline bool overlap(void *addr, unsigned long len, void *start, void *end) @@ -1289,10 +1318,11 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, if (map_single) entry->type = dma_debug_single; + check_for_stack(dev, page, offset); + if (!PageHighMem(page)) { void *addr = page_address(page) + offset; - check_for_stack(dev, addr); check_for_illegal_area(dev, addr, size); } @@ -1384,8 +1414,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; + check_for_stack(dev, sg_page(s), s->offset); + if (!PageHighMem(sg_page(s))) { - check_for_stack(dev, sg_virt(s)); check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); } @@ -1493,6 +1524,49 @@ void debug_dma_free_coherent(struct device *dev, size_t size, } EXPORT_SYMBOL(debug_dma_free_coherent); +void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, + int direction, dma_addr_t dma_addr) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_resource; + entry->dev = dev; + entry->pfn = PHYS_PFN(addr); + entry->offset = offset_in_page(addr); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_resource); + +void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + struct dma_debug_entry ref = { + .type = dma_debug_resource, + .dev = dev, + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_resource); + void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/lib/dma-noop.c b/lib/dma-noop.c index 72145646857e..3d766e78fbe2 100644 --- a/lib/dma-noop.c +++ b/lib/dma-noop.c @@ -10,7 +10,7 @@ static void *dma_noop_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, - struct dma_attrs *attrs) + unsigned long attrs) { void *ret; @@ -22,7 +22,7 @@ static void *dma_noop_alloc(struct device *dev, size_t size, static void dma_noop_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, - struct dma_attrs *attrs) + unsigned long attrs) { free_pages((unsigned long)cpu_addr, get_order(size)); } @@ -30,13 +30,14 @@ static void dma_noop_free(struct device *dev, size_t size, static dma_addr_t dma_noop_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { return page_to_phys(page) + offset; } static int dma_noop_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, + unsigned long attrs) { int i; struct scatterlist *sg; diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index fe42b6ec3f0c..da796e2dc4f5 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -188,6 +188,13 @@ static int ddebug_change(const struct ddebug_query *query, newflags = (dp->flags & mask) | flags; if (newflags == dp->flags) continue; +#ifdef HAVE_JUMP_LABEL + if (dp->flags & _DPRINTK_FLAGS_PRINT) { + if (!(flags & _DPRINTK_FLAGS_PRINT)) + static_branch_disable(&dp->key.dd_key_true); + } else if (flags & _DPRINTK_FLAGS_PRINT) + static_branch_enable(&dp->key.dd_key_true); +#endif dp->flags = newflags; vpr_info("changed %s:%d [%s]%s =%s\n", trim_prefix(dp->filename), dp->lineno, diff --git a/lib/earlycpio.c b/lib/earlycpio.c index 3eb3e4722b8e..db283ba4d2c1 100644 --- a/lib/earlycpio.c +++ b/lib/earlycpio.c @@ -125,7 +125,10 @@ struct cpio_data find_cpio_data(const char *path, void *data, if ((ch[C_MODE] & 0170000) == 0100000 && ch[C_NAMESIZE] >= mypathsize && !memcmp(p, path, mypathsize)) { - *nextoff = (long)nptr - (long)data; + + if (nextoff) + *nextoff = (long)nptr - (long)data; + if (ch[C_NAMESIZE] - mypathsize >= MAX_CPIO_FILE_NAME) { pr_warn( "File %s exceeding MAX_CPIO_FILE_NAME [%d]\n", diff --git a/lib/gcd.c b/lib/gcd.c index 3657f129d7b8..135ee6407a5e 100644 --- a/lib/gcd.c +++ b/lib/gcd.c @@ -2,20 +2,77 @@ #include <linux/gcd.h> #include <linux/export.h> -/* Greatest common divisor */ +/* + * This implements the binary GCD algorithm. (Often attributed to Stein, + * but as Knuth has noted, appears in a first-century Chinese math text.) + * + * This is faster than the division-based algorithm even on x86, which + * has decent hardware division. + */ + +#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS) + +/* If __ffs is available, the even/odd algorithm benchmarks slower. */ unsigned long gcd(unsigned long a, unsigned long b) { - unsigned long r; + unsigned long r = a | b; + + if (!a || !b) + return r; - if (a < b) - swap(a, b); + b >>= __ffs(b); + if (b == 1) + return r & -r; - if (!b) - return a; - while ((r = a % b) != 0) { - a = b; - b = r; + for (;;) { + a >>= __ffs(a); + if (a == 1) + return r & -r; + if (a == b) + return a << __ffs(r); + + if (a < b) + swap(a, b); + a -= b; } - return b; } + +#else + +/* If normalization is done by loops, the even/odd algorithm is a win. */ +unsigned long gcd(unsigned long a, unsigned long b) +{ + unsigned long r = a | b; + + if (!a || !b) + return r; + + /* Isolate lsbit of r */ + r &= -r; + + while (!(b & r)) + b >>= 1; + if (b == r) + return r; + + for (;;) { + while (!(a & r)) + a >>= 1; + if (a == r) + return r; + if (a == b) + return a; + + if (a < b) + swap(a, b); + a -= b; + a >>= 1; + if (a & r) + a += b; + a >>= 1; + } +} + +#endif + EXPORT_SYMBOL_GPL(gcd); diff --git a/lib/hweight.c b/lib/hweight.c index 9a5c1f221558..43273a7d83cf 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,6 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); +#endif unsigned int __sw_hweight16(unsigned int w) { @@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); +#endif diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index c27e269210c4..a816f3a80625 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -29,8 +29,7 @@ again: index = bitmap_find_next_zero_area(map, size, start, nr, align_mask); if (index < size) { if (iommu_is_span_boundary(index, nr, shift, boundary_size)) { - /* we could do more effectively */ - start = index + 1; + start = ALIGN(shift + index, boundary_size) - shift; goto again; } bitmap_set(map, index, nr); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index ca5316e0087b..7e3138cfc8c9 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -56,37 +56,24 @@ n = wanted; \ } -#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->bvec; \ - __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ - if (likely(__v.bv_len)) { \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset + skip; \ - (void)(STEP); \ - skip += __v.bv_len; \ - n -= __v.bv_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.bv_len = min_t(size_t, n, __p->bv_len); \ - if (unlikely(!__v.bv_len)) \ +#define iterate_bvec(i, n, __v, __bi, skip, STEP) { \ + struct bvec_iter __start; \ + __start.bi_size = n; \ + __start.bi_bvec_done = skip; \ + __start.bi_idx = 0; \ + for_each_bvec(__v, i->bvec, __bi, __start) { \ + if (!__v.bv_len) \ continue; \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset; \ (void)(STEP); \ - skip = __v.bv_len; \ - n -= __v.bv_len; \ } \ - n = wanted; \ } #define iterate_all_kinds(i, n, v, I, B, K) { \ size_t skip = i->iov_offset; \ if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ } else if (unlikely(i->type & ITER_KVEC)) { \ const struct kvec *kvec; \ struct kvec v; \ @@ -99,40 +86,42 @@ } #define iterate_and_advance(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ - struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - if (skip == bvec->bv_len) { \ - bvec++; \ - skip = 0; \ - } \ - i->nr_segs -= bvec - i->bvec; \ - i->bvec = bvec; \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - if (skip == kvec->iov_len) { \ - kvec++; \ - skip = 0; \ - } \ - i->nr_segs -= kvec - i->kvec; \ - i->kvec = kvec; \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ - if (skip == iov->iov_len) { \ - iov++; \ - skip = 0; \ + if (unlikely(i->count < n)) \ + n = i->count; \ + if (i->count) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec = i->bvec; \ + struct bio_vec v; \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ + i->bvec = __bvec_iter_bvec(i->bvec, __bi); \ + i->nr_segs -= i->bvec - bvec; \ + skip = __bi.bi_bvec_done; \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + if (skip == kvec->iov_len) { \ + kvec++; \ + skip = 0; \ + } \ + i->nr_segs -= kvec - i->kvec; \ + i->kvec = kvec; \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + if (skip == iov->iov_len) { \ + iov++; \ + skip = 0; \ + } \ + i->nr_segs -= iov - i->iov; \ + i->iov = iov; \ } \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ + i->count -= n; \ + i->iov_offset = skip; \ } \ - i->count -= n; \ - i->iov_offset = skip; \ } static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, @@ -155,7 +144,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b buf = iov->iov_base + skip; copy = min(bytes, iov->iov_len - skip); - if (!fault_in_pages_writeable(buf, copy)) { + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { kaddr = kmap_atomic(page); from = kaddr + offset; @@ -186,6 +175,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b copy = min(bytes, iov->iov_len - skip); } /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); from = kaddr + offset; left = __copy_to_user(buf, from, copy); @@ -204,6 +194,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b bytes -= copy; } kunmap(page); + done: if (skip == iov->iov_len) { iov++; @@ -236,7 +227,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t buf = iov->iov_base + skip; copy = min(bytes, iov->iov_len - skip); - if (!fault_in_pages_readable(buf, copy)) { + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) { kaddr = kmap_atomic(page); to = kaddr + offset; @@ -267,6 +258,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t copy = min(bytes, iov->iov_len - skip); } /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); to = kaddr + offset; left = __copy_from_user(to, buf, copy); @@ -285,6 +277,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes -= copy; } kunmap(page); + done: if (skip == iov->iov_len) { iov++; @@ -298,33 +291,13 @@ done: } /* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - if (!(i->type & (ITER_BVEC|ITER_KVEC))) { - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); - } - return 0; -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -/* * Fault in one or more iovecs of the given iov_iter, to a maximum length of * bytes. For each iovec, fault in each page that constitutes the iovec. * * Return 0 on success, or non-zero if the memory could not be accessed (i.e. * because it is an invalid address). */ -int iov_iter_fault_in_multipages_readable(struct iov_iter *i, size_t bytes) +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { size_t skip = i->iov_offset; const struct iovec *iov; @@ -341,7 +314,7 @@ int iov_iter_fault_in_multipages_readable(struct iov_iter *i, size_t bytes) } return 0; } -EXPORT_SYMBOL(iov_iter_fault_in_multipages_readable); +EXPORT_SYMBOL(iov_iter_fault_in_readable); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, @@ -386,12 +359,6 @@ static void memzero_page(struct page *page, size_t offset, size_t len) size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - iterate_and_advance(i, bytes, v, __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), @@ -407,12 +374,6 @@ EXPORT_SYMBOL(copy_to_iter); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - iterate_and_advance(i, bytes, v, __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -428,12 +389,6 @@ EXPORT_SYMBOL(copy_from_iter); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - iterate_and_advance(i, bytes, v, __copy_from_user_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -474,12 +429,6 @@ EXPORT_SYMBOL(copy_page_from_iter); size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - iterate_and_advance(i, bytes, v, __clear_user(v.iov_base, v.iov_len), memzero_page(v.bv_page, v.bv_offset, v.bv_len), @@ -685,12 +634,6 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, char *to = addr; __wsum sum, next; size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - sum = *csum; iterate_and_advance(i, bytes, v, ({ int err = 0; @@ -729,12 +672,6 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, const char *from = addr; __wsum sum, next; size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - sum = *csum; iterate_and_advance(i, bytes, v, ({ int err = 0; diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 836f7db4e548..2be55692aa43 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -184,30 +184,21 @@ void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn) } EXPORT_SYMBOL(irq_poll_init); -static int irq_poll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int irq_poll_cpu_dead(unsigned int cpu) { /* * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); - local_irq_enable(); - } + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + this_cpu_ptr(&blk_cpu_iopoll)); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_enable(); - return NOTIFY_OK; + return 0; } -static struct notifier_block irq_poll_cpu_notifier = { - .notifier_call = irq_poll_cpu_notify, -}; - static __init int irq_poll_setup(void) { int i; @@ -216,7 +207,8 @@ static __init int irq_poll_setup(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq); - register_hotcpu_notifier(&irq_poll_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_IRQ_POLL_DEAD, "irq_poll:dead", NULL, + irq_poll_cpu_dead); return 0; } subsys_initcall(irq_poll_setup); diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index 747606f9e4a3..5a0f75a3bf01 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -21,6 +21,7 @@ #include <linux/bitops.h> #include <linux/count_zeros.h> #include <linux/byteorder/generic.h> +#include <linux/scatterlist.h> #include <linux/string.h> #include "mpi-internal.h" @@ -50,9 +51,7 @@ MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes) return NULL; } if (nbytes > 0) - nbits -= count_leading_zeros(buffer[0]); - else - nbits = 0; + nbits -= count_leading_zeros(buffer[0]) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); @@ -82,50 +81,30 @@ EXPORT_SYMBOL_GPL(mpi_read_raw_data); MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) { const uint8_t *buffer = xbuffer; - int i, j; - unsigned nbits, nbytes, nlimbs, nread = 0; - mpi_limb_t a; - MPI val = NULL; + unsigned int nbits, nbytes; + MPI val; if (*ret_nread < 2) - goto leave; + return ERR_PTR(-EINVAL); nbits = buffer[0] << 8 | buffer[1]; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); - goto leave; + return ERR_PTR(-EINVAL); } - buffer += 2; - nread = 2; nbytes = DIV_ROUND_UP(nbits, 8); - nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); - val = mpi_alloc(nlimbs); - if (!val) - return NULL; - i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; - i %= BYTES_PER_MPI_LIMB; - val->nbits = nbits; - j = val->nlimbs = nlimbs; - val->sign = 0; - for (; j > 0; j--) { - a = 0; - for (; i < BYTES_PER_MPI_LIMB; i++) { - if (++nread > *ret_nread) { - printk - ("MPI: mpi larger than buffer nread=%d ret_nread=%d\n", - nread, *ret_nread); - goto leave; - } - a <<= 8; - a |= *buffer++; - } - i = 0; - val->d[j - 1] = a; + if (nbytes + 2 > *ret_nread) { + pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n", + nbytes, *ret_nread); + return ERR_PTR(-EINVAL); } -leave: - *ret_nread = nread; + val = mpi_read_raw_data(buffer + 2, nbytes); + if (!val) + return ERR_PTR(-ENOMEM); + + *ret_nread = nbytes + 2; return val; } EXPORT_SYMBOL_GPL(mpi_read_from_buffer); @@ -250,82 +229,6 @@ void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign) } EXPORT_SYMBOL_GPL(mpi_get_buffer); -/**************** - * Use BUFFER to update MPI. - */ -int mpi_set_buffer(MPI a, const void *xbuffer, unsigned nbytes, int sign) -{ - const uint8_t *buffer = xbuffer, *p; - mpi_limb_t alimb; - int nlimbs; - int i; - - nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); - if (RESIZE_IF_NEEDED(a, nlimbs) < 0) - return -ENOMEM; - a->sign = sign; - - for (i = 0, p = buffer + nbytes - 1; p >= buffer + BYTES_PER_MPI_LIMB;) { -#if BYTES_PER_MPI_LIMB == 4 - alimb = (mpi_limb_t) *p--; - alimb |= (mpi_limb_t) *p-- << 8; - alimb |= (mpi_limb_t) *p-- << 16; - alimb |= (mpi_limb_t) *p-- << 24; -#elif BYTES_PER_MPI_LIMB == 8 - alimb = (mpi_limb_t) *p--; - alimb |= (mpi_limb_t) *p-- << 8; - alimb |= (mpi_limb_t) *p-- << 16; - alimb |= (mpi_limb_t) *p-- << 24; - alimb |= (mpi_limb_t) *p-- << 32; - alimb |= (mpi_limb_t) *p-- << 40; - alimb |= (mpi_limb_t) *p-- << 48; - alimb |= (mpi_limb_t) *p-- << 56; -#else -#error please implement for this limb size. -#endif - a->d[i++] = alimb; - } - if (p >= buffer) { -#if BYTES_PER_MPI_LIMB == 4 - alimb = *p--; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 8; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 16; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 24; -#elif BYTES_PER_MPI_LIMB == 8 - alimb = (mpi_limb_t) *p--; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 8; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 16; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 24; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 32; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 40; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 48; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 56; -#else -#error please implement for this limb size. -#endif - a->d[i++] = alimb; - } - a->nlimbs = i; - - if (i != nlimbs) { - pr_emerg("MPI: mpi_set_buffer: Assertion failed (%d != %d)", i, - nlimbs); - BUG(); - } - return 0; -} -EXPORT_SYMBOL_GPL(mpi_set_buffer); - /** * mpi_write_to_sgl() - Funnction exports MPI to an sgl (msb first) * @@ -335,16 +238,13 @@ EXPORT_SYMBOL_GPL(mpi_set_buffer); * @a: a multi precision integer * @sgl: scatterlist to write to. Needs to be at least * mpi_get_size(a) long. - * @nbytes: in/out param - it has the be set to the maximum number of - * bytes that can be written to sgl. This has to be at least - * the size of the integer a. On return it receives the actual - * length of the data written on success or the data that would - * be written if buffer was too small. + * @nbytes: the number of bytes to write. Leading bytes will be + * filled with zero. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ -int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, +int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned nbytes, int *sign) { u8 *p, *p2; @@ -356,55 +256,60 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); - int i, x, y = 0, lzeros, buf_len; - - if (!nbytes) - return -EINVAL; + struct sg_mapping_iter miter; + int i, x, buf_len; + int nents; if (sign) *sign = a->sign; - lzeros = count_lzeros(a); - - if (*nbytes < n - lzeros) { - *nbytes = n - lzeros; + if (nbytes < n) return -EOVERFLOW; - } - *nbytes = n - lzeros; - buf_len = sgl->length; - p2 = sg_virt(sgl); + nents = sg_nents_for_len(sgl, nbytes); + if (nents < 0) + return -EINVAL; - for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, - lzeros %= BYTES_PER_MPI_LIMB; - i >= 0; i--) { + sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_TO_SG); + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + + while (nbytes > n) { + i = min_t(unsigned, nbytes - n, buf_len); + memset(p2, 0, i); + p2 += i; + nbytes -= i; + + buf_len -= i; + if (!buf_len) { + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + } + } + + for (i = a->nlimbs - 1; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 - alimb = cpu_to_be32(a->d[i]); + alimb = a->d[i] ? cpu_to_be32(a->d[i]) : 0; #elif BYTES_PER_MPI_LIMB == 8 - alimb = cpu_to_be64(a->d[i]); + alimb = a->d[i] ? cpu_to_be64(a->d[i]) : 0; #else #error please implement for this limb size. #endif - if (lzeros) { - y = lzeros; - lzeros = 0; - } - - p = (u8 *)&alimb + y; + p = (u8 *)&alimb; - for (x = 0; x < sizeof(alimb) - y; x++) { - if (!buf_len) { - sgl = sg_next(sgl); - if (!sgl) - return -EINVAL; - buf_len = sgl->length; - p2 = sg_virt(sgl); - } + for (x = 0; x < sizeof(alimb); x++) { *p2++ = *p++; - buf_len--; + if (!--buf_len) { + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + } } - y = 0; } + + sg_miter_stop(&miter); return 0; } EXPORT_SYMBOL_GPL(mpi_write_to_sgl); @@ -424,19 +329,23 @@ EXPORT_SYMBOL_GPL(mpi_write_to_sgl); */ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) { - struct scatterlist *sg; - int x, i, j, z, lzeros, ents; + struct sg_mapping_iter miter; unsigned int nbits, nlimbs; + int x, j, z, lzeros, ents; + unsigned int len; + const u8 *buff; mpi_limb_t a; MPI val = NULL; - lzeros = 0; - ents = sg_nents(sgl); + ents = sg_nents_for_len(sgl, nbytes); + if (ents < 0) + return NULL; - for_each_sg(sgl, sg, ents, i) { - const u8 *buff = sg_virt(sg); - int len = sg->length; + sg_miter_start(&miter, sgl, ents, SG_MITER_ATOMIC | SG_MITER_FROM_SG); + lzeros = 0; + len = 0; + while (nbytes > 0) { while (len && !*buff) { lzeros++; len--; @@ -446,12 +355,17 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) if (len && *buff) break; - ents--; + sg_miter_next(&miter); + buff = miter.addr; + len = miter.length; + nbytes -= lzeros; lzeros = 0; } - sgl = sg; + miter.consumed = lzeros; + sg_miter_stop(&miter); + nbytes -= lzeros; nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { @@ -460,8 +374,7 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) } if (nbytes > 0) - nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros)) - - (BITS_PER_LONG - 8); + nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); @@ -480,21 +393,21 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; z %= BYTES_PER_MPI_LIMB; - for_each_sg(sgl, sg, ents, i) { - const u8 *buffer = sg_virt(sg) + lzeros; - int len = sg->length - lzeros; + while (sg_miter_next(&miter)) { + buff = miter.addr; + len = miter.length; for (x = 0; x < len; x++) { a <<= 8; - a |= *buffer++; + a |= *buff++; if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) { val->d[j--] = a; a = 0; } } z += x; - lzeros = 0; } + return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl); diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 6019c53c669e..26caf51cc238 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,33 +16,14 @@ #include <linux/delay.h> #include <linux/kprobes.h> #include <linux/nmi.h> -#include <linux/seq_buf.h> #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) -{ - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); -} - /* * When raise() is called it will be is passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() @@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end) void nmi_trigger_all_cpu_backtrace(bool include_self, void (*raise)(cpumask_t *mask)) { - struct nmi_seq_buf *s; - int i, cpu, this_cpu = get_cpu(); + int i, this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, if (!include_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("Sending NMI to %s CPUs:\n", (include_self ? "all" : "other")); @@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, } /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. */ - for_each_cpu(cpu, &printtrace_mask) { - int len, last_i = 0; + printk_nmi_flush(); - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; -} - bool nmi_cpu_backtrace(struct pt_regs *regs) { int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); pr_warn("NMI backtrace for cpu %d\n", cpu); if (regs) show_regs(regs); else dump_stack(); - this_cpu_write(printk_func, printk_func_save); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } diff --git a/lib/nodemask.c b/lib/nodemask.c new file mode 100644 index 000000000000..e42a5bf44d33 --- /dev/null +++ b/lib/nodemask.c @@ -0,0 +1,30 @@ +#include <linux/nodemask.h> +#include <linux/module.h> +#include <linux/random.h> + +int __next_node_in(int node, const nodemask_t *srcp) +{ + int ret = __next_node(node, srcp); + + if (ret == MAX_NUMNODES) + ret = __first_node(srcp); + return ret; +} +EXPORT_SYMBOL(__next_node_in); + +#ifdef CONFIG_NUMA +/* + * Return the bit number of a random bit set in the nodemask. + * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = NUMA_NO_NODE; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} +#endif diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index f051d69f0910..72d36113ccaa 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -19,7 +19,7 @@ static DEFINE_SPINLOCK(percpu_counters_lock); static struct debug_obj_descr percpu_counter_debug_descr; -static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) +static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; @@ -27,9 +27,9 @@ static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); - return 1; + return true; default: - return 0; + return false; } } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 1624c4117961..8e6d552c40dd 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -4,6 +4,8 @@ * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov + * Copyright (C) 2016 Intel, Matthew Wilcox + * Copyright (C) 2016 Intel, Ross Zwisler * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -36,11 +38,8 @@ #include <linux/preempt.h> /* in_interrupt() */ -/* - * The height_to_maxindex array needs to be one deeper than the maximum - * path as height 0 holds only 1 entry. - */ -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; +/* Number of nodes in fully populated tree of given height */ +static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; /* * Radix tree node cache. @@ -64,20 +63,58 @@ static struct kmem_cache *radix_tree_node_cachep; * Per-cpu pool of preloaded nodes */ struct radix_tree_preload { - int nr; + unsigned nr; /* nodes->private_data points to next preallocated node */ struct radix_tree_node *nodes; }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; -static inline void *ptr_to_indirect(void *ptr) +static inline void *node_to_entry(void *ptr) { - return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); + return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } -static inline void *indirect_to_ptr(void *ptr) +#define RADIX_TREE_RETRY node_to_entry(NULL) + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/* Sibling slots point directly to another slot in the same node */ +static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); + void **ptr = node; + return (parent->slots <= ptr) && + (ptr < parent->slots + RADIX_TREE_MAP_SIZE); +} +#else +static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node) +{ + return false; +} +#endif + +static inline unsigned long get_slot_offset(struct radix_tree_node *parent, + void **slot) +{ + return slot - parent->slots; +} + +static unsigned int radix_tree_descend(struct radix_tree_node *parent, + struct radix_tree_node **nodep, unsigned long index) +{ + unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; + void **entry = rcu_dereference_raw(parent->slots[offset]); + +#ifdef CONFIG_RADIX_TREE_MULTIORDER + if (radix_tree_is_internal_node(entry)) { + if (is_sibling_entry(parent, entry)) { + void **sibentry = (void **) entry_to_node(entry); + offset = get_slot_offset(parent, sibentry); + entry = rcu_dereference_raw(*sibentry); + } + } +#endif + + *nodep = (void *)entry; + return offset; } static inline gfp_t root_gfp_mask(struct radix_tree_root *root) @@ -108,7 +145,7 @@ static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); } -static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) +static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); } @@ -120,7 +157,12 @@ static inline void root_tag_clear_all(struct radix_tree_root *root) static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) { - return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); + return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline unsigned root_tags_get(struct radix_tree_root *root) +{ + return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT; } /* @@ -129,7 +171,7 @@ static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) */ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) { - int idx; + unsigned idx; for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { if (node->tags[tag][idx]) return 1; @@ -173,38 +215,45 @@ radix_tree_find_next_bit(const unsigned long *addr, return size; } -#if 0 -static void dump_node(void *slot, int height, int offset) +#ifndef __KERNEL__ +static void dump_node(struct radix_tree_node *node, unsigned long index) { - struct radix_tree_node *node; - int i; + unsigned long i; - if (!slot) - return; + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", + node, node->offset, + node->tags[0][0], node->tags[1][0], node->tags[2][0], + node->shift, node->count, node->parent); - if (height == 0) { - pr_debug("radix entry %p offset %d\n", slot, offset); - return; + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + unsigned long first = index | (i << node->shift); + unsigned long last = first | ((1UL << node->shift) - 1); + void *entry = node->slots[i]; + if (!entry) + continue; + if (is_sibling_entry(node, entry)) { + pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", + entry, i, + *(void **)entry_to_node(entry), + first, last); + } else if (!radix_tree_is_internal_node(entry)) { + pr_debug("radix entry %p offset %ld indices %ld-%ld\n", + entry, i, first, last); + } else { + dump_node(entry_to_node(entry), first); + } } - - node = indirect_to_ptr(slot); - pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n", - slot, offset, node->tags[0][0], node->tags[1][0], - node->tags[2][0], node->path, node->count, node->parent); - - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) - dump_node(node->slots[i], height - 1, i); } /* For debug */ static void radix_tree_dump(struct radix_tree_root *root) { - pr_debug("radix root: %p height %d rnode %p tags %x\n", - root, root->height, root->rnode, + pr_debug("radix root: %p rnode %p tags %x\n", + root, root->rnode, root->gfp_mask >> __GFP_BITS_SHIFT); - if (!radix_tree_is_indirect_ptr(root->rnode)) + if (!radix_tree_is_internal_node(root->rnode)) return; - dump_node(root->rnode, root->height, 0); + dump_node(entry_to_node(root->rnode), 0); } #endif @@ -219,19 +268,20 @@ radix_tree_node_alloc(struct radix_tree_root *root) gfp_t gfp_mask = root_gfp_mask(root); /* - * Preload code isn't irq safe and it doesn't make sence to use - * preloading in the interrupt anyway as all the allocations have to - * be atomic. So just do normal allocation when in interrupt. + * Preload code isn't irq safe and it doesn't make sense to use + * preloading during an interrupt anyway as all the allocations have + * to be atomic. So just do normal allocation when in interrupt. */ if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* * Even if the caller has preloaded, try to allocate from the - * cache first for the new node to get accounted. + * cache first for the new node to get accounted to the memory + * cgroup. */ ret = kmem_cache_alloc(radix_tree_node_cachep, - gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN); + gfp_mask | __GFP_NOWARN); if (ret) goto out; @@ -254,10 +304,9 @@ radix_tree_node_alloc(struct radix_tree_root *root) kmemleak_update_trace(ret); goto out; } - ret = kmem_cache_alloc(radix_tree_node_cachep, - gfp_mask | __GFP_ACCOUNT); + ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: - BUG_ON(radix_tree_is_indirect_ptr(ret)); + BUG_ON(radix_tree_is_internal_node(ret)); return ret; } @@ -296,22 +345,28 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ -static int __radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask, int nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; + /* + * Nodes preloaded by one cgroup can be be used by another cgroup, so + * they should never be accounted to any particular memory cgroup. + */ + gfp_mask &= ~__GFP_ACCOUNT; + preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + while (rtp->nr < nr) { preempt_enable(); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + if (rtp->nr < nr) { node->private_data = rtp->nodes; rtp->nodes = node; rtp->nr++; @@ -337,7 +392,7 @@ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... */ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); - return __radix_tree_preload(gfp_mask); + return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); @@ -349,7 +404,7 @@ EXPORT_SYMBOL(radix_tree_preload); int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) - return __radix_tree_preload(gfp_mask); + return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ preempt_disable(); return 0; @@ -357,38 +412,103 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) EXPORT_SYMBOL(radix_tree_maybe_preload); /* - * Return the maximum key which can be store into a - * radix tree with height HEIGHT. + * The same as function above, but preload number of nodes required to insert + * (1 << order) continuous naturally-aligned elements. */ -static inline unsigned long radix_tree_maxindex(unsigned int height) +int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) { - return height_to_maxindex[height]; + unsigned long nr_subtrees; + int nr_nodes, subtree_height; + + /* Preloading doesn't help anything with this gfp mask, skip it */ + if (!gfpflags_allow_blocking(gfp_mask)) { + preempt_disable(); + return 0; + } + + /* + * Calculate number and height of fully populated subtrees it takes to + * store (1 << order) elements. + */ + nr_subtrees = 1 << order; + for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE; + subtree_height++) + nr_subtrees >>= RADIX_TREE_MAP_SHIFT; + + /* + * The worst case is zero height tree with a single item at index 0 and + * then inserting items starting at ULONG_MAX - (1 << order). + * + * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to + * 0-index item. + */ + nr_nodes = RADIX_TREE_MAX_PATH; + + /* Plus branch to fully populated subtrees. */ + nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; + + /* Root node is shared. */ + nr_nodes--; + + /* Plus nodes required to build subtrees. */ + nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height]; + + return __radix_tree_preload(gfp_mask, nr_nodes); +} + +/* + * The maximum index which can be stored in a radix tree + */ +static inline unsigned long shift_maxindex(unsigned int shift) +{ + return (RADIX_TREE_MAP_SIZE << shift) - 1; +} + +static inline unsigned long node_maxindex(struct radix_tree_node *node) +{ + return shift_maxindex(node->shift); +} + +static unsigned radix_tree_load_root(struct radix_tree_root *root, + struct radix_tree_node **nodep, unsigned long *maxindex) +{ + struct radix_tree_node *node = rcu_dereference_raw(root->rnode); + + *nodep = node; + + if (likely(radix_tree_is_internal_node(node))) { + node = entry_to_node(node); + *maxindex = node_maxindex(node); + return node->shift + RADIX_TREE_MAP_SHIFT; + } + + *maxindex = 0; + return 0; } /* * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, - unsigned long index, unsigned order) + unsigned long index, unsigned int shift) { - struct radix_tree_node *node; struct radix_tree_node *slot; - unsigned int height; + unsigned int maxshift; int tag; - /* Figure out what the height should be. */ - height = root->height + 1; - while (index > radix_tree_maxindex(height)) - height++; + /* Figure out what the shift should be. */ + maxshift = shift; + while (index > shift_maxindex(maxshift)) + maxshift += RADIX_TREE_MAP_SHIFT; - if ((root->rnode == NULL) && (order == 0)) { - root->height = height; + slot = root->rnode; + if (!slot) goto out; - } do { - unsigned int newheight; - if (!(node = radix_tree_node_alloc(root))) + struct radix_tree_node *node = radix_tree_node_alloc(root); + + if (!node) return -ENOMEM; /* Propagate the aggregated tag info into the new root */ @@ -397,25 +517,20 @@ static int radix_tree_extend(struct radix_tree_root *root, tag_set(node, tag, 0); } - /* Increase the height. */ - newheight = root->height+1; - BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK); - node->path = newheight; + BUG_ON(shift > BITS_PER_LONG); + node->shift = shift; + node->offset = 0; node->count = 1; node->parent = NULL; - slot = root->rnode; - if (radix_tree_is_indirect_ptr(slot) && newheight > 1) { - slot = indirect_to_ptr(slot); - slot->parent = node; - slot = ptr_to_indirect(slot); - } + if (radix_tree_is_internal_node(slot)) + entry_to_node(slot)->parent = node; node->slots[0] = slot; - node = ptr_to_indirect(node); - rcu_assign_pointer(root->rnode, node); - root->height = newheight; - } while (height > root->height); + slot = node_to_entry(node); + rcu_assign_pointer(root->rnode, slot); + shift += RADIX_TREE_MAP_SHIFT; + } while (shift <= maxshift); out: - return 0; + return maxshift + RADIX_TREE_MAP_SHIFT; } /** @@ -439,71 +554,70 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, unsigned order, struct radix_tree_node **nodep, void ***slotp) { - struct radix_tree_node *node = NULL, *slot; - unsigned int height, shift, offset; - int error; + struct radix_tree_node *node = NULL, *child; + void **slot = (void **)&root->rnode; + unsigned long maxindex; + unsigned int shift, offset = 0; + unsigned long max = index | ((1UL << order) - 1); - BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT)); + shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ - if (index > radix_tree_maxindex(root->height)) { - error = radix_tree_extend(root, index, order); - if (error) + if (max > maxindex) { + int error = radix_tree_extend(root, max, shift); + if (error < 0) return error; + shift = error; + child = root->rnode; + if (order == shift) + shift += RADIX_TREE_MAP_SHIFT; } - slot = root->rnode; - - height = root->height; - shift = height * RADIX_TREE_MAP_SHIFT; - - offset = 0; /* uninitialised var warning */ while (shift > order) { - if (slot == NULL) { + shift -= RADIX_TREE_MAP_SHIFT; + if (child == NULL) { /* Have to add a child node. */ - if (!(slot = radix_tree_node_alloc(root))) + child = radix_tree_node_alloc(root); + if (!child) return -ENOMEM; - slot->path = height; - slot->parent = node; - if (node) { - rcu_assign_pointer(node->slots[offset], - ptr_to_indirect(slot)); + child->shift = shift; + child->offset = offset; + child->parent = node; + rcu_assign_pointer(*slot, node_to_entry(child)); + if (node) node->count++; - slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT; - } else - rcu_assign_pointer(root->rnode, - ptr_to_indirect(slot)); - } else if (!radix_tree_is_indirect_ptr(slot)) + } else if (!radix_tree_is_internal_node(child)) break; /* Go a level down */ - height--; - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = indirect_to_ptr(slot); - slot = node->slots[offset]; + node = entry_to_node(child); + offset = radix_tree_descend(node, &child, index); + slot = &node->slots[offset]; } +#ifdef CONFIG_RADIX_TREE_MULTIORDER /* Insert pointers to the canonical entry */ - if ((shift - order) > 0) { - int i, n = 1 << (shift - order); + if (order > shift) { + unsigned i, n = 1 << (order - shift); offset = offset & ~(n - 1); - slot = ptr_to_indirect(&node->slots[offset]); + slot = &node->slots[offset]; + child = node_to_entry(slot); for (i = 0; i < n; i++) { - if (node->slots[offset + i]) + if (slot[i]) return -EEXIST; } for (i = 1; i < n; i++) { - rcu_assign_pointer(node->slots[offset + i], slot); + rcu_assign_pointer(slot[i], child); node->count++; } } +#endif if (nodep) *nodep = node; if (slotp) - *slotp = node ? node->slots + offset : (void **)&root->rnode; + *slotp = slot; return 0; } @@ -523,7 +637,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, void **slot; int error; - BUG_ON(radix_tree_is_indirect_ptr(item)); + BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, order, &node, &slot); if (error) @@ -533,12 +647,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, rcu_assign_pointer(*slot, item); if (node) { + unsigned offset = get_slot_offset(node, slot); node->count++; - BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK)); - BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK)); + BUG_ON(tag_get(node, 0, offset)); + BUG_ON(tag_get(node, 1, offset)); + BUG_ON(tag_get(node, 2, offset)); } else { - BUG_ON(root_tag_get(root, 0)); - BUG_ON(root_tag_get(root, 1)); + BUG_ON(root_tags_get(root)); } return 0; @@ -563,44 +678,25 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp) { struct radix_tree_node *node, *parent; - unsigned int height, shift; + unsigned long maxindex; void **slot; - node = rcu_dereference_raw(root->rnode); - if (node == NULL) + restart: + parent = NULL; + slot = (void **)&root->rnode; + radix_tree_load_root(root, &node, &maxindex); + if (index > maxindex) return NULL; - if (!radix_tree_is_indirect_ptr(node)) { - if (index > 0) - return NULL; + while (radix_tree_is_internal_node(node)) { + unsigned offset; - if (nodep) - *nodep = NULL; - if (slotp) - *slotp = (void **)&root->rnode; - return node; + if (node == RADIX_TREE_RETRY) + goto restart; + parent = entry_to_node(node); + offset = radix_tree_descend(parent, &node, index); + slot = parent->slots + offset; } - node = indirect_to_ptr(node); - - height = node->path & RADIX_TREE_HEIGHT_MASK; - if (index > radix_tree_maxindex(height)) - return NULL; - - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - do { - parent = node; - slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK); - node = rcu_dereference_raw(*slot); - if (node == NULL) - return NULL; - if (!radix_tree_is_indirect_ptr(node)) - break; - node = indirect_to_ptr(node); - - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } while (height > 0); if (nodep) *nodep = parent; @@ -654,59 +750,72 @@ EXPORT_SYMBOL(radix_tree_lookup); * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key - * @tag: tag index + * @tag: tag index * * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. From * the root all the way down to the leaf node. * - * Returns the address of the tagged item. Setting a tag on a not-present + * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - unsigned int height, shift; - struct radix_tree_node *slot; + struct radix_tree_node *node, *parent; + unsigned long maxindex; - height = root->height; - BUG_ON(index > radix_tree_maxindex(height)); + radix_tree_load_root(root, &node, &maxindex); + BUG_ON(index > maxindex); - slot = indirect_to_ptr(root->rnode); - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + while (radix_tree_is_internal_node(node)) { + unsigned offset; - while (height > 0) { - int offset; + parent = entry_to_node(node); + offset = radix_tree_descend(parent, &node, index); + BUG_ON(!node); - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - if (!tag_get(slot, tag, offset)) - tag_set(slot, tag, offset); - slot = slot->slots[offset]; - BUG_ON(slot == NULL); - if (!radix_tree_is_indirect_ptr(slot)) - break; - slot = indirect_to_ptr(slot); - shift -= RADIX_TREE_MAP_SHIFT; - height--; + if (!tag_get(parent, tag, offset)) + tag_set(parent, tag, offset); } /* set the root's tag bit */ - if (slot && !root_tag_get(root, tag)) + if (!root_tag_get(root, tag)) root_tag_set(root, tag); - return slot; + return node; } EXPORT_SYMBOL(radix_tree_tag_set); +static void node_tag_clear(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (!tag_get(node, tag, offset)) + return; + tag_clear(node, tag, offset); + if (any_tag_set(node, tag)) + return; + + offset = node->offset; + node = node->parent; + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root * @index: index key - * @tag: tag index + * @tag: tag index * * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) - * corresponding to @index in the radix tree. If - * this causes the leaf node to have no tags set then clear the tag in the + * corresponding to @index in the radix tree. If this causes + * the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * * Returns the address of the tagged item on success, else NULL. ie: @@ -715,52 +824,25 @@ EXPORT_SYMBOL(radix_tree_tag_set); void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - struct radix_tree_node *node = NULL; - struct radix_tree_node *slot = NULL; - unsigned int height, shift; + struct radix_tree_node *node, *parent; + unsigned long maxindex; int uninitialized_var(offset); - height = root->height; - if (index > radix_tree_maxindex(height)) - goto out; - - shift = height * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; + radix_tree_load_root(root, &node, &maxindex); + if (index > maxindex) + return NULL; - while (shift) { - if (slot == NULL) - goto out; - if (!radix_tree_is_indirect_ptr(slot)) - break; - slot = indirect_to_ptr(slot); + parent = NULL; - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = slot; - slot = slot->slots[offset]; + while (radix_tree_is_internal_node(node)) { + parent = entry_to_node(node); + offset = radix_tree_descend(parent, &node, index); } - if (slot == NULL) - goto out; + if (node) + node_tag_clear(root, parent, tag, offset); - while (node) { - if (!tag_get(node, tag, offset)) - goto out; - tag_clear(node, tag, offset); - if (any_tag_set(node, tag)) - goto out; - - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - node = node->parent; - } - - /* clear the root's tag bit */ - if (root_tag_get(root, tag)) - root_tag_clear(root, tag); - -out: - return slot; + return node; } EXPORT_SYMBOL(radix_tree_tag_clear); @@ -768,7 +850,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear); * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key - * @tag: tag index (< RADIX_TREE_MAX_TAGS) + * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * @@ -782,48 +864,44 @@ EXPORT_SYMBOL(radix_tree_tag_clear); int radix_tree_tag_get(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - unsigned int height, shift; - struct radix_tree_node *node; + struct radix_tree_node *node, *parent; + unsigned long maxindex; - /* check the root's tag bit */ if (!root_tag_get(root, tag)) return 0; - node = rcu_dereference_raw(root->rnode); - if (node == NULL) + radix_tree_load_root(root, &node, &maxindex); + if (index > maxindex) return 0; - - if (!radix_tree_is_indirect_ptr(node)) - return (index == 0); - node = indirect_to_ptr(node); - - height = node->path & RADIX_TREE_HEIGHT_MASK; - if (index > radix_tree_maxindex(height)) + if (node == NULL) return 0; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + while (radix_tree_is_internal_node(node)) { + unsigned offset; - for ( ; ; ) { - int offset; + parent = entry_to_node(node); + offset = radix_tree_descend(parent, &node, index); - if (node == NULL) + if (!node) return 0; - node = indirect_to_ptr(node); - - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - if (!tag_get(node, tag, offset)) + if (!tag_get(parent, tag, offset)) return 0; - if (height == 1) - return 1; - node = rcu_dereference_raw(node->slots[offset]); - if (!radix_tree_is_indirect_ptr(node)) - return 1; - shift -= RADIX_TREE_MAP_SHIFT; - height--; + if (node == RADIX_TREE_RETRY) + break; } + + return 1; } EXPORT_SYMBOL(radix_tree_tag_get); +static inline void __set_iter_shift(struct radix_tree_iter *iter, + unsigned int shift) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + iter->shift = shift; +#endif +} + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -835,9 +913,9 @@ EXPORT_SYMBOL(radix_tree_tag_get); void **radix_tree_next_chunk(struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags) { - unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; - struct radix_tree_node *rnode, *node; - unsigned long index, offset, height; + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *node, *child; + unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; @@ -855,33 +933,28 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (!index && iter->index) return NULL; - rnode = rcu_dereference_raw(root->rnode); - if (radix_tree_is_indirect_ptr(rnode)) { - rnode = indirect_to_ptr(rnode); - } else if (rnode && !index) { + restart: + radix_tree_load_root(root, &child, &maxindex); + if (index > maxindex) + return NULL; + if (!child) + return NULL; + + if (!radix_tree_is_internal_node(child)) { /* Single-slot tree */ - iter->index = 0; - iter->next_index = 1; + iter->index = index; + iter->next_index = maxindex + 1; iter->tags = 1; + __set_iter_shift(iter, 0); return (void **)&root->rnode; - } else - return NULL; - -restart: - height = rnode->path & RADIX_TREE_HEIGHT_MASK; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - offset = index >> shift; + } - /* Index outside of the tree */ - if (offset >= RADIX_TREE_MAP_SIZE) - return NULL; + do { + node = entry_to_node(child); + offset = radix_tree_descend(node, &child, index); - node = rnode; - while (1) { - struct radix_tree_node *slot; if ((flags & RADIX_TREE_ITER_TAGGED) ? - !test_bit(offset, node->tags[tag]) : - !node->slots[offset]) { + !tag_get(node, tag, offset) : !child) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; @@ -893,35 +966,30 @@ restart: offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { - if (node->slots[offset]) + void *slot = node->slots[offset]; + if (is_sibling_entry(node, slot)) + continue; + if (slot) break; } - index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); - index += offset << shift; + index &= ~node_maxindex(node); + index += offset << node->shift; /* Overflow after ~0UL */ if (!index) return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; + child = rcu_dereference_raw(node->slots[offset]); } - /* This is leaf-node */ - if (!shift) - break; - - slot = rcu_dereference_raw(node->slots[offset]); - if (slot == NULL) + if ((child == NULL) || (child == RADIX_TREE_RETRY)) goto restart; - if (!radix_tree_is_indirect_ptr(slot)) - break; - node = indirect_to_ptr(slot); - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - } + } while (radix_tree_is_internal_node(child)); /* Update the iterator state */ - iter->index = index; - iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1; + iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); + iter->next_index = (index | node_maxindex(node)) + 1; + __set_iter_shift(iter, node->shift); /* Construct iter->tags bit-mask from node->tags[tag] array */ if (flags & RADIX_TREE_ITER_TAGGED) { @@ -967,7 +1035,7 @@ EXPORT_SYMBOL(radix_tree_next_chunk); * set is outside the range we are scanning. This reults in dangling tags and * can lead to problems with later tag operations (e.g. livelocks on lookups). * - * The function returns number of leaves where the tag was set and sets + * The function returns the number of leaves where the tag was set and sets * *first_indexp to the first unscanned index. * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must * be prepared to handle that. @@ -977,14 +1045,13 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, unsigned long nr_to_tag, unsigned int iftag, unsigned int settag) { - unsigned int height = root->height; - struct radix_tree_node *node = NULL; - struct radix_tree_node *slot; - unsigned int shift; + struct radix_tree_node *parent, *node, *child; + unsigned long maxindex; unsigned long tagged = 0; unsigned long index = *first_indexp; - last_index = min(last_index, radix_tree_maxindex(height)); + radix_tree_load_root(root, &child, &maxindex); + last_index = min(last_index, maxindex); if (index > last_index) return 0; if (!nr_to_tag) @@ -993,80 +1060,62 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, *first_indexp = last_index + 1; return 0; } - if (height == 0) { + if (!radix_tree_is_internal_node(child)) { *first_indexp = last_index + 1; root_tag_set(root, settag); return 1; } - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = indirect_to_ptr(root->rnode); + node = entry_to_node(child); for (;;) { - unsigned long upindex; - int offset; - - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - if (!slot->slots[offset]) + unsigned offset = radix_tree_descend(node, &child, index); + if (!child) goto next; - if (!tag_get(slot, iftag, offset)) + if (!tag_get(node, iftag, offset)) goto next; - if (shift) { - node = slot; - slot = slot->slots[offset]; - if (radix_tree_is_indirect_ptr(slot)) { - slot = indirect_to_ptr(slot); - shift -= RADIX_TREE_MAP_SHIFT; - continue; - } else { - slot = node; - node = node->parent; - } + /* Sibling slots never have tags set on them */ + if (radix_tree_is_internal_node(child)) { + node = entry_to_node(child); + continue; } /* tag the leaf */ - tagged += 1 << shift; - tag_set(slot, settag, offset); + tagged++; + tag_set(node, settag, offset); /* walk back up the path tagging interior nodes */ - upindex = index; - while (node) { - upindex >>= RADIX_TREE_MAP_SHIFT; - offset = upindex & RADIX_TREE_MAP_MASK; - + parent = node; + for (;;) { + offset = parent->offset; + parent = parent->parent; + if (!parent) + break; /* stop if we find a node with the tag already set */ - if (tag_get(node, settag, offset)) + if (tag_get(parent, settag, offset)) break; - tag_set(node, settag, offset); - node = node->parent; + tag_set(parent, settag, offset); } - - /* - * Small optimization: now clear that node pointer. - * Since all of this slot's ancestors now have the tag set - * from setting it above, we have no further need to walk - * back up the tree setting tags, until we update slot to - * point to another radix_tree_node. - */ - node = NULL; - -next: - /* Go to next item at level determined by 'shift' */ - index = ((index >> shift) + 1) << shift; + next: + /* Go to next entry in node */ + index = ((index >> node->shift) + 1) << node->shift; /* Overflow can happen when last_index is ~0UL... */ if (index > last_index || !index) break; - if (tagged >= nr_to_tag) - break; - while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) { + offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; + while (offset == 0) { /* * We've fully scanned this node. Go up. Because * last_index is guaranteed to be in the tree, what * we do below cannot wander astray. */ - slot = slot->parent; - shift += RADIX_TREE_MAP_SHIFT; + node = node->parent; + offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; } + if (is_sibling_entry(node, node->slots[offset])) + goto next; + if (tagged >= nr_to_tag) + break; } /* * We need not to tag the root tag if there is no tag which is set with @@ -1095,9 +1144,10 @@ EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being - * an atomic snapshot of the tree at a single point in time, the semantics - * of an RCU protected gang lookup are as though multiple radix_tree_lookups - * have been issued in individual locks, and results stored in 'results'. + * an atomic snapshot of the tree at a single point in time, the + * semantics of an RCU protected gang lookup are as though multiple + * radix_tree_lookups have been issued in individual locks, and results + * stored in 'results'. */ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, @@ -1114,7 +1164,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1197,7 +1247,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1247,58 +1297,48 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); #if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) #include <linux/sched.h> /* for cond_resched() */ +struct locate_info { + unsigned long found_index; + bool stop; +}; + /* * This linear search is at present only useful to shmem_unuse_inode(). */ static unsigned long __locate(struct radix_tree_node *slot, void *item, - unsigned long index, unsigned long *found_index) + unsigned long index, struct locate_info *info) { - unsigned int shift, height; unsigned long i; - height = slot->path & RADIX_TREE_HEIGHT_MASK; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - for ( ; height > 1; height--) { - i = (index >> shift) & RADIX_TREE_MAP_MASK; - for (;;) { - if (slot->slots[i] != NULL) - break; - index &= ~((1UL << shift) - 1); - index += 1UL << shift; - if (index == 0) - goto out; /* 32-bit wraparound */ - i++; - if (i == RADIX_TREE_MAP_SIZE) + do { + unsigned int shift = slot->shift; + + for (i = (index >> shift) & RADIX_TREE_MAP_MASK; + i < RADIX_TREE_MAP_SIZE; + i++, index += (1UL << shift)) { + struct radix_tree_node *node = + rcu_dereference_raw(slot->slots[i]); + if (node == RADIX_TREE_RETRY) goto out; - } - - slot = rcu_dereference_raw(slot->slots[i]); - if (slot == NULL) - goto out; - if (!radix_tree_is_indirect_ptr(slot)) { - if (slot == item) { - *found_index = index + i; - index = 0; - } else { - index += shift; + if (!radix_tree_is_internal_node(node)) { + if (node == item) { + info->found_index = index; + info->stop = true; + goto out; + } + continue; } - goto out; + node = entry_to_node(node); + if (is_sibling_entry(slot, node)) + continue; + slot = node; + break; } - slot = indirect_to_ptr(slot); - shift -= RADIX_TREE_MAP_SHIFT; - } + } while (i < RADIX_TREE_MAP_SIZE); - /* Bottom level: check items */ - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (slot->slots[i] == item) { - *found_index = index + i; - index = 0; - goto out; - } - } - index += RADIX_TREE_MAP_SIZE; out: + if ((index == 0) && (i == RADIX_TREE_MAP_SIZE)) + info->stop = true; return index; } @@ -1316,32 +1356,35 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) struct radix_tree_node *node; unsigned long max_index; unsigned long cur_index = 0; - unsigned long found_index = -1; + struct locate_info info = { + .found_index = -1, + .stop = false, + }; do { rcu_read_lock(); node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { rcu_read_unlock(); if (node == item) - found_index = 0; + info.found_index = 0; break; } - node = indirect_to_ptr(node); - max_index = radix_tree_maxindex(node->path & - RADIX_TREE_HEIGHT_MASK); + node = entry_to_node(node); + + max_index = node_maxindex(node); if (cur_index > max_index) { rcu_read_unlock(); break; } - cur_index = __locate(node, item, cur_index, &found_index); + cur_index = __locate(node, item, cur_index, &info); rcu_read_unlock(); cond_resched(); - } while (cur_index != 0 && cur_index <= max_index); + } while (!info.stop && cur_index <= max_index); - return found_index; + return info.found_index; } #else unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) @@ -1351,47 +1394,45 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) #endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** - * radix_tree_shrink - shrink height of a radix tree to minimal + * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline void radix_tree_shrink(struct radix_tree_root *root) +static inline bool radix_tree_shrink(struct radix_tree_root *root) { - /* try to shrink tree height */ - while (root->height > 0) { - struct radix_tree_node *to_free = root->rnode; - struct radix_tree_node *slot; + bool shrunk = false; + + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; - BUG_ON(!radix_tree_is_indirect_ptr(to_free)); - to_free = indirect_to_ptr(to_free); + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); /* * The candidate node has more than one child, or its child - * is not at the leftmost slot, or it is a multiorder entry, - * we cannot shrink. + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. */ - if (to_free->count != 1) + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) break; - slot = to_free->slots[0]; - if (!slot) + if (!radix_tree_is_internal_node(child) && node->shift) break; + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it - * (to_free->slots[0]), it will be safe to dereference the new + * (node->slots[0]), it will be safe to dereference the new * one (root->rnode) as far as dependent read barriers go. */ - if (root->height > 1) { - if (!radix_tree_is_indirect_ptr(slot)) - break; - - slot = indirect_to_ptr(slot); - slot->parent = NULL; - slot = ptr_to_indirect(slot); - } - root->rnode = slot; - root->height--; + root->rnode = child; /* * We have a dilemma here. The node's slot[0] must not be @@ -1403,7 +1444,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * their slot to become empty sooner or later. * * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page is 0 refcount it means it + * the page pointer, and if the page has 0 refcount it means it * was concurrently deleted from pagecache so try the deref * again. Fortunately there is already a requirement for logic * to retry the entire slot lookup -- the indirect pointer @@ -1411,12 +1452,14 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (root->height == 0) - *((unsigned long *)&to_free->slots[0]) |= - RADIX_TREE_INDIRECT_PTR; + if (!radix_tree_is_internal_node(child)) + node->slots[0] = RADIX_TREE_RETRY; - radix_tree_node_free(to_free); + radix_tree_node_free(node); + shrunk = true; } + + return shrunk; } /** @@ -1439,24 +1482,17 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *parent; if (node->count) { - if (node == indirect_to_ptr(root->rnode)) { - radix_tree_shrink(root); - if (root->height == 0) - deleted = true; - } + if (node == entry_to_node(root->rnode)) + deleted |= radix_tree_shrink(root); return deleted; } parent = node->parent; if (parent) { - unsigned int offset; - - offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; - parent->slots[offset] = NULL; + parent->slots[node->offset] = NULL; parent->count--; } else { root_tag_clear_all(root); - root->height = 0; root->rnode = NULL; } @@ -1469,6 +1505,20 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, return deleted; } +static inline void delete_sibling_entries(struct radix_tree_node *node, + void *ptr, unsigned offset) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + int i; + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr) + break; + node->slots[offset + i] = NULL; + node->count--; + } +#endif +} + /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root @@ -1484,7 +1534,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; - unsigned int offset, i; + unsigned int offset; void **slot; void *entry; int tag; @@ -1502,24 +1552,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root, return entry; } - offset = index & RADIX_TREE_MAP_MASK; + offset = get_slot_offset(node, slot); - /* - * Clear all tags associated with the item to be deleted. - * This way of doing it would be inefficient, but seldom is any set. - */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(node, tag, offset)) - radix_tree_tag_clear(root, index, tag); - } + /* Clear all tags associated with the item to be deleted. */ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); - /* Delete any sibling slots pointing to this slot */ - for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[offset + i] != ptr_to_indirect(slot)) - break; - node->slots[offset + i] = NULL; - node->count--; - } + delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; node->count--; @@ -1544,6 +1583,20 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); +void radix_tree_clear_tags(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot) +{ + if (node) { + unsigned int tag, offset = get_slot_offset(node, slot); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); + } else { + /* Clear root node tags */ + root->gfp_mask &= __GFP_BITS_MASK; + } +} + /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root @@ -1576,33 +1629,37 @@ static __init unsigned long __maxindex(unsigned int height) return ~0UL >> shift; } -static __init void radix_tree_init_maxindex(void) +static __init void radix_tree_init_maxnodes(void) { - unsigned int i; + unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; + unsigned int i, j; for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) height_to_maxindex[i] = __maxindex(i); + for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) { + for (j = i; j > 0; j--) + height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1; + } } static int radix_tree_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, void *hcpu) { - int cpu = (long)hcpu; - struct radix_tree_preload *rtp; - struct radix_tree_node *node; - - /* Free per-cpu pool of perloaded nodes */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - rtp = &per_cpu(radix_tree_preloads, cpu); - while (rtp->nr) { + int cpu = (long)hcpu; + struct radix_tree_preload *rtp; + struct radix_tree_node *node; + + /* Free per-cpu pool of preloaded nodes */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + rtp = &per_cpu(radix_tree_preloads, cpu); + while (rtp->nr) { node = rtp->nodes; rtp->nodes = node->private_data; kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; - } - } - return NOTIFY_OK; + } + } + return NOTIFY_OK; } void __init radix_tree_init(void) @@ -1611,6 +1668,6 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); - radix_tree_init_maxindex(); + radix_tree_init_maxnodes(); hotcpu_notifier(radix_tree_callback, 0); } diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 0a7e494b2bcd..f01b1cb04f91 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore @@ -3,3 +3,4 @@ altivec*.c int*.c tables.c neon?.c +s390vx?.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 3b10a48fa040..3057011f5599 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -3,10 +3,11 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ int8.o int16.o int32.o -raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o raid6_pq-$(CONFIG_TILEGX) += tilegx8.o +raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o hostprogs-y += mktables @@ -116,6 +117,11 @@ $(obj)/tilegx8.c: UNROLL := 8 $(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) +targets += s390vx8.c +$(obj)/s390vx8.c: UNROLL := 8 +$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + quiet_cmd_mktable = TABLE $@ cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 975c6e0434bd..7857049fd7d3 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -49,6 +49,10 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_avx2x1, &raid6_avx2x2, #endif +#ifdef CONFIG_AS_AVX512 + &raid6_avx512x1, + &raid6_avx512x2, +#endif #endif #if defined(__x86_64__) && !defined(__arch_um__) &raid6_sse2x1, @@ -59,6 +63,11 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_avx2x2, &raid6_avx2x4, #endif +#ifdef CONFIG_AS_AVX512 + &raid6_avx512x1, + &raid6_avx512x2, + &raid6_avx512x4, +#endif #endif #ifdef CONFIG_ALTIVEC &raid6_altivec1, @@ -69,6 +78,9 @@ const struct raid6_calls * const raid6_algos[] = { #if defined(CONFIG_TILEGX) &raid6_tilegx8, #endif +#if defined(CONFIG_S390) + &raid6_s390vx8, +#endif &raid6_intx1, &raid6_intx2, &raid6_intx4, @@ -89,12 +101,18 @@ void (*raid6_datap_recov)(int, size_t, int, void **); EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { +#ifdef CONFIG_AS_AVX512 + &raid6_recov_avx512, +#endif #ifdef CONFIG_AS_AVX2 &raid6_recov_avx2, #endif #ifdef CONFIG_AS_SSSE3 &raid6_recov_ssse3, #endif +#ifdef CONFIG_S390 + &raid6_recov_s390xc, +#endif &raid6_recov_intx1, NULL }; diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c new file mode 100644 index 000000000000..f524a7972006 --- /dev/null +++ b/lib/raid6/avx512.c @@ -0,0 +1,569 @@ +/* -*- linux-c -*- -------------------------------------------------------- + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- + */ + +/* + * AVX512 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX512 + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_avx512_constants { + u64 x1d[8]; +} raid6_avx512_constants __aligned(512) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "prefetchnta %1\n\t" + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %1,%%zmm6" + : + : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm6" + : + : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm4,%1\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4" + : + : "m" (p[d]), "m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm2\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2" + : + : "m" (dptr[z0][d]), "m" (p[d])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : "m" (dptr[z][d])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + /* Don't use movntdq for r/w memory area < cache line */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm2,%1" + : + : "m" (q[d]), "m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x1 = { + raid6_avx5121_gen_syndrome, + raid6_avx5121_xor_syndrome, + raid6_have_avx512, + "avx512x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX512 implementation + */ +static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + /* We uniformly assume a single prefetch covers at least 64 bytes */ + for (d = 0; d < bytes; d += 128) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm4,%2\n\t" + "vmovntdq %%zmm6,%3" + : + : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), + "m" (q[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm2\n\t" + "vmovdqa64 %3,%%zmm3\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (p[d]), "m" (p[d+64])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + "vpxorq %1,%%zmm6,%%zmm6\n\t" + /* Don't use movntdq for r/w + * memory area < cache line + */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm6,%1\n\t" + "vmovdqa64 %%zmm2,%2\n\t" + "vmovdqa64 %%zmm3,%3" + : + : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), + "m" (p[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x2 = { + raid6_avx5122_gen_syndrome, + raid6_avx5122_xor_syndrome, + raid6_have_avx512, + "avx512x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ + "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 256) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "prefetchnta %2\n\t" + "prefetchnta %3\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" + "vmovntdq %%zmm14,%7\n\t" + "vpxorq %%zmm14,%%zmm14,%%zmm14" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + :: "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 256) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm12\n\t" + "vmovdqa64 %3,%%zmm14\n\t" + "vmovdqa64 %4,%%zmm2\n\t" + "vmovdqa64 %5,%%zmm3\n\t" + "vmovdqa64 %6,%%zmm10\n\t" + "vmovdqa64 %7,%%zmm11\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm14,%%zmm11,%%zmm11" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), + "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "prefetchnta %0\n\t" + "prefetchnta %2\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), + "m" (dptr[z][d+192])); + } + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + : + : "m" (q[d]), "m" (q[d+128])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : ); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %4,%%zmm4,%%zmm4\n\t" + "vpxorq %5,%%zmm6,%%zmm6\n\t" + "vpxorq %6,%%zmm12,%%zmm12\n\t" + "vpxorq %7,%%zmm14,%%zmm14\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vmovntdq %%zmm14,%7" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} +const struct raid6_calls raid6_avx512x4 = { + raid6_avx5124_gen_syndrome, + raid6_avx5124_xor_syndrome, + raid6_have_avx512, + "avx512x4", + 1 /* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX512 */ diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c new file mode 100644 index 000000000000..625aafa33b61 --- /dev/null +++ b/lib/raid6/recov_avx512.c @@ -0,0 +1,388 @@ +/* + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ + +#ifdef CONFIG_AS_AVX512 + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* zmm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm9\n\t" + "vmovdqa64 %2, %%zmm0\n\t" + "vmovdqa64 %3, %%zmm8\n\t" + "vpxorq %4, %%zmm1, %%zmm1\n\t" + "vpxorq %5, %%zmm9, %%zmm9\n\t" + "vpxorq %6, %%zmm0, %%zmm0\n\t" + "vpxorq %7, %%zmm8, %%zmm8" + : + : "m" (q[0]), "m" (q[64]), "m" (p[0]), + "m" (p[64]), "m" (dq[0]), "m" (dq[64]), + "m" (dp[0]), "m" (dp[64])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[64] ^ q[64] + * 0 = dp[0] ^ p[0] + * 8 = dp[64] ^ p[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpsraw $4, %%zmm9, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm9, %%zmm9\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* + * 5 = qx[0] + * 15 = qx[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpsraw $4, %%zmm8, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm14\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm12, %%zmm13, %%zmm13" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[64]] + */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm15, %%zmm13, %%zmm13" + : + : ); + + /* + * 1 = db = DQ + * 13 = db[64] = DQ[64] + */ + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm13,%1\n\t" + "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vpxorq %%zmm13, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64])); + + asm volatile("vmovdqa64 %%zmm0, %0\n\t" + "vmovdqa64 %%zmm8, %1" + : + : "m" (dp[0]), "m" (dp[64])); + + bytes -= 128; + p += 128; + q += 128; + dp += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm0\n\t" + "vpxorq %2, %%zmm1, %%zmm1\n\t" + "vpxorq %3, %%zmm0, %%zmm0" + : + : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* 5 = qx */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1" + : + : ); + + /* 1 = pbmul[px] */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + /* 1 = db = DQ */ + "vmovdqa64 %%zmm1, %0\n\t" + : + : "m" (dq[0])); + + asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqa64 %%zmm0, %0" + : + : "m" (dp[0])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vmovdqa64 %1, %%zmm8\n\t" + "vpxorq %2, %%zmm3, %%zmm3\n\t" + "vpxorq %3, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64]), "m" (q[0]), + "m" (q[64])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[64] ^ dq[64] + */ + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vmovapd %%zmm0, %%zmm13\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vmovapd %%zmm1, %%zmm14" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpsraw $4, %%zmm8, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm8\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm13, %%zmm14, %%zmm14" + : + : ); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[64] ^ dq[64]] + */ + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vmovdqa64 %1, %%zmm12\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t" + "vpxorq %%zmm14, %%zmm12, %%zmm12" + : + : "m" (p[0]), "m" (p[64])); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[64] ^ qmul[q[64] ^ dq[64]] + */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm14, %1\n\t" + "vmovdqa64 %%zmm2, %2\n\t" + "vmovdqa64 %%zmm12,%3" + : + : "m" (dq[0]), "m" (dq[64]), "m" (p[0]), + "m" (p[64])); + + bytes -= 128; + p += 128; + q += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vpxorq %1, %%zmm3, %%zmm3" + : + : "m" (dq[0]), "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1" + : + : ); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2" + : + : "m" (p[0])); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm2, %1" + : + : "m" (dq[0]), "m" (p[0])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx512 = { + .data2 = raid6_2data_recov_avx512, + .datap = raid6_datap_recov_avx512, + .valid = raid6_has_avx512, +#ifdef CONFIG_X86_64 + .name = "avx512x2", +#else + .name = "avx512x1", +#endif + .priority = 3, +}; + +#else +#warning "your version of binutils lacks AVX512 support" +#endif diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c new file mode 100644 index 000000000000..b042dac826cc --- /dev/null +++ b/lib/raid6/recov_s390xc.c @@ -0,0 +1,116 @@ +/* + * RAID-6 data recovery in dual failure mode based on the XC instruction. + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/export.h> +#include <linux/raid/pq.h> + +static inline void xor_block(u8 *p1, u8 *p2) +{ + typedef struct { u8 _[256]; } addrtype; + + asm volatile( + " xc 0(256,%[p1]),0(%[p2])\n" + : "+m" (*(addrtype *) p1) : "m" (*(addrtype *) p2), + [p1] "a" (p1), [p2] "a" (p2) : "cc"); +} + +/* Recover two failed data blocks. */ +static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dp, p); + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = pbmul[dp[i]] ^ qmul[dq[i]]; + xor_block(dp, dq); + p += 256; + q += 256; + dp += 256; + dq += 256; + bytes -= 256; + } +} + +/* Recover failure of one data block plus the P block */ +static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = qmul[dq[i]]; + xor_block(p, dq); + p += 256; + q += 256; + dq += 256; + bytes -= 256; + } +} + + +const struct raid6_recov_calls raid6_recov_s390xc = { + .data2 = raid6_2data_recov_s390xc, + .datap = raid6_datap_recov_s390xc, + .valid = NULL, + .name = "s390xc", + .priority = 1, +}; diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc new file mode 100644 index 000000000000..7b45191a655f --- /dev/null +++ b/lib/raid6/s390vx.uc @@ -0,0 +1,168 @@ +/* + * raid6_vx$#.c + * + * $#-way unrolled RAID6 gen/xor functions for s390 + * based on the vector facility + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * + * This file is postprocessed using unroll.awk. + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +asm(".include \"asm/vx-insn.h\"\n"); + +#define NSIZE 16 + +static inline void LOAD_CONST(void) +{ + asm volatile("VREPIB %v24,7"); + asm volatile("VREPIB %v25,0x1d"); +} + +/* + * The SHLBYTE() operation shifts each of the 16 bytes in + * vector register y left by 1 bit and stores the result in + * vector register x. + */ +static inline void SHLBYTE(int x, int y) +{ + asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y)); +} + +/* + * For each of the 16 bytes in the vector register y the MASK() + * operation returns 0xFF if the high bit of the byte is 1, + * or 0x00 if the high bit is 0. The result is stored in vector + * register x. + */ +static inline void MASK(int x, int y) +{ + asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y)); +} + +static inline void AND(int x, int y, int z) +{ + asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); +} + +static inline void XOR(int x, int y, int z) +{ + asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); +} + +static inline void LOAD_DATA(int x, int n, u8 *ptr) +{ + typedef struct { u8 _[16*n]; } addrtype; + register addrtype *__ptr asm("1") = (addrtype *) ptr; + + asm volatile ("VLM %2,%3,0,%r1" + : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1)); +} + +static inline void STORE_DATA(int x, int n, u8 *ptr) +{ + typedef struct { u8 _[16*n]; } addrtype; + register addrtype *__ptr asm("1") = (addrtype *) ptr; + + asm volatile ("VSTM %2,%3,0,1" + : "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1)); +} + +static inline void COPY_VEC(int x, int y) +{ + asm volatile ("VLR %0,%1" : : "i" (x), "i" (y)); +} + +static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + struct kernel_fpu vxstate; + u8 **dptr, *p, *q; + int d, z, z0; + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + dptr = (u8 **) ptrs; + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + for (d = 0; d < bytes; d += $#*NSIZE) { + LOAD_DATA(0,$#,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,$#,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + STORE_DATA(0,$#,&p[d]); + STORE_DATA(8,$#,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + struct kernel_fpu vxstate; + u8 **dptr, *p, *q; + int d, z, z0; + + dptr = (u8 **) ptrs; + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + for (d = 0; d < bytes; d += $#*NSIZE) { + /* P/Q data pages */ + LOAD_DATA(0,$#,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= start; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,$#,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + } + LOAD_DATA(16,$#,&p[d]); + XOR(16+$$,16+$$,0+$$); + STORE_DATA(16,$#,&p[d]); + LOAD_DATA(16,$#,&q[d]); + XOR(16+$$,16+$$,8+$$); + STORE_DATA(16,$#,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static int raid6_s390vx$#_valid(void) +{ + return MACHINE_HAS_VX; +} + +const struct raid6_calls raid6_s390vx$# = { + raid6_s390vx$#_gen_syndrome, + raid6_s390vx$#_xor_syndrome, + raid6_s390vx$#_valid, + "vx128x$#", + 1 +}; diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 29090f3db677..2c7b60edea04 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -32,10 +32,13 @@ ifeq ($(ARCH),arm64) endif ifeq ($(IS_X86),yes) - OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ gcc -c -x assembler - >&/dev/null && \ rm ./-.o && echo -DCONFIG_AS_AVX2=1) + CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 3bebbabdb510..b07f4d8e6b03 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -21,12 +21,13 @@ #define NDISKS 16 /* Including P and Q */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); struct raid6_calls raid6_call; char *dataptrs[NDISKS]; -char data[NDISKS][PAGE_SIZE]; -char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; +char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); static void makedata(int start, int stop) { diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index 8fe9d9662abb..834d268a4b05 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h @@ -46,6 +46,16 @@ static inline void kernel_fpu_end(void) #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 DQ (Double/Quad granular) + * Instructions + */ +#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 BW (Byte/Word granular) + * Instructions + */ +#define X86_FEATURE_AVX512VL (9*32+31) /* AVX-512 VL (128/256 Vector Length) + * Extensions + */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ diff --git a/lib/random32.c b/lib/random32.c index 510d1ce7d4d2..915982b304bb 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -81,7 +81,7 @@ u32 prandom_u32(void) u32 res; res = prandom_u32_state(state); - put_cpu_var(state); + put_cpu_var(net_rand_state); return res; } @@ -128,7 +128,7 @@ void prandom_bytes(void *buf, size_t bytes) struct rnd_state *state = &get_cpu_var(net_rand_state); prandom_bytes_state(state, buf, bytes); - put_cpu_var(state); + put_cpu_var(net_rand_state); } EXPORT_SYMBOL(prandom_bytes); @@ -233,7 +233,6 @@ static void __prandom_timer(unsigned long dontcare) static void __init __prandom_start_seed_timer(void) { - set_timer_slack(&seed_timer, HZ); seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); add_timer(&seed_timer); } diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 2c5de86460c5..08f8043cac61 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -46,12 +46,14 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + rs->interval)) { - if (rs->missed) - printk(KERN_WARNING "%s: %d callbacks suppressed\n", - func, rs->missed); + if (rs->missed) { + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { + pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); + rs->missed = 0; + } + } rs->begin = jiffies; rs->printed = 0; - rs->missed = 0; } if (rs->burst && rs->burst > rs->printed) { rs->printed++; diff --git a/lib/rbtree.c b/lib/rbtree.c index 1356454e36de..eb8a19fee110 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -539,17 +539,39 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, { struct rb_node *parent = rb_parent(victim); + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; + /* Set the surrounding nodes to point to the replacement */ - __rb_change_child(victim, new, parent, root); if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); + __rb_change_child(victim, new, parent, root); +} +EXPORT_SYMBOL(rb_replace_node); + +void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); /* Copy the pointers/colour from the victim to the replacement */ *new = *victim; + + /* Set the surrounding nodes to point to the replacement */ + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + + /* Set the parent's pointer to the new node last after an RCU barrier + * so that the pointers onwards are seen to be set correctly when doing + * an RCU walk over the tree. + */ + __rb_change_child_rcu(victim, new, parent, root); } -EXPORT_SYMBOL(rb_replace_node); +EXPORT_SYMBOL(rb_replace_node_rcu); static struct rb_node *rb_left_deepest_node(const struct rb_node *node) { diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 5d845ffd7982..32d0ad058380 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -30,7 +30,7 @@ #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U -#define BUCKET_LOCKS_PER_CPU 128UL +#define BUCKET_LOCKS_PER_CPU 32UL static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, @@ -70,21 +70,25 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, unsigned int nr_pcpus = num_possible_cpus(); #endif - nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); + nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); /* Never allocate more than 0.5 locks per bucket */ size = min_t(unsigned int, size, tbl->size >> 1); if (sizeof(spinlock_t) != 0) { + tbl->locks = NULL; #ifdef CONFIG_NUMA if (size * sizeof(spinlock_t) > PAGE_SIZE && gfp == GFP_KERNEL) tbl->locks = vmalloc(size * sizeof(spinlock_t)); - else #endif - tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - gfp); + if (gfp != GFP_KERNEL) + gfp |= __GFP_NOWARN | __GFP_NORETRY; + + if (!tbl->locks) + tbl->locks = kmalloc_array(size, sizeof(spinlock_t), + gfp); if (!tbl->locks) return -ENOMEM; for (i = 0; i < size; i++) @@ -321,12 +325,14 @@ static int rhashtable_expand(struct rhashtable *ht) static int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); - unsigned int size; + unsigned int nelems = atomic_read(&ht->nelems); + unsigned int size = 0; int err; ASSERT_RHT_MUTEX(ht); - size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); + if (nelems) + size = roundup_pow_of_two(nelems * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; @@ -372,22 +378,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -433,61 +425,172 @@ fail: return err; } -EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + elasticity = ht->elasticity; + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; - err = -EEXIST; - if (key && rhashtable_lookup_fast(ht, key, ht->p)) - goto exit; + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + return NULL; + } - err = 0; + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash, + struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rcu_dereference(tbl->future_tbl); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return NULL; +} - if (err == 0) - return NULL; - else if (err == -EAGAIN) - return tbl; - else - return ERR_PTR(err); +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int hash; + spinlock_t *lock; + void *data; + + tbl = rcu_dereference(ht->tbl); + + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. + */ + for (;;) { + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** - * rhashtable_walk_init - Initialise an iterator + * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator - * @gfp: GFP flags for allocations * * This function prepares a hash table walk. * @@ -502,30 +605,22 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * This function may sleep so you must not call it from interrupt * context or with spin locks held. * - * You must call rhashtable_walk_exit if this function returns - * successfully. + * You must call rhashtable_walk_exit after this function returns. */ -int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, - gfp_t gfp) +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; - iter->walker = kmalloc(sizeof(*iter->walker), gfp); - if (!iter->walker) - return -ENOMEM; - spin_lock(&ht->lock); - iter->walker->tbl = + iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); - list_add(&iter->walker->list, &iter->walker->tbl->walkers); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); - - return 0; } -EXPORT_SYMBOL_GPL(rhashtable_walk_init); +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator @@ -536,10 +631,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); - kfree(iter->walker); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); @@ -565,12 +659,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter) rcu_read_lock(); spin_lock(&ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&ht->lock); - if (!iter->walker->tbl) { - iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -592,12 +686,17 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; if (p) { - p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } goto next; } @@ -605,6 +704,18 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter) int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } if (!skip) break; skip--; @@ -614,7 +725,8 @@ next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; - return rht_obj(ht, p); + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; @@ -625,8 +737,8 @@ next: /* Ensure we see any new tables. */ smp_rmb(); - iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (iter->walker->tbl) { + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); @@ -646,7 +758,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { struct rhashtable *ht; - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; @@ -655,9 +767,9 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) spin_lock(&ht->lock); if (tbl->rehash < tbl->size) - list_add(&iter->walker->list, &tbl->walkers); + list_add(&iter->walker.list, &tbl->walkers); else - iter->walker->tbl = NULL; + iter->walker.tbl = NULL; spin_unlock(&ht->lock); iter->p = NULL; @@ -803,6 +915,48 @@ int rhashtable_init(struct rhashtable *ht, EXPORT_SYMBOL_GPL(rhashtable_init); /** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +{ + int err; + + /* No rhlist NULLs marking for now. */ + if (params->nulls_base) + return -EINVAL; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element @@ -839,7 +993,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) - free_fn(rht_obj(ht, pos), arg); + rhashtable_free_one(ht, pos, free_fn, arg); } } diff --git a/lib/sg_pool.c b/lib/sg_pool.c new file mode 100644 index 000000000000..6dd30615a201 --- /dev/null +++ b/lib/sg_pool.c @@ -0,0 +1,172 @@ +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <linux/mempool.h> +#include <linux/slab.h> + +#define SG_MEMPOOL_NR ARRAY_SIZE(sg_pools) +#define SG_MEMPOOL_SIZE 2 + +struct sg_pool { + size_t size; + char *name; + struct kmem_cache *slab; + mempool_t *pool; +}; + +#define SP(x) { .size = x, "sgpool-" __stringify(x) } +#if (SG_CHUNK_SIZE < 32) +#error SG_CHUNK_SIZE is too small (must be 32 or greater) +#endif +static struct sg_pool sg_pools[] = { + SP(8), + SP(16), +#if (SG_CHUNK_SIZE > 32) + SP(32), +#if (SG_CHUNK_SIZE > 64) + SP(64), +#if (SG_CHUNK_SIZE > 128) + SP(128), +#if (SG_CHUNK_SIZE > 256) +#error SG_CHUNK_SIZE is too large (256 MAX) +#endif +#endif +#endif +#endif + SP(SG_CHUNK_SIZE) +}; +#undef SP + +static inline unsigned int sg_pool_index(unsigned short nents) +{ + unsigned int index; + + BUG_ON(nents > SG_CHUNK_SIZE); + + if (nents <= 8) + index = 0; + else + index = get_count_order(nents) - 3; + + return index; +} + +static void sg_pool_free(struct scatterlist *sgl, unsigned int nents) +{ + struct sg_pool *sgp; + + sgp = sg_pools + sg_pool_index(nents); + mempool_free(sgl, sgp->pool); +} + +static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask) +{ + struct sg_pool *sgp; + + sgp = sg_pools + sg_pool_index(nents); + return mempool_alloc(sgp->pool, gfp_mask); +} + +/** + * sg_free_table_chained - Free a previously mapped sg table + * @table: The sg table header to use + * @first_chunk: was first_chunk not NULL in sg_alloc_table_chained? + * + * Description: + * Free an sg table previously allocated and setup with + * sg_alloc_table_chained(). + * + **/ +void sg_free_table_chained(struct sg_table *table, bool first_chunk) +{ + if (first_chunk && table->orig_nents <= SG_CHUNK_SIZE) + return; + __sg_free_table(table, SG_CHUNK_SIZE, first_chunk, sg_pool_free); +} +EXPORT_SYMBOL_GPL(sg_free_table_chained); + +/** + * sg_alloc_table_chained - Allocate and chain SGLs in an sg table + * @table: The sg table header to use + * @nents: Number of entries in sg list + * @first_chunk: first SGL + * + * Description: + * Allocate and chain SGLs in an sg table. If @nents@ is larger than + * SG_CHUNK_SIZE a chained sg table will be setup. + * + **/ +int sg_alloc_table_chained(struct sg_table *table, int nents, + struct scatterlist *first_chunk) +{ + int ret; + + BUG_ON(!nents); + + if (first_chunk) { + if (nents <= SG_CHUNK_SIZE) { + table->nents = table->orig_nents = nents; + sg_init_table(table->sgl, nents); + return 0; + } + } + + ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE, + first_chunk, GFP_ATOMIC, sg_pool_alloc); + if (unlikely(ret)) + sg_free_table_chained(table, (bool)first_chunk); + return ret; +} +EXPORT_SYMBOL_GPL(sg_alloc_table_chained); + +static __init int sg_pool_init(void) +{ + int i; + + for (i = 0; i < SG_MEMPOOL_NR; i++) { + struct sg_pool *sgp = sg_pools + i; + int size = sgp->size * sizeof(struct scatterlist); + + sgp->slab = kmem_cache_create(sgp->name, size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!sgp->slab) { + printk(KERN_ERR "SG_POOL: can't init sg slab %s\n", + sgp->name); + goto cleanup_sdb; + } + + sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, + sgp->slab); + if (!sgp->pool) { + printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n", + sgp->name); + goto cleanup_sdb; + } + } + + return 0; + +cleanup_sdb: + for (i = 0; i < SG_MEMPOOL_NR; i++) { + struct sg_pool *sgp = sg_pools + i; + if (sgp->pool) + mempool_destroy(sgp->pool); + if (sgp->slab) + kmem_cache_destroy(sgp->slab); + } + + return -ENOMEM; +} + +static __exit void sg_pool_exit(void) +{ + int i; + + for (i = 0; i < SG_MEMPOOL_NR; i++) { + struct sg_pool *sgp = sg_pools + i; + mempool_destroy(sgp->pool); + kmem_cache_destroy(sgp->slab); + } +} + +module_init(sg_pool_init); +module_exit(sg_pool_exit); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 53ad6c0831ae..60f77f1d470a 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -242,6 +242,7 @@ depot_stack_handle_t depot_save_stack(struct stack_trace *trace, */ alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); + alloc_flags |= __GFP_NOWARN; page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); if (page) prealloc = page_address(page); diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 5c88204b6f1f..ecaac2c0526f 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -10,6 +10,10 @@ #include <linux/export.h> #include <linux/ctype.h> #include <linux/errno.h> +#include <linux/fs.h> +#include <linux/limits.h> +#include <linux/mm.h> +#include <linux/slab.h> #include <linux/string.h> #include <linux/string_helpers.h> @@ -534,3 +538,91 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, return p - dst; } EXPORT_SYMBOL(string_escape_mem); + +/* + * Return an allocated string that has been escaped of special characters + * and double quotes, making it safe to log in quotes. + */ +char *kstrdup_quotable(const char *src, gfp_t gfp) +{ + size_t slen, dlen; + char *dst; + const int flags = ESCAPE_HEX; + const char esc[] = "\f\n\r\t\v\a\e\\\""; + + if (!src) + return NULL; + slen = strlen(src); + + dlen = string_escape_mem(src, slen, NULL, 0, flags, esc); + dst = kmalloc(dlen + 1, gfp); + if (!dst) + return NULL; + + WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen); + dst[dlen] = '\0'; + + return dst; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable); + +/* + * Returns allocated NULL-terminated string containing process + * command line, with inter-argument NULLs replaced with spaces, + * and other special characters escaped. + */ +char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp) +{ + char *buffer, *quoted; + int i, res; + + buffer = kmalloc(PAGE_SIZE, GFP_TEMPORARY); + if (!buffer) + return NULL; + + res = get_cmdline(task, buffer, PAGE_SIZE - 1); + buffer[res] = '\0'; + + /* Collapse trailing NULLs, leave res pointing to last non-NULL. */ + while (--res >= 0 && buffer[res] == '\0') + ; + + /* Replace inter-argument NULLs. */ + for (i = 0; i <= res; i++) + if (buffer[i] == '\0') + buffer[i] = ' '; + + /* Make sure result is printable. */ + quoted = kstrdup_quotable(buffer, gfp); + kfree(buffer); + return quoted; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline); + +/* + * Returns allocated NULL-terminated string containing pathname, + * with special characters escaped, able to be safely logged. If + * there is an error, the leading character will be "<". + */ +char *kstrdup_quotable_file(struct file *file, gfp_t gfp) +{ + char *temp, *pathname; + + if (!file) + return kstrdup("<unknown>", gfp); + + /* We add 11 spaces for ' (deleted)' to be appended */ + temp = kmalloc(PATH_MAX + 11, GFP_TEMPORARY); + if (!temp) + return kstrdup("<no_memory>", gfp); + + pathname = file_path(file, temp, PATH_MAX + 11); + if (IS_ERR(pathname)) + pathname = kstrdup("<too_long>", gfp); + else + pathname = kstrdup_quotable(pathname, gfp); + + kfree(temp); + return pathname; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable_file); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33840324138c..9c5fe8110413 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,5 +1,6 @@ #include <linux/compiler.h> #include <linux/export.h> +#include <linux/kasan-checks.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -39,8 +40,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -55,8 +56,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -75,6 +75,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } @@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max = max_addr - src_addr; long retval; + kasan_check_write(dst, count); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 2625943625d7..8e105ed4df12 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 76f29ecba8f4..22e13a0e19d7 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -738,7 +738,7 @@ swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); @@ -807,7 +807,7 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unmap_single(hwdev, dev_addr, size, dir); } @@ -877,7 +877,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_device); */ int swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; @@ -914,7 +914,7 @@ int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir) { - return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); + return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, 0); } EXPORT_SYMBOL(swiotlb_map_sg); @@ -924,7 +924,8 @@ EXPORT_SYMBOL(swiotlb_map_sg); */ void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) + int nelems, enum dma_data_direction dir, + unsigned long attrs) { struct scatterlist *sg; int i; @@ -941,7 +942,7 @@ void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir) { - return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); + return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, 0); } EXPORT_SYMBOL(swiotlb_unmap_sg); diff --git a/lib/syscall.c b/lib/syscall.c index e30e03932480..63239e097b13 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -7,9 +7,19 @@ static int collect_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc) { - struct pt_regs *regs = task_pt_regs(target); - if (unlikely(!regs)) + struct pt_regs *regs; + + if (!try_get_task_stack(target)) { + /* Task has no stack, so the task isn't in a syscall. */ + *callno = -1; + return 0; + } + + regs = task_pt_regs(target); + if (unlikely(!regs)) { + put_task_stack(target); return -EAGAIN; + } *sp = user_stack_pointer(regs); *pc = instruction_pointer(regs); @@ -18,6 +28,7 @@ static int collect_syscall(struct task_struct *target, long *callno, if (*callno != -1L && maxargs > 0) syscall_get_arguments(target, regs, 0, maxargs, args); + put_task_stack(target); return 0; } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 93f45011a59d..94346b4d8984 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -5485,6 +5485,7 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->hash = SKB_HASH; skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; + skb->vlan_proto = htons(ETH_P_IP); skb->dev = &dev; skb->dev->ifindex = SKB_DEV_IFINDEX; skb->dev->type = SKB_DEV_TYPE; diff --git a/lib/test_hash.c b/lib/test_hash.c new file mode 100644 index 000000000000..cac20c5fb304 --- /dev/null +++ b/lib/test_hash.c @@ -0,0 +1,256 @@ +/* + * Test cases for <linux/hash.h> and <linux/stringhash.h> + * This just verifies that various ways of computing a hash + * produce the same thing and, for cases where a k-bit hash + * value is requested, is of the requested size. + * + * We fill a buffer with a 255-byte null-terminated string, + * and use both full_name_hash() and hashlen_string() to hash the + * substrings from i to j, where 0 <= i < j < 256. + * + * The returned values are used to check that __hash_32() and + * __hash_32_generic() compute the same thing. Likewise hash_32() + * and hash_64(). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n" + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/hash.h> +#include <linux/stringhash.h> +#include <linux/printk.h> + +/* 32-bit XORSHIFT generator. Seed must not be zero. */ +static u32 __init __attribute_const__ +xorshift(u32 seed) +{ + seed ^= seed << 13; + seed ^= seed >> 17; + seed ^= seed << 5; + return seed; +} + +/* Given a non-zero x, returns a non-zero byte. */ +static u8 __init __attribute_const__ +mod255(u32 x) +{ + x = (x & 0xffff) + (x >> 16); /* 1 <= x <= 0x1fffe */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x2fd */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x100 */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0xff */ + return x; +} + +/* Fill the buffer with non-zero bytes. */ +static void __init +fill_buf(char *buf, size_t len, u32 seed) +{ + size_t i; + + for (i = 0; i < len; i++) { + seed = xorshift(seed); + buf[i] = mod255(seed); + } +} + +/* + * Test the various integer hash functions. h64 (or its low-order bits) + * is the integer to hash. hash_or accumulates the OR of the hash values, + * which are later checked to see that they cover all the requested bits. + * + * Because these functions (as opposed to the string hashes) are all + * inline, the code being tested is actually in the module, and you can + * recompile and re-test the module without rebooting. + */ +static bool __init +test_int_hash(unsigned long long h64, u32 hash_or[2][33]) +{ + int k; + u32 h0 = (u32)h64, h1, h2; + + /* Test __hash32 */ + hash_or[0][0] |= h1 = __hash_32(h0); +#ifdef HAVE_ARCH__HASH_32 + hash_or[1][0] |= h2 = __hash_32_generic(h0); +#if HAVE_ARCH__HASH_32 == 1 + if (h1 != h2) { + pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x", + h0, h1, h2); + return false; + } +#endif +#endif + + /* Test k = 1..32 bits */ + for (k = 1; k <= 32; k++) { + u32 const m = ((u32)2 << (k-1)) - 1; /* Low k bits set */ + + /* Test hash_32 */ + hash_or[0][k] |= h1 = hash_32(h0, k); + if (h1 > m) { + pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m); + return false; + } +#ifdef HAVE_ARCH_HASH_32 + h2 = hash_32_generic(h0, k); +#if HAVE_ARCH_HASH_32 == 1 + if (h1 != h2) { + pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() " + " = %#x", h0, k, h1, h2); + return false; + } +#else + if (h2 > m) { + pr_err("hash_32_generic(%#x, %d) = %#x > %#x", + h0, k, h1, m); + return false; + } +#endif +#endif + /* Test hash_64 */ + hash_or[1][k] |= h1 = hash_64(h64, k); + if (h1 > m) { + pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m); + return false; + } +#ifdef HAVE_ARCH_HASH_64 + h2 = hash_64_generic(h64, k); +#if HAVE_ARCH_HASH_64 == 1 + if (h1 != h2) { + pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() " + "= %#x", h64, k, h1, h2); + return false; + } +#else + if (h2 > m) { + pr_err("hash_64_generic(%#llx, %d) = %#x > %#x", + h64, k, h1, m); + return false; + } +#endif +#endif + } + + (void)h2; /* Suppress unused variable warning */ + return true; +} + +#define SIZE 256 /* Run time is cubic in SIZE */ + +static int __init +test_hash_init(void) +{ + char buf[SIZE+1]; + u32 string_or = 0, hash_or[2][33] = { { 0, } }; + unsigned tests = 0; + unsigned long long h64 = 0; + int i, j; + + fill_buf(buf, SIZE, 1); + + /* Test every possible non-empty substring in the buffer. */ + for (j = SIZE; j > 0; --j) { + buf[j] = '\0'; + + for (i = 0; i <= j; i++) { + u64 hashlen = hashlen_string(buf+i, buf+i); + u32 h0 = full_name_hash(buf+i, buf+i, j-i); + + /* Check that hashlen_string gets the length right */ + if (hashlen_len(hashlen) != j-i) { + pr_err("hashlen_string(%d..%d) returned length" + " %u, expected %d", + i, j, hashlen_len(hashlen), j-i); + return -EINVAL; + } + /* Check that the hashes match */ + if (hashlen_hash(hashlen) != h0) { + pr_err("hashlen_string(%d..%d) = %08x != " + "full_name_hash() = %08x", + i, j, hashlen_hash(hashlen), h0); + return -EINVAL; + } + + string_or |= h0; + h64 = h64 << 32 | h0; /* For use with hash_64 */ + if (!test_int_hash(h64, hash_or)) + return -EINVAL; + tests++; + } /* i */ + } /* j */ + + /* The OR of all the hash values should cover all the bits */ + if (~string_or) { + pr_err("OR of all string hash results = %#x != %#x", + string_or, -1u); + return -EINVAL; + } + if (~hash_or[0][0]) { + pr_err("OR of all __hash_32 results = %#x != %#x", + hash_or[0][0], -1u); + return -EINVAL; + } +#ifdef HAVE_ARCH__HASH_32 +#if HAVE_ARCH__HASH_32 != 1 /* Test is pointless if results match */ + if (~hash_or[1][0]) { + pr_err("OR of all __hash_32_generic results = %#x != %#x", + hash_or[1][0], -1u); + return -EINVAL; + } +#endif +#endif + + /* Likewise for all the i-bit hash values */ + for (i = 1; i <= 32; i++) { + u32 const m = ((u32)2 << (i-1)) - 1; /* Low i bits set */ + + if (hash_or[0][i] != m) { + pr_err("OR of all hash_32(%d) results = %#x " + "(%#x expected)", i, hash_or[0][i], m); + return -EINVAL; + } + if (hash_or[1][i] != m) { + pr_err("OR of all hash_64(%d) results = %#x " + "(%#x expected)", i, hash_or[1][i], m); + return -EINVAL; + } + } + + /* Issue notices about skipped tests. */ +#ifdef HAVE_ARCH__HASH_32 +#if HAVE_ARCH__HASH_32 != 1 + pr_info("__hash_32() is arch-specific; not compared to generic."); +#endif +#else + pr_info("__hash_32() has no arch implementation to test."); +#endif +#ifdef HAVE_ARCH_HASH_32 +#if HAVE_ARCH_HASH_32 != 1 + pr_info("hash_32() is arch-specific; not compared to generic."); +#endif +#else + pr_info("hash_32() has no arch implementation to test."); +#endif +#ifdef HAVE_ARCH_HASH_64 +#if HAVE_ARCH_HASH_64 != 1 + pr_info("hash_64() is arch-specific; not compared to generic."); +#endif +#else + pr_info("hash_64() has no arch implementation to test."); +#endif + + pr_notice("%u tests passed.", tests); + + return 0; +} + +static void __exit test_hash_exit(void) +{ +} + +module_init(test_hash_init); /* Does everything */ +module_exit(test_hash_exit); /* Does nothing */ + +MODULE_LICENSE("GPL"); diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 82169fbf2453..5e51872b3fc1 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -12,9 +12,12 @@ #define pr_fmt(fmt) "kasan test: %s " fmt, __func__ #include <linux/kernel.h> +#include <linux/mman.h> +#include <linux/mm.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/uaccess.h> #include <linux/module.h> static noinline void __init kmalloc_oob_right(void) @@ -344,6 +347,70 @@ static noinline void __init kasan_stack_oob(void) *(volatile char *)p; } +static noinline void __init ksize_unpoisons_memory(void) +{ + char *ptr; + size_t size = 123, real_size = size; + + pr_info("ksize() unpoisons the whole allocated chunk\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + real_size = ksize(ptr); + /* This access doesn't trigger an error. */ + ptr[size] = 'x'; + /* This one does. */ + ptr[real_size] = 'y'; + kfree(ptr); +} + +static noinline void __init copy_user_test(void) +{ + char *kmem; + char __user *usermem; + size_t size = 10; + int unused; + + kmem = kmalloc(size, GFP_KERNEL); + if (!kmem) + return; + + usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE, 0); + if (IS_ERR(usermem)) { + pr_err("Failed to allocate user memory\n"); + kfree(kmem); + return; + } + + pr_info("out-of-bounds in copy_from_user()\n"); + unused = copy_from_user(kmem, usermem, size + 1); + + pr_info("out-of-bounds in copy_to_user()\n"); + unused = copy_to_user(usermem, kmem, size + 1); + + pr_info("out-of-bounds in __copy_from_user()\n"); + unused = __copy_from_user(kmem, usermem, size + 1); + + pr_info("out-of-bounds in __copy_to_user()\n"); + unused = __copy_to_user(usermem, kmem, size + 1); + + pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); + unused = __copy_from_user_inatomic(kmem, usermem, size + 1); + + pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); + unused = __copy_to_user_inatomic(usermem, kmem, size + 1); + + pr_info("out-of-bounds in strncpy_from_user()\n"); + unused = strncpy_from_user(kmem, usermem, size + 1); + + vm_munmap((unsigned long)usermem, PAGE_SIZE); + kfree(kmem); +} + static int __init kmalloc_tests_init(void) { kmalloc_oob_right(); @@ -367,6 +434,8 @@ static int __init kmalloc_tests_init(void) kmem_cache_oob(); kasan_stack_oob(); kasan_global_oob(); + ksize_unpoisons_memory(); + copy_user_test(); return -EAGAIN; } diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 297fdb5e74bd..64e899b63337 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -38,7 +38,7 @@ MODULE_PARM_DESC(runs, "Number of test runs per variant (default: 4)"); static int max_size = 0; module_param(max_size, int, 0); -MODULE_PARM_DESC(runs, "Maximum table size (default: calculated)"); +MODULE_PARM_DESC(max_size, "Maximum table size (default: calculated)"); static bool shrinking = false; module_param(shrinking, bool, 0); diff --git a/lib/test_uuid.c b/lib/test_uuid.c new file mode 100644 index 000000000000..547d3127a3cf --- /dev/null +++ b/lib/test_uuid.c @@ -0,0 +1,133 @@ +/* + * Test cases for lib/uuid.c module. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/uuid.h> + +struct test_uuid_data { + const char *uuid; + uuid_le le; + uuid_be be; +}; + +static const struct test_uuid_data test_uuid_test_data[] = { + { + .uuid = "c33f4995-3701-450e-9fbf-206a2e98e576", + .le = UUID_LE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), + .be = UUID_BE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), + }, + { + .uuid = "64b4371c-77c1-48f9-8221-29f054fc023b", + .le = UUID_LE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), + .be = UUID_BE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), + }, + { + .uuid = "0cb4ddff-a545-4401-9d06-688af53e7f84", + .le = UUID_LE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), + .be = UUID_BE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), + }, +}; + +static const char * const test_uuid_wrong_data[] = { + "c33f4995-3701-450e-9fbf206a2e98e576 ", /* no hyphen(s) */ + "64b4371c-77c1-48f9-8221-29f054XX023b", /* invalid character(s) */ + "0cb4ddff-a545-4401-9d06-688af53e", /* not enough data */ +}; + +static unsigned total_tests __initdata; +static unsigned failed_tests __initdata; + +static void __init test_uuid_failed(const char *prefix, bool wrong, bool be, + const char *data, const char *actual) +{ + pr_err("%s test #%u %s %s data: '%s'\n", + prefix, + total_tests, + wrong ? "passed on wrong" : "failed on", + be ? "BE" : "LE", + data); + if (actual && *actual) + pr_err("%s test #%u actual data: '%s'\n", + prefix, + total_tests, + actual); + failed_tests++; +} + +static void __init test_uuid_test(const struct test_uuid_data *data) +{ + uuid_le le; + uuid_be be; + char buf[48]; + + /* LE */ + total_tests++; + if (uuid_le_to_bin(data->uuid, &le)) + test_uuid_failed("conversion", false, false, data->uuid, NULL); + + total_tests++; + if (uuid_le_cmp(data->le, le)) { + sprintf(buf, "%pUl", &le); + test_uuid_failed("cmp", false, false, data->uuid, buf); + } + + /* BE */ + total_tests++; + if (uuid_be_to_bin(data->uuid, &be)) + test_uuid_failed("conversion", false, true, data->uuid, NULL); + + total_tests++; + if (uuid_be_cmp(data->be, be)) { + sprintf(buf, "%pUb", &be); + test_uuid_failed("cmp", false, true, data->uuid, buf); + } +} + +static void __init test_uuid_wrong(const char *data) +{ + uuid_le le; + uuid_be be; + + /* LE */ + total_tests++; + if (!uuid_le_to_bin(data, &le)) + test_uuid_failed("negative", true, false, data, NULL); + + /* BE */ + total_tests++; + if (!uuid_be_to_bin(data, &be)) + test_uuid_failed("negative", true, true, data, NULL); +} + +static int __init test_uuid_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(test_uuid_test_data); i++) + test_uuid_test(&test_uuid_test_data[i]); + + for (i = 0; i < ARRAY_SIZE(test_uuid_wrong_data); i++) + test_uuid_wrong(test_uuid_wrong_data[i]); + + if (failed_tests == 0) + pr_info("all %u tests passed\n", total_tests); + else + pr_err("failed %u out of %u tests\n", failed_tests, total_tests); + + return failed_tests ? -EINVAL : 0; +} +module_init(test_uuid_init); + +static void __exit test_uuid_exit(void) +{ + /* do nothing */ +} +module_exit(test_uuid_exit); + +MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/ubsan.c b/lib/ubsan.c index 8799ae5e2e42..fb0409df1bcf 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -308,7 +308,7 @@ static void handle_object_size_mismatch(struct type_mismatch_data *data, return; ubsan_prologue(&data->location, &flags); - pr_err("%s address %pk with insufficient space\n", + pr_err("%s address %p with insufficient space\n", type_check_kinds[data->type_check_kind], (void *) ptr); pr_err("for an object of type %s\n", data->type->type_name); diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c index f0b323abb4c6..ae8d2491133c 100644 --- a/lib/ucs2_string.c +++ b/lib/ucs2_string.c @@ -56,7 +56,7 @@ ucs2_utf8size(const ucs2_char_t *src) unsigned long i; unsigned long j = 0; - for (i = 0; i < ucs2_strlen(src); i++) { + for (i = 0; src[i]; i++) { u16 c = src[i]; if (c >= 0x800) diff --git a/lib/usercopy.c b/lib/usercopy.c deleted file mode 100644 index 4f5b1ddbcd25..000000000000 --- a/lib/usercopy.c +++ /dev/null @@ -1,9 +0,0 @@ -#include <linux/export.h> -#include <linux/bug.h> -#include <linux/uaccess.h> - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/lib/uuid.c b/lib/uuid.c index 398821e4dce1..37687af77ff8 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -1,7 +1,7 @@ /* * Unified UUID/GUID definition * - * Copyright (C) 2009, Intel Corp. + * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> * * This program is free software; you can redistribute it and/or @@ -12,17 +12,40 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> +const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_le_index); +const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_be_index); + +/*************************************************************** + * Random UUID interface + * + * Used here for a Boot ID, but can be useful for other kernel + * drivers. + ***************************************************************/ + +/* + * Generate random UUID + */ +void generate_random_uuid(unsigned char uuid[16]) +{ + get_random_bytes(uuid, 16); + /* Set UUID version to 4 --- truly random generation */ + uuid[6] = (uuid[6] & 0x0F) | 0x40; + /* Set the UUID variant to DCE */ + uuid[8] = (uuid[8] & 0x3F) | 0x80; +} +EXPORT_SYMBOL(generate_random_uuid); + static void __uuid_gen_common(__u8 b[16]) { prandom_bytes(b, 16); @@ -45,3 +68,61 @@ void uuid_be_gen(uuid_be *bu) bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_be_gen); + +/** + * uuid_is_valid - checks if UUID string valid + * @uuid: UUID string to check + * + * Description: + * It checks if the UUID string is following the format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + * where x is a hex digit. + * + * Return: true if input is valid UUID string. + */ +bool uuid_is_valid(const char *uuid) +{ + unsigned int i; + + for (i = 0; i < UUID_STRING_LEN; i++) { + if (i == 8 || i == 13 || i == 18 || i == 23) { + if (uuid[i] != '-') + return false; + } else if (!isxdigit(uuid[i])) { + return false; + } + } + + return true; +} +EXPORT_SYMBOL(uuid_is_valid); + +static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16]) +{ + static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; + unsigned int i; + + if (!uuid_is_valid(uuid)) + return -EINVAL; + + for (i = 0; i < 16; i++) { + int hi = hex_to_bin(uuid[si[i] + 0]); + int lo = hex_to_bin(uuid[si[i] + 1]); + + b[ei[i]] = (hi << 4) | lo; + } + + return 0; +} + +int uuid_le_to_bin(const char *uuid, uuid_le *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_le_index); +} +EXPORT_SYMBOL(uuid_le_to_bin); + +int uuid_be_to_bin(const char *uuid, uuid_be *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_be_index); +} +EXPORT_SYMBOL(uuid_be_to_bin); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index ccb664b54280..0967771d8f7f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -30,6 +30,7 @@ #include <linux/ioport.h> #include <linux/dcache.h> #include <linux/cred.h> +#include <linux/uuid.h> #include <net/addrconf.h> #ifdef CONFIG_BLOCK #include <linux/blkdev.h> @@ -1304,19 +1305,17 @@ static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { - char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; - static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - const u8 *index = be; + const u8 *index = uuid_be_index; bool uc = false; switch (*(++fmt)) { case 'L': uc = true; /* fall-through */ case 'l': - index = le; + index = uuid_le_index; break; case 'B': uc = true; @@ -1324,7 +1323,10 @@ char *uuid_string(char *buf, char *end, const u8 *addr, } for (i = 0; i < 16; i++) { - p = hex_byte_pack(p, addr[index[i]]); + if (uc) + p = hex_byte_pack_upper(p, addr[index[i]]); + else + p = hex_byte_pack(p, addr[index[i]]); switch (i) { case 3: case 5: @@ -1337,13 +1339,6 @@ char *uuid_string(char *buf, char *end, const u8 *addr, *p = 0; - if (uc) { - p = uuid; - do { - *p = toupper(*p); - } while (*(++p)); - } - return string(buf, end, uuid, spec); } diff --git a/lib/win_minmax.c b/lib/win_minmax.c new file mode 100644 index 000000000000..c8420d404926 --- /dev/null +++ b/lib/win_minmax.c @@ -0,0 +1,98 @@ +/** + * lib/minmax.c: windowed min/max tracker + * + * Kathleen Nichols' algorithm for tracking the minimum (or maximum) + * value of a data stream over some fixed time interval. (E.g., + * the minimum RTT over the past five minutes.) It uses constant + * space and constant time per update yet almost always delivers + * the same minimum as an implementation that has to keep all the + * data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of + * the n'th best >= n-1'th best. It also makes sure that the three + * values are widely separated in the time window since that bounds + * the worse case error when that data is monotonically increasing + * over the window. + * + * Upon getting a new min, we can forget everything earlier because + * it has no value - the new min is <= everything else in the window + * by definition and it's the most recent. So we restart fresh on + * every new min and overwrites 2nd & 3rd choices. The same property + * holds for 2nd & 3rd best. + */ +#include <linux/module.h> +#include <linux/win_minmax.h> + +/* As time advances, update the 1st, 2nd, and 3rd choices. */ +static u32 minmax_subwin_update(struct minmax *m, u32 win, + const struct minmax_sample *val) +{ + u32 dt = val->t - m->s[0].t; + + if (unlikely(dt > win)) { + /* + * Passed entire window without a new val so make 2nd + * choice the new val & 3rd choice the new 2nd choice. + * we may have to iterate this since our 2nd choice + * may also be outside the window (we checked on entry + * that the third choice was in the window). + */ + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + if (unlikely(val->t - m->s[0].t > win)) { + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + } + } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { + /* + * We've passed a quarter of the window without a new val + * so take a 2nd choice from the 2nd quarter of the window. + */ + m->s[2] = m->s[1] = *val; + } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { + /* + * We've passed half the window without finding a new val + * so take a 3rd choice from the last half of the window + */ + m->s[2] = *val; + } + return m->s[0].v; +} + +/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v >= m->s[0].v) || /* found new max? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v >= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v >= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} +EXPORT_SYMBOL(minmax_running_max); + +/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v <= m->s[0].v) || /* found new min? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v <= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v <= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} |