From 3073774e638ef18d222465fe92bfc8fccb90d288 Mon Sep 17 00:00:00 2001 From: Serhii Popovych Date: Mon, 4 Dec 2017 09:36:41 -0500 Subject: KVM: PPC: Book3S HV: Drop prepare_done from struct kvm_resize_hpt Currently the kvm_resize_hpt structure has two fields relevant to the state of an ongoing resize: 'prepare_done', which indicates whether the worker thread has completed or not, and 'error' which indicates whether it was successful or not. Since the success/failure isn't known until completion, this is confusingly redundant. This patch consolidates the information into just the 'error' value: -EBUSY indicates the worked is still in progress, other negative values indicate (completed) failure, 0 indicates successful completion. As a bonus this reduces size of struct kvm_resize_hpt by __alignof__(struct kvm_hpt_info) and saves few bytes of code. While there correct comment in struct kvm_resize_hpt which references a non-existent semaphore (leftover from an early draft). Assert with WARN_ON() in case of HPT allocation thread work runs more than once for resize request or resize_hpt_allocate() returns -EBUSY that is treated specially. Change comparison against zero to make checkpatch.pl happy. Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Serhii Popovych [dwg: Changed BUG_ON()s to WARN_ON()s and altered commit message for clarity] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 44 +++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 966097232d21..f5f2c6bf5856 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -65,11 +65,17 @@ struct kvm_resize_hpt { u32 order; /* These fields protected by kvm->lock */ + + /* Possible values and their usage: + * <0 an error occurred during allocation, + * -EBUSY allocation is in the progress, + * 0 allocation made successfuly. + */ int error; - bool prepare_done; - /* Private to the work thread, until prepare_done is true, - * then protected by kvm->resize_hpt_sem */ + /* Private to the work thread, until error != -EBUSY, + * then protected by kvm->lock. + */ struct kvm_hpt_info hpt; }; @@ -1433,15 +1439,23 @@ static void resize_hpt_prepare_work(struct work_struct *work) struct kvm *kvm = resize->kvm; int err; + if (WARN_ON(resize->error != -EBUSY)) + return; + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", resize->order); err = resize_hpt_allocate(resize); + /* We have strict assumption about -EBUSY + * when preparing for HPT resize. + */ + if (WARN_ON(err == -EBUSY)) + err = -EINPROGRESS; + mutex_lock(&kvm->lock); resize->error = err; - resize->prepare_done = true; mutex_unlock(&kvm->lock); } @@ -1466,14 +1480,12 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, if (resize) { if (resize->order == shift) { - /* Suitable resize in progress */ - if (resize->prepare_done) { - ret = resize->error; - if (ret != 0) - resize_hpt_release(kvm, resize); - } else { + /* Suitable resize in progress? */ + ret = resize->error; + if (ret == -EBUSY) ret = 100; /* estimated time in ms */ - } + else if (ret) + resize_hpt_release(kvm, resize); goto out; } @@ -1493,6 +1505,8 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, ret = -ENOMEM; goto out; } + + resize->error = -EBUSY; resize->order = shift; resize->kvm = kvm; INIT_WORK(&resize->work, resize_hpt_prepare_work); @@ -1547,16 +1561,12 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, if (!resize || (resize->order != shift)) goto out; - ret = -EBUSY; - if (!resize->prepare_done) - goto out; - ret = resize->error; - if (ret != 0) + if (ret) goto out; ret = resize_hpt_rehash(resize); - if (ret != 0) + if (ret) goto out; resize_hpt_pivot(resize); -- cgit v1.2.3 From 4ed11aeefda439c76ddae3ceebcfa4fad111f149 Mon Sep 17 00:00:00 2001 From: Serhii Popovych Date: Mon, 4 Dec 2017 09:36:42 -0500 Subject: KVM: PPC: Book3S HV: Fix use after free in case of multiple resize requests When serving multiple resize requests following could happen: CPU0 CPU1 ---- ---- kvm_vm_ioctl_resize_hpt_prepare(1); -> schedule_work() /* system_rq might be busy: delay */ kvm_vm_ioctl_resize_hpt_prepare(2); mutex_lock(); if (resize) { ... release_hpt_resize(); } ... resize_hpt_prepare_work() -> schedule_work() { mutex_unlock() /* resize->kvm could be wrong */ struct kvm *kvm = resize->kvm; mutex_lock(&kvm->lock); <<<< UAF ... } i.e. a second resize request with different order could be started by kvm_vm_ioctl_resize_hpt_prepare(), causing the previous request to be free()d when there's still an active worker thread which will try to access it. This leads to a use after free in point marked with UAF on the diagram above. To prevent this from happening, instead of unconditionally releasing a pre-existing resize structure from the prepare ioctl(), we check if the existing structure has an in-progress worker. We do that by checking if the resize->error == -EBUSY, which is safe because the resize->error field is protected by the kvm->lock. If there is an active worker, instead of releasing, we mark the structure as stale by unlinking it from kvm_struct. In the worker thread we check for a stale structure (with kvm->lock held), and in that case abort, releasing the stale structure ourself. We make the check both before and the actual allocation. Strictly, only the check afterwards is needed, the check before is an optimization: if the structure happens to become stale before the worker thread is dispatched, rather than during the allocation, it means we can avoid allocating then immediately freeing a potentially substantial amount of memory. This fixes following or similar host kernel crash message: [ 635.277361] Unable to handle kernel paging request for data at address 0x00000000 [ 635.277438] Faulting instruction address: 0xc00000000052f568 [ 635.277446] Oops: Kernel access of bad area, sig: 11 [#1] [ 635.277451] SMP NR_CPUS=2048 NUMA PowerNV [ 635.277470] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter nfsv3 nfs_acl nfs lockd grace fscache kvm_hv kvm rpcrdma sunrpc ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ext4 ib_srp scsi_transport_srp ib_ipoib mbcache jbd2 rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ocrdma(T) ib_core ses enclosure scsi_transport_sas sg shpchp leds_powernv ibmpowernv i2c_opal i2c_core powernv_rng ipmi_powernv ipmi_devintf ipmi_msghandler ip_tables xfs libcrc32c sr_mod sd_mod cdrom lpfc nvme_fc(T) nvme_fabrics nvme_core ipr nvmet_fc(T) tg3 nvmet libata be2net crc_t10dif crct10dif_generic scsi_transport_fc ptp scsi_tgt pps_core crct10dif_common dm_mirror dm_region_hash dm_log dm_mod [ 635.278687] CPU: 40 PID: 749 Comm: kworker/40:1 Tainted: G ------------ T 3.10.0.bz1510771+ #1 [ 635.278782] Workqueue: events resize_hpt_prepare_work [kvm_hv] [ 635.278851] task: c0000007e6840000 ti: c0000007e9180000 task.ti: c0000007e9180000 [ 635.278919] NIP: c00000000052f568 LR: c0000000009ea310 CTR: c0000000009ea4f0 [ 635.278988] REGS: c0000007e91837f0 TRAP: 0300 Tainted: G ------------ T (3.10.0.bz1510771+) [ 635.279077] MSR: 9000000100009033 CR: 24002022 XER: 00000000 [ 635.279248] CFAR: c000000000009368 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1 GPR00: c0000000009ea310 c0000007e9183a70 c000000001250b00 c0000007e9183b10 GPR04: 0000000000000000 0000000000000000 c0000007e9183650 0000000000000000 GPR08: c0000007ffff7b80 00000000ffffffff 0000000080000028 d00000000d2529a0 GPR12: 0000000000002200 c000000007b56800 c000000000120028 c0000007f135bb40 GPR16: 0000000000000000 c000000005c1e018 c000000005c1e018 0000000000000000 GPR20: 0000000000000001 c0000000011bf778 0000000000000001 fffffffffffffef7 GPR24: 0000000000000000 c000000f1e262e50 0000000000000002 c0000007e9180000 GPR28: c000000f1e262e4c c000000f1e262e50 0000000000000000 c0000007e9183b10 [ 635.280149] NIP [c00000000052f568] __list_add+0x38/0x110 [ 635.280197] LR [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0 [ 635.280253] Call Trace: [ 635.280277] [c0000007e9183af0] [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0 [ 635.280356] [c0000007e9183b70] [c0000000009ea554] mutex_lock+0x64/0x70 [ 635.280426] [c0000007e9183ba0] [d00000000d24da04] resize_hpt_prepare_work+0xe4/0x1c0 [kvm_hv] [ 635.280507] [c0000007e9183c40] [c000000000113c0c] process_one_work+0x1dc/0x680 [ 635.280587] [c0000007e9183ce0] [c000000000114250] worker_thread+0x1a0/0x520 [ 635.280655] [c0000007e9183d80] [c00000000012010c] kthread+0xec/0x100 [ 635.280724] [c0000007e9183e30] [c00000000000a4b8] ret_from_kernel_thread+0x5c/0xa4 [ 635.280814] Instruction dump: [ 635.280880] 7c0802a6 fba1ffe8 fbc1fff0 7cbd2b78 fbe1fff8 7c9e2378 7c7f1b78 f8010010 [ 635.281099] f821ff81 e8a50008 7fa52040 40de00b8 7fbd2840 40de008c 7fbff040 [ 635.281324] ---[ end trace b628b73449719b9d ]--- Cc: stable@vger.kernel.org # v4.10+ Fixes: b5baa6877315 ("KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation") Signed-off-by: Serhii Popovych [dwg: Replaced BUG_ON()s with WARN_ONs() and reworded commit message for clarity] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 50 ++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index f5f2c6bf5856..8355398f0bb6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1419,16 +1419,20 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize) static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { - BUG_ON(kvm->arch.resize_hpt != resize); + if (WARN_ON(!mutex_is_locked(&kvm->lock))) + return; if (!resize) return; - if (resize->hpt.virt) - kvmppc_free_hpt(&resize->hpt); + if (resize->error != -EBUSY) { + if (resize->hpt.virt) + kvmppc_free_hpt(&resize->hpt); + kfree(resize); + } - kvm->arch.resize_hpt = NULL; - kfree(resize); + if (kvm->arch.resize_hpt == resize) + kvm->arch.resize_hpt = NULL; } static void resize_hpt_prepare_work(struct work_struct *work) @@ -1437,26 +1441,42 @@ static void resize_hpt_prepare_work(struct work_struct *work) struct kvm_resize_hpt, work); struct kvm *kvm = resize->kvm; - int err; + int err = 0; if (WARN_ON(resize->error != -EBUSY)) return; - resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", - resize->order); + mutex_lock(&kvm->lock); - err = resize_hpt_allocate(resize); + /* Request is still current? */ + if (kvm->arch.resize_hpt == resize) { + /* We may request large allocations here: + * do not sleep with kvm->lock held for a while. + */ + mutex_unlock(&kvm->lock); - /* We have strict assumption about -EBUSY - * when preparing for HPT resize. - */ - if (WARN_ON(err == -EBUSY)) - err = -EINPROGRESS; + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize->order); - mutex_lock(&kvm->lock); + err = resize_hpt_allocate(resize); + + /* We have strict assumption about -EBUSY + * when preparing for HPT resize. + */ + if (WARN_ON(err == -EBUSY)) + err = -EINPROGRESS; + + mutex_lock(&kvm->lock); + /* It is possible that kvm->arch.resize_hpt != resize + * after we grab kvm->lock again. + */ + } resize->error = err; + if (kvm->arch.resize_hpt != resize) + resize_hpt_release(kvm, resize); + mutex_unlock(&kvm->lock); } -- cgit v1.2.3 From cfe17c9bbe6a673fdafdab179c32b355ed447f66 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Nov 2017 21:15:13 +0900 Subject: kbuild: move cc-option and cc-disable-warning after incl. arch Makefile Geert reported commit ae6b289a3789 ("kbuild: Set KBUILD_CFLAGS before incl. arch Makefile") broke cross-compilation using a cross-compiler that supports less compiler options than the host compiler. For example, cc1: error: unrecognized command line option "-Wno-unused-but-set-variable" This problem happens on architectures that setup CROSS_COMPILE in their arch/*/Makefile. Move the cc-option and cc-disable-warning back to the original position, but keep the Clang target options untouched. Fixes: ae6b289a3789 ("kbuild: Set KBUILD_CFLAGS before incl. arch Makefile") Reported-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada Tested-by: Geert Uytterhoeven --- Makefile | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index c988e46a53cd..477c4cf01cae 100644 --- a/Makefile +++ b/Makefile @@ -484,26 +484,6 @@ CLANG_GCC_TC := --gcc-toolchain=$(GCC_TOOLCHAIN) endif KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) -KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) -KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) -KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) -KBUILD_CFLAGS += $(call cc-disable-warning, gnu) -KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) -# Quiet clang warning: comparison of unsigned expression < 0 is always false -KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) -# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the -# source of a reference will be _MergedGlobals and not on of the whitelisted names. -# See modpost pattern 2 -KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,) -KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior) -KBUILD_CFLAGS += $(call cc-option, -no-integrated-as) -KBUILD_AFLAGS += $(call cc-option, -no-integrated-as) -else - -# These warnings generated too much noise in a regular build. -# Use make W=1 to enable them (see scripts/Makefile.extrawarn) -KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) -KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) endif ifeq ($(config-targets),1) @@ -716,6 +696,29 @@ ifdef CONFIG_CC_STACKPROTECTOR endif KBUILD_CFLAGS += $(stackp-flag) +ifeq ($(cc-name),clang) +KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) +KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) +KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) +KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +# Quiet clang warning: comparison of unsigned expression < 0 is always false +KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) +# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the +# source of a reference will be _MergedGlobals and not on of the whitelisted names. +# See modpost pattern 2 +KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,) +KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior) +KBUILD_CFLAGS += $(call cc-option, -no-integrated-as) +KBUILD_AFLAGS += $(call cc-option, -no-integrated-as) +else + +# These warnings generated too much noise in a regular build. +# Use make W=1 to enable them (see scripts/Makefile.extrawarn) +KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) +KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) +endif + ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls else -- cgit v1.2.3 From 967a6a07e95c58eb9c1581d22a1d9c2d1929843f Mon Sep 17 00:00:00 2001 From: Masaharu Hayakawa Date: Wed, 13 Dec 2017 11:33:00 +0900 Subject: mmc: renesas_sdhi: Add MODULE_LICENSE The following error occurs when loading renesas_sdhi_core.c module, so add MODULE_LICENSE("GPL v2"). renesas_sdhi_core: module license 'unspecified' taints kernel. Signed-off-by: Masaharu Hayakawa Fixes: 9d08428afb72 ("mmc: renesas-sdhi: make renesas_sdhi_sys_dmac main module file") Cc: # v4.13+ [Shimoda: Added Fixes tag and Cc to the stable ML] Signed-off-by: Yoshihiro Shimoda Reviewed-by: Simon Horman Acked-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index fcf7235d5742..157e1d9e7725 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -667,3 +668,5 @@ int renesas_sdhi_remove(struct platform_device *pdev) return 0; } EXPORT_SYMBOL_GPL(renesas_sdhi_remove); + +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 257a4b018d1b514a1cc738e3ca11b566d8f3a3d8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 Dec 2017 17:34:44 +1100 Subject: xfrm: Forbid state updates from changing encap type Currently we allow state updates to competely replace the contents of x->encap. This is bad because on the user side ESP only sets up header lengths depending on encap_type once when the state is first created. This could result in the header lengths getting out of sync with the actual state configuration. In practice key managers will never do a state update to change the encapsulation type. Only the port numbers need to be changed as the peer NAT entry is updated. Therefore this patch adds a check in xfrm_state_update to forbid any changes to the encap_type. Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 500b3391f474..1e80f68e2266 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1534,8 +1534,12 @@ out: err = -EINVAL; spin_lock_bh(&x1->lock); if (likely(x1->km.state == XFRM_STATE_VALID)) { - if (x->encap && x1->encap) + if (x->encap && x1->encap && + x->encap->encap_type == x1->encap->encap_type) memcpy(x1->encap, x->encap, sizeof(*x1->encap)); + else if (x->encap || x1->encap) + goto fail; + if (x->coaddr && x1->coaddr) { memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr)); } @@ -1552,6 +1556,8 @@ out: x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); } + +fail: spin_unlock_bh(&x1->lock); xfrm_state_put(x1); -- cgit v1.2.3 From 862591bf4f519d1b8d859af720fafeaebdd0162a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Dec 2017 23:25:45 +0100 Subject: xfrm: skip policies marked as dead while rehashing syzkaller triggered following KASAN splat: BUG: KASAN: slab-out-of-bounds in xfrm_hash_rebuild+0xdbe/0xf00 net/xfrm/xfrm_policy.c:618 read of size 2 at addr ffff8801c8e92fe4 by task kworker/1:1/23 [..] Workqueue: events xfrm_hash_rebuild [..] __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:428 xfrm_hash_rebuild+0xdbe/0xf00 net/xfrm/xfrm_policy.c:618 process_one_work+0xbbf/0x1b10 kernel/workqueue.c:2112 worker_thread+0x223/0x1990 kernel/workqueue.c:2246 [..] The reproducer triggers: 1016 if (error) { 1017 list_move_tail(&walk->walk.all, &x->all); 1018 goto out; 1019 } in xfrm_policy_walk() via pfkey (it sets tiny rcv space, dump callback returns -ENOBUFS). In this case, *walk is located the pfkey socket struct, so this socket becomes visible in the global policy list. It looks like this is intentional -- phony walker has walk.dead set to 1 and all other places skip such "policies". Ccing original authors of the two commits that seem to expose this issue (first patch missed ->dead check, second patch adds pfkey sockets to policies dumper list). Fixes: 880a6fab8f6ba5b ("xfrm: configure policy hash table thresholds by netlink") Fixes: 12a169e7d8f4b1c ("ipsec: Put dumpers on the dump list") Cc: Herbert Xu Cc: Timo Teras Cc: Christophe Gouault Reported-by: syzbot Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 70aa5cb0c659..2ef6db98e9ba 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -609,7 +609,8 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { - if (xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + if (policy->walk.dead || + xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } -- cgit v1.2.3 From 06b335cb51af018d5feeff5dd4fd53847ddb675a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 18:13:05 -0600 Subject: af_key: fix buffer overread in verify_address_len() If a message sent to a PF_KEY socket ended with one of the extensions that takes a 'struct sadb_address' but there were not enough bytes remaining in the message for the ->sa_family member of the 'struct sockaddr' which is supposed to follow, then verify_address_len() read past the end of the message, into uninitialized memory. Fix it by returning -EINVAL in this case. This bug was found using syzkaller with KMSAN. Reproducer: #include #include #include int main() { int sock = socket(PF_KEY, SOCK_RAW, PF_KEY_V2); char buf[24] = { 0 }; struct sadb_msg *msg = (void *)buf; struct sadb_address *addr = (void *)(msg + 1); msg->sadb_msg_version = PF_KEY_V2; msg->sadb_msg_type = SADB_DELETE; msg->sadb_msg_len = 3; addr->sadb_address_len = 1; addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; write(sock, buf, 24); } Reported-by: Alexander Potapenko Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert --- net/key/af_key.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/key/af_key.c b/net/key/af_key.c index 3dffb892d52c..596499cc8b2f 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -401,6 +401,11 @@ static int verify_address_len(const void *p) #endif int len; + if (sp->sadb_address_len < + DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), + sizeof(uint64_t))) + return -EINVAL; + switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); -- cgit v1.2.3 From 4e765b4972af7b07adcb1feb16e7a525ce1f6b28 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 18:15:23 -0600 Subject: af_key: fix buffer overread in parse_exthdrs() If a message sent to a PF_KEY socket ended with an incomplete extension header (fewer than 4 bytes remaining), then parse_exthdrs() read past the end of the message, into uninitialized memory. Fix it by returning -EINVAL in this case. Reproducer: #include #include #include int main() { int sock = socket(PF_KEY, SOCK_RAW, PF_KEY_V2); char buf[17] = { 0 }; struct sadb_msg *msg = (void *)buf; msg->sadb_msg_version = PF_KEY_V2; msg->sadb_msg_type = SADB_DELETE; msg->sadb_msg_len = 2; write(sock, buf, 17); } Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert --- net/key/af_key.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/key/af_key.c b/net/key/af_key.c index 596499cc8b2f..d40861a048fe 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -516,6 +516,9 @@ static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void * uint16_t ext_type; int ext_len; + if (len < sizeof(*ehdr)) + return -EINVAL; + ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; -- cgit v1.2.3 From 2f10a61cee8fdb9f8da90f5db687e1862b22cf06 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Sun, 31 Dec 2017 16:18:56 +0100 Subject: xfrm: fix rcu usage in xfrm_get_type_offload request_module can sleep, thus we cannot hold rcu_read_lock() while calling it. The function also jumps back and takes rcu_read_lock() again (in xfrm_state_get_afinfo()), resulting in an imbalance. This codepath is triggered whenever a new offloaded state is created. Fixes: ffdb5211da1c ("xfrm: Auto-load xfrm offload modules") Reported-by: syzbot+ca425f44816d749e8eb49755567a75ee48cf4a30@syzkaller.appspotmail.com Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1e80f68e2266..429957412633 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -313,13 +313,14 @@ retry: if ((type && !try_module_get(type->owner))) type = NULL; + rcu_read_unlock(); + if (!type && try_load) { request_module("xfrm-offload-%d-%d", family, proto); try_load = 0; goto retry; } - rcu_read_unlock(); return type; } -- cgit v1.2.3 From 4307413256ac1e09b8f53e8715af3df9e49beec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 29 Dec 2017 09:54:25 +0000 Subject: USB: serial: cp210x: add IDs for LifeScan OneTouch Verio IQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add IDs for the OneTouch Verio IQ that comes with an embedded USB-to-serial converter. Signed-off-by: Diego Elio Pettenò Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 7c6273bf5beb..38814225a816 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -124,6 +124,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x10C4, 0x8470) }, /* Juniper Networks BX Series System Console */ { USB_DEVICE(0x10C4, 0x8477) }, /* Balluff RFID */ { USB_DEVICE(0x10C4, 0x84B6) }, /* Starizona Hyperion */ + { USB_DEVICE(0x10C4, 0x85A7) }, /* LifeScan OneTouch Verio IQ */ { USB_DEVICE(0x10C4, 0x85EA) }, /* AC-Services IBUS-IF */ { USB_DEVICE(0x10C4, 0x85EB) }, /* AC-Services CIS-IBUS */ { USB_DEVICE(0x10C4, 0x85F8) }, /* Virtenio Preon32 */ -- cgit v1.2.3 From ce9caf2f79a5aa170a4b6456a03db639eed9c988 Mon Sep 17 00:00:00 2001 From: Stefan Schake Date: Fri, 29 Dec 2017 17:05:43 +0100 Subject: drm/vc4: Move IRQ enable to PM path We were calling enable_irq on bind, where it was already enabled previously by the IRQ helper. Additionally, dev->irq is not set correctly until after postinstall and so was always zero here, triggering a warning in 4.15. Fix both by moving the enable to the power management resume path, where we know there was a previous disable invocation during suspend. Fixes: 253696ccd613 ("drm/vc4: Account for interrupts in flight") Signed-off-by: Stefan Schake Signed-off-by: Eric Anholt Link: https://patchwork.freedesktop.org/patch/msgid/1514563543-32511-1-git-send-email-stschake@gmail.com Tested-by: Stefan Wahren Reviewed-by: Eric Anholt --- drivers/gpu/drm/vc4/vc4_irq.c | 3 --- drivers/gpu/drm/vc4/vc4_v3d.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c index 26eddbb62893..3dd62d75f531 100644 --- a/drivers/gpu/drm/vc4/vc4_irq.c +++ b/drivers/gpu/drm/vc4/vc4_irq.c @@ -209,9 +209,6 @@ vc4_irq_postinstall(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); - /* Undo the effects of a previous vc4_irq_uninstall. */ - enable_irq(dev->irq); - /* Enable both the render done and out of memory interrupts. */ V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS); diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c index 622cd43840b8..493f392b3a0a 100644 --- a/drivers/gpu/drm/vc4/vc4_v3d.c +++ b/drivers/gpu/drm/vc4/vc4_v3d.c @@ -327,6 +327,9 @@ static int vc4_v3d_runtime_resume(struct device *dev) return ret; vc4_v3d_init_hw(vc4->dev); + + /* We disabled the IRQ as part of vc4_irq_uninstall in suspend. */ + enable_irq(vc4->dev->irq); vc4_irq_postinstall(vc4->dev); return 0; -- cgit v1.2.3 From 121d760d0788f95619049c63449d977065cab69d Mon Sep 17 00:00:00 2001 From: Zhi Wang Date: Fri, 29 Dec 2017 02:50:08 +0800 Subject: drm/i915/gvt: Clear the shadow page table entry after post-sync A shadow page table entry needs to be cleared after being set as post-sync. This patch fixes the recent error reported in Win7-32 test. Fixes: 2707e4446688 ("drm/i915/gvt: vGPU graphics memory virtualization") Signed-off-by: Zhi Wang CC: Stable Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/gtt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 8e331142badb..64d67ff9bf08 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -1359,12 +1359,15 @@ static int ppgtt_handle_guest_write_page_table_bytes(void *gp, return ret; } else { if (!test_bit(index, spt->post_shadow_bitmap)) { + int type = spt->shadow_page.type; + ppgtt_get_shadow_entry(spt, &se, index); ret = ppgtt_handle_guest_entry_removal(gpt, &se, index); if (ret) return ret; + ops->set_pfn(&se, vgpu->gtt.scratch_pt[type].page_mfn); + ppgtt_set_shadow_entry(spt, &se, index); } - ppgtt_set_post_shadow(spt, index); } -- cgit v1.2.3 From 2bd7b4aacdb6efa5ccd4749c365c171b884791d2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2018 23:49:18 +0100 Subject: mmc: s3mci: mark debug_regs[] as static The global array clashes with a newly added symbol of the same name: drivers/staging/ccree/cc_debugfs.o:(.data+0x0): multiple definition of `debug_regs' drivers/mmc/host/s3cmci.o:(.data+0x70): first defined here We should fix both, this one addresses the s3cmci driver by removing the symbol from the global namespace. While at it, this separates the declaration from the type definition and makes the variable const. Fixes: 9bdd203b4dc8 ("s3cmci: add debugfs support for examining driver and hardware state") Fixes: b3ec9a6736f2 ("staging: ccree: staging: ccree: replace sysfs by debugfs interface") Signed-off-by: Arnd Bergmann Signed-off-by: Ulf Hansson --- drivers/mmc/host/s3cmci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index f7f157a62a4a..555c7f133eb8 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -1424,7 +1424,9 @@ static const struct file_operations s3cmci_fops_state = { struct s3cmci_reg { unsigned short addr; unsigned char *name; -} debug_regs[] = { +}; + +static const struct s3cmci_reg debug_regs[] = { DBG_REG(CON), DBG_REG(PRE), DBG_REG(CMDARG), @@ -1446,7 +1448,7 @@ struct s3cmci_reg { static int s3cmci_regs_show(struct seq_file *seq, void *v) { struct s3cmci_host *host = seq->private; - struct s3cmci_reg *rptr = debug_regs; + const struct s3cmci_reg *rptr = debug_regs; for (; rptr->name; rptr++) seq_printf(seq, "SDI%s\t=0x%08x\n", rptr->name, -- cgit v1.2.3 From d14ac576d10f865970bb1324d337e5e24d79aaf4 Mon Sep 17 00:00:00 2001 From: Christian Holl Date: Wed, 3 Jan 2018 19:53:02 +0100 Subject: USB: serial: cp210x: add new device ID ELV ALC 8xxx This adds the ELV ALC 8xxx Battery Charging device to the list of USB IDs of drivers/usb/serial/cp210x.c Signed-off-by: Christian Holl Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 38814225a816..06d502b3e913 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -175,6 +175,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */ { USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */ { USB_DEVICE(0x18EF, 0xE025) }, /* ELV Marble Sound Board 1 */ + { USB_DEVICE(0x18EF, 0xE030) }, /* ELV ALC 8xxx Battery Charger */ { USB_DEVICE(0x18EF, 0xE032) }, /* ELV TFD500 Data Logger */ { USB_DEVICE(0x1901, 0x0190) }, /* GE B850 CP2105 Recorder interface */ { USB_DEVICE(0x1901, 0x0193) }, /* GE B650 CP2104 PMC interface */ -- cgit v1.2.3 From 06e7e776ca4d36547e503279aeff996cbb292c16 Mon Sep 17 00:00:00 2001 From: Ben Seri Date: Fri, 8 Dec 2017 15:14:47 +0100 Subject: Bluetooth: Prevent stack info leak from the EFS element. In the function l2cap_parse_conf_rsp and in the function l2cap_parse_conf_req the following variable is declared without initialization: struct l2cap_conf_efs efs; In addition, when parsing input configuration parameters in both of these functions, the switch case for handling EFS elements may skip the memcpy call that will write to the efs variable: ... case L2CAP_CONF_EFS: if (olen == sizeof(efs)) memcpy(&efs, (void *)val, olen); ... The olen in the above if is attacker controlled, and regardless of that if, in both of these functions the efs variable would eventually be added to the outgoing configuration request that is being built: l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), (unsigned long) &efs); So by sending a configuration request, or response, that contains an L2CAP_CONF_EFS element, but with an element length that is not sizeof(efs) - the memcpy to the uninitialized efs variable can be avoided, and the uninitialized variable would be returned to the attacker (16 bytes). This issue has been assigned CVE-2017-1000410 Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Cc: stable Signed-off-by: Ben Seri Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/l2cap_core.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 43ba91c440bc..fc6615d59165 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -3363,9 +3363,10 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data break; case L2CAP_CONF_EFS: - remote_efs = 1; - if (olen == sizeof(efs)) + if (olen == sizeof(efs)) { + remote_efs = 1; memcpy(&efs, (void *) val, olen); + } break; case L2CAP_CONF_EWS: @@ -3584,16 +3585,17 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, break; case L2CAP_CONF_EFS: - if (olen == sizeof(efs)) + if (olen == sizeof(efs)) { memcpy(&efs, (void *)val, olen); - if (chan->local_stype != L2CAP_SERV_NOTRAFIC && - efs.stype != L2CAP_SERV_NOTRAFIC && - efs.stype != chan->local_stype) - return -ECONNREFUSED; + if (chan->local_stype != L2CAP_SERV_NOTRAFIC && + efs.stype != L2CAP_SERV_NOTRAFIC && + efs.stype != chan->local_stype) + return -ECONNREFUSED; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs, endptr - ptr); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), + (unsigned long) &efs, endptr - ptr); + } break; case L2CAP_CONF_FCS: -- cgit v1.2.3 From b78d830f0049ef1966dc1e0ebd1ec2a594e2cf25 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 22 Dec 2017 19:23:46 -0700 Subject: usbip: fix vudc_rx: harden CMD_SUBMIT path to handle malicious input Harden CMD_SUBMIT path to handle malicious input that could trigger large memory allocations. Add checks to validate transfer_buffer_length and number_of_packets to protect against bad input requesting for unbounded memory allocations. Signed-off-by: Shuah Khan Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/vudc_rx.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/usb/usbip/vudc_rx.c b/drivers/usb/usbip/vudc_rx.c index df1e30989148..1e8a23d92cb4 100644 --- a/drivers/usb/usbip/vudc_rx.c +++ b/drivers/usb/usbip/vudc_rx.c @@ -120,6 +120,25 @@ static int v_recv_cmd_submit(struct vudc *udc, urb_p->new = 1; urb_p->seqnum = pdu->base.seqnum; + if (urb_p->ep->type == USB_ENDPOINT_XFER_ISOC) { + /* validate packet size and number of packets */ + unsigned int maxp, packets, bytes; + + maxp = usb_endpoint_maxp(urb_p->ep->desc); + maxp *= usb_endpoint_maxp_mult(urb_p->ep->desc); + bytes = pdu->u.cmd_submit.transfer_buffer_length; + packets = DIV_ROUND_UP(bytes, maxp); + + if (pdu->u.cmd_submit.number_of_packets < 0 || + pdu->u.cmd_submit.number_of_packets > packets) { + dev_err(&udc->gadget.dev, + "CMD_SUBMIT: isoc invalid num packets %d\n", + pdu->u.cmd_submit.number_of_packets); + ret = -EMSGSIZE; + goto free_urbp; + } + } + ret = alloc_urb_from_cmd(&urb_p->urb, pdu, urb_p->ep->type); if (ret) { usbip_event_add(&udc->ud, VUDC_EVENT_ERROR_MALLOC); -- cgit v1.2.3 From e1346fd87c71a1f61de1fe476ec8df1425ac931c Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 22 Dec 2017 17:00:06 -0700 Subject: usbip: remove kernel addresses from usb device and urb debug msgs usbip_dump_usb_device() and usbip_dump_urb() print kernel addresses. Remove kernel addresses from usb device and urb debug msgs and improve the message content. Instead of printing parent device and bus addresses, print parent device and bus names. Signed-off-by: Shuah Khan Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/usbip_common.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 7b219d9109b4..ee2bbce24584 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -91,7 +91,7 @@ static void usbip_dump_usb_device(struct usb_device *udev) dev_dbg(dev, " devnum(%d) devpath(%s) usb speed(%s)", udev->devnum, udev->devpath, usb_speed_string(udev->speed)); - pr_debug("tt %p, ttport %d\n", udev->tt, udev->ttport); + pr_debug("tt hub ttport %d\n", udev->ttport); dev_dbg(dev, " "); for (i = 0; i < 16; i++) @@ -124,12 +124,8 @@ static void usbip_dump_usb_device(struct usb_device *udev) } pr_debug("\n"); - dev_dbg(dev, "parent %p, bus %p\n", udev->parent, udev->bus); - - dev_dbg(dev, - "descriptor %p, config %p, actconfig %p, rawdescriptors %p\n", - &udev->descriptor, udev->config, - udev->actconfig, udev->rawdescriptors); + dev_dbg(dev, "parent %s, bus %s\n", dev_name(&udev->parent->dev), + udev->bus->bus_name); dev_dbg(dev, "have_langid %d, string_langid %d\n", udev->have_langid, udev->string_langid); @@ -237,9 +233,6 @@ void usbip_dump_urb(struct urb *urb) dev = &urb->dev->dev; - dev_dbg(dev, " urb :%p\n", urb); - dev_dbg(dev, " dev :%p\n", urb->dev); - usbip_dump_usb_device(urb->dev); dev_dbg(dev, " pipe :%08x ", urb->pipe); @@ -248,11 +241,9 @@ void usbip_dump_urb(struct urb *urb) dev_dbg(dev, " status :%d\n", urb->status); dev_dbg(dev, " transfer_flags :%08X\n", urb->transfer_flags); - dev_dbg(dev, " transfer_buffer :%p\n", urb->transfer_buffer); dev_dbg(dev, " transfer_buffer_length:%d\n", urb->transfer_buffer_length); dev_dbg(dev, " actual_length :%d\n", urb->actual_length); - dev_dbg(dev, " setup_packet :%p\n", urb->setup_packet); if (urb->setup_packet && usb_pipetype(urb->pipe) == PIPE_CONTROL) usbip_dump_usb_ctrlrequest( @@ -262,8 +253,6 @@ void usbip_dump_urb(struct urb *urb) dev_dbg(dev, " number_of_packets :%d\n", urb->number_of_packets); dev_dbg(dev, " interval :%d\n", urb->interval); dev_dbg(dev, " error_count :%d\n", urb->error_count); - dev_dbg(dev, " context :%p\n", urb->context); - dev_dbg(dev, " complete :%p\n", urb->complete); } EXPORT_SYMBOL_GPL(usbip_dump_urb); -- cgit v1.2.3 From 5fd77a3a0e408c23ab4002a57db980e46bc16e72 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 22 Dec 2017 19:23:47 -0700 Subject: usbip: vudc_tx: fix v_send_ret_submit() vulnerability to null xfer buffer v_send_ret_submit() handles urb with a null transfer_buffer, when it replays a packet with potential malicious data that could contain a null buffer. Add a check for the condition when actual_length > 0 and transfer_buffer is null. Signed-off-by: Shuah Khan Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/vudc_tx.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/usb/usbip/vudc_tx.c b/drivers/usb/usbip/vudc_tx.c index 1440ae0919ec..3ccb17c3e840 100644 --- a/drivers/usb/usbip/vudc_tx.c +++ b/drivers/usb/usbip/vudc_tx.c @@ -85,6 +85,13 @@ static int v_send_ret_submit(struct vudc *udc, struct urbp *urb_p) memset(&pdu_header, 0, sizeof(pdu_header)); memset(&msg, 0, sizeof(msg)); + if (urb->actual_length > 0 && !urb->transfer_buffer) { + dev_err(&udc->gadget.dev, + "urb: actual_length %d transfer_buffer null\n", + urb->actual_length); + return -1; + } + if (urb_p->type == USB_ENDPOINT_XFER_ISOC) iovnum = 2 + urb->number_of_packets; else @@ -100,8 +107,8 @@ static int v_send_ret_submit(struct vudc *udc, struct urbp *urb_p) /* 1. setup usbip_header */ setup_ret_submit_pdu(&pdu_header, urb_p); - usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n", - pdu_header.base.seqnum, urb); + usbip_dbg_stub_tx("setup txdata seqnum: %d\n", + pdu_header.base.seqnum); usbip_header_correct_endian(&pdu_header, 1); iov[iovnum].iov_base = &pdu_header; -- cgit v1.2.3 From 9a00674213a3f00394f4e3221b88f2d21fc05789 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 14:30:19 -0600 Subject: crypto: algapi - fix NULL dereference in crypto_remove_spawns() syzkaller triggered a NULL pointer dereference in crypto_remove_spawns() via a program that repeatedly and concurrently requests AEADs "authenc(cmac(des3_ede-asm),pcbc-aes-aesni)" and hashes "cmac(des3_ede)" through AF_ALG, where the hashes are requested as "untested" (CRYPTO_ALG_TESTED is set in ->salg_mask but clear in ->salg_feat; this causes the template to be instantiated for every request). Although AF_ALG users really shouldn't be able to request an "untested" algorithm, the NULL pointer dereference is actually caused by a longstanding race condition where crypto_remove_spawns() can encounter an instance which has had spawn(s) "grabbed" but hasn't yet been registered, resulting in ->cra_users still being NULL. We probably should properly initialize ->cra_users earlier, but that would require updating many templates individually. For now just fix the bug in a simple way that can easily be backported: make crypto_remove_spawns() treat a NULL ->cra_users list as empty. Reported-by: syzbot Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/algapi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/crypto/algapi.c b/crypto/algapi.c index 60d7366ed343..9a636f961572 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -167,6 +167,18 @@ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, spawn->alg = NULL; spawns = &inst->alg.cra_users; + + /* + * We may encounter an unregistered instance here, since + * an instance's spawns are set up prior to the instance + * being registered. An unregistered instance will have + * NULL ->cra_users.next, since ->cra_users isn't + * properly initialized until registration. But an + * unregistered instance cannot have any users, so treat + * it the same as ->cra_users being empty. + */ + if (spawns->next == NULL) + break; } } while ((spawns = crypto_more_spawns(alg, &stack, &top, &secondary_spawns))); -- cgit v1.2.3 From d16b46e4fd8bc6063624605f25b8c0835bb1fbe3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 4 Jan 2018 22:25:07 +1100 Subject: xfrm: Use __skb_queue_tail in xfrm_trans_queue We do not need locking in xfrm_trans_queue because it is designed to use per-CPU buffers. However, the original code incorrectly used skb_queue_tail which takes the lock. This patch switches it to __skb_queue_tail instead. Reported-and-tested-by: Artem Savkov Fixes: acf568ee859f ("xfrm: Reinject transport-mode packets...") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 3f6f6f8c9fa5..5b2409746ae0 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -518,7 +518,7 @@ int xfrm_trans_queue(struct sk_buff *skb, return -ENOBUFS; XFRM_TRANS_SKB_CB(skb)->finish = finish; - skb_queue_tail(&trans->queue, skb); + __skb_queue_tail(&trans->queue, skb); tasklet_schedule(&trans->tasklet); return 0; } -- cgit v1.2.3 From 9059a3493efea6492451430c7e2fa0af799a2abb Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 16 Nov 2017 20:06:39 -0500 Subject: kconfig: fix relational operators for bool and tristate symbols Since commit 31847b67bec0 ("kconfig: allow use of relations other than (in)equality") it is possible to use relational operators in Kconfig statements. However, those operators give unexpected results when applied to bool/tristate values: (n < y) = y (correct) (m < y) = y (correct) (n < m) = n (wrong) This happens because relational operators process bool and tristate symbols as strings and m sorts before n. It makes little sense to do a lexicographical compare on bool and tristate values though. Documentation/kbuild/kconfig-language.txt states that expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 respectively for calculations). Let's make it so for relational comparisons with bool/tristate expressions as well and document them. If at least one symbol is an actual string then the lexicographical compare works just as before. Signed-off-by: Nicolas Pitre Acked-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Masahiro Yamada --- Documentation/kbuild/kconfig-language.txt | 23 +++++++++++++++-------- scripts/kconfig/expr.c | 5 ++++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index 262722d8867b..c4a293a03c33 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -200,10 +200,14 @@ module state. Dependency expressions have the following syntax: ::= (1) '=' (2) '!=' (3) - '(' ')' (4) - '!' (5) - '&&' (6) - '||' (7) + '<' (4) + '>' (4) + '<=' (4) + '>=' (4) + '(' ')' (5) + '!' (6) + '&&' (7) + '||' (8) Expressions are listed in decreasing order of precedence. @@ -214,10 +218,13 @@ Expressions are listed in decreasing order of precedence. otherwise 'n'. (3) If the values of both symbols are equal, it returns 'n', otherwise 'y'. -(4) Returns the value of the expression. Used to override precedence. -(5) Returns the result of (2-/expr/). -(6) Returns the result of min(/expr/, /expr/). -(7) Returns the result of max(/expr/, /expr/). +(4) If value of is respectively lower, greater, lower-or-equal, + or greater-or-equal than value of , it returns 'y', + otherwise 'n'. +(5) Returns the value of the expression. Used to override precedence. +(6) Returns the result of (2-/expr/). +(7) Returns the result of min(/expr/, /expr/). +(8) Returns the result of max(/expr/, /expr/). An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 respectively for calculations). A menu entry becomes visible when its diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c index cbf4996dd9c1..8cee597d33a5 100644 --- a/scripts/kconfig/expr.c +++ b/scripts/kconfig/expr.c @@ -893,7 +893,10 @@ static enum string_value_kind expr_parse_string(const char *str, switch (type) { case S_BOOLEAN: case S_TRISTATE: - return k_string; + val->s = !strcmp(str, "n") ? 0 : + !strcmp(str, "m") ? 1 : + !strcmp(str, "y") ? 2 : -1; + return k_signed; case S_INT: val->s = strtoll(str, &tail, 10); kind = k_signed; -- cgit v1.2.3 From 7729bebc619307a0233c86f8585a4bf3eadc7ce4 Mon Sep 17 00:00:00 2001 From: Valentin Ilie Date: Fri, 5 Jan 2018 23:12:59 +0000 Subject: ia64, sched/cputime: Fix build error if CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y Remove the extra parenthesis. This bug was introduced by: e2339a4caa5e: ("ia64: Convert vtime to use nsec units directly") Signed-off-by: Valentin Ilie Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: linux-ia64@vger.kernel.org Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1515193979-24873-1-git-send-email-valentin.ilie@gmail.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index c6ecb97151a2..9025699049ca 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -88,7 +88,7 @@ void vtime_flush(struct task_struct *tsk) } if (ti->softirq_time) { - delta = cycle_to_nsec(ti->softirq_time)); + delta = cycle_to_nsec(ti->softirq_time); account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ); } -- cgit v1.2.3 From b94b7373317164402ff7728d10f7023127a02b60 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Mon, 1 Jan 2018 10:04:47 +0800 Subject: x86/microcode/intel: Extend BDW late-loading with a revision check Instead of blacklisting all model 79 CPUs when attempting a late microcode loading, limit that only to CPUs with microcode revisions < 0x0b000021 because only on those late loading may cause a system hang. For such processors either: a) a BIOS update which might contain a newer microcode revision or b) the early microcode loading method should be considered. Processors with revisions 0x0b000021 or higher will not experience such hangs. For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. [ bp: Heavily massage commit message and pr_* statements. ] Fixes: 723f2828a98c ("x86/microcode/intel: Disable late loading on model 79") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Cc: x86-ml Cc: # v4.14 Link: http://lkml.kernel.org/r/1514772287-92959-1-git-send-email-qianyue.zj@alibaba-inc.com --- arch/x86/kernel/cpu/microcode/intel.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8ccdca6d3f9e..d9e460fc7a3b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -910,8 +910,17 @@ static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { - pr_err_once("late loading on model 79 is disabled.\n"); + /* + * Late loading on model 79 with microcode revision less than 0x0b000021 + * may result in a system hang. This behavior is documented in item + * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + */ + if (c->x86 == 6 && + c->x86_model == INTEL_FAM6_BROADWELL_X && + c->x86_mask == 0x01 && + c->microcode < 0x0b000021) { + pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } -- cgit v1.2.3 From de53c3786a3ce162a1c815d0c04c766c23ec9c0a Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 5 Jan 2018 22:35:41 +0100 Subject: x86/pti: Unbreak EFI old_memmap EFI_OLD_MEMMAP's efi_call_phys_prolog() calls set_pgd() with swapper PGD that has PAGE_USER set, which makes PTI set NX on it, and therefore EFI can't execute it's code. Fix that by forcefully clearing _PAGE_NX from the PGD (this can't be done by the pgprot API). _PAGE_NX will be automatically reintroduced in efi_call_phys_epilog(), as _set_pgd() will again notice that this is _PAGE_USER, and set _PAGE_NX on it. Tested-by: Dimitri Sivanich Signed-off-by: Jiri Kosina Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Acked-by: Dave Hansen Cc: Andrea Arcangeli Cc: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1801052215460.11852@cbobk.fhfr.pm --- arch/x86/platform/efi/efi_64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 39c4b35ac7a4..61975b6bcb1a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -134,7 +134,9 @@ pgd_t * __init efi_call_phys_prolog(void) pud[j] = *pud_offset(p4d_k, vaddr); } } + pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX; } + out: __flush_tlb_all(); -- cgit v1.2.3 From 01c9b17bf673b05bb401b76ec763e9730ccf1376 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 5 Jan 2018 09:44:36 -0800 Subject: x86/Documentation: Add PTI description Add some details about how PTI works, what some of the downsides are, and how to debug it when things go wrong. Also document the kernel parameter: 'pti/nopti'. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Randy Dunlap Reviewed-by: Kees Cook Cc: Moritz Lipp Cc: Daniel Gruss Cc: Michael Schwarz Cc: Richard Fellner Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Hugh Dickins Cc: Andi Lutomirsky Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180105174436.1BC6FA2B@viggo.jf.intel.com --- Documentation/admin-guide/kernel-parameters.txt | 21 ++- Documentation/x86/pti.txt | 186 ++++++++++++++++++++++++ 2 files changed, 200 insertions(+), 7 deletions(-) create mode 100644 Documentation/x86/pti.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 520fdec15bbb..905991745d26 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2685,8 +2685,6 @@ steal time is computed, but won't influence scheduler behaviour - nopti [X86-64] Disable kernel page table isolation - nolapic [X86-32,APIC] Do not enable or use the local APIC. nolapic_timer [X86-32,APIC] Do not use the local APIC timer. @@ -3255,11 +3253,20 @@ pt. [PARIDE] See Documentation/blockdev/paride.txt. - pti= [X86_64] - Control user/kernel address space isolation: - on - enable - off - disable - auto - default setting + pti= [X86_64] Control Page Table Isolation of user and + kernel address spaces. Disabling this feature + removes hardening, but improves performance of + system calls and interrupts. + + on - unconditionally enable + off - unconditionally disable + auto - kernel detects whether your CPU model is + vulnerable to issues that PTI mitigates + + Not specifying this option is equivalent to pti=auto. + + nopti [X86_64] + Equivalent to pti=off pty.legacy_count= [KNL] Number of legacy pty's. Overwrites compiled-in diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt new file mode 100644 index 000000000000..d11eff61fc9a --- /dev/null +++ b/Documentation/x86/pti.txt @@ -0,0 +1,186 @@ +Overview +======== + +Page Table Isolation (pti, previously known as KAISER[1]) is a +countermeasure against attacks on the shared user/kernel address +space such as the "Meltdown" approach[2]. + +To mitigate this class of attacks, we create an independent set of +page tables for use only when running userspace applications. When +the kernel is entered via syscalls, interrupts or exceptions, the +page tables are switched to the full "kernel" copy. When the system +switches back to user mode, the user copy is used again. + +The userspace page tables contain only a minimal amount of kernel +data: only what is needed to enter/exit the kernel such as the +entry/exit functions themselves and the interrupt descriptor table +(IDT). There are a few strictly unnecessary things that get mapped +such as the first C function when entering an interrupt (see +comments in pti.c). + +This approach helps to ensure that side-channel attacks leveraging +the paging structures do not function when PTI is enabled. It can be +enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. +Once enabled at compile-time, it can be disabled at boot with the +'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). + +Page Table Management +===================== + +When PTI is enabled, the kernel manages two sets of page tables. +The first set is very similar to the single set which is present in +kernels without PTI. This includes a complete mapping of userspace +that the kernel can use for things like copy_to_user(). + +Although _complete_, the user portion of the kernel page tables is +crippled by setting the NX bit in the top level. This ensures +that any missed kernel->user CR3 switch will immediately crash +userspace upon executing its first instruction. + +The userspace page tables map only the kernel data needed to enter +and exit the kernel. This data is entirely contained in the 'struct +cpu_entry_area' structure which is placed in the fixmap which gives +each CPU's copy of the area a compile-time-fixed virtual address. + +For new userspace mappings, the kernel makes the entries in its +page tables like normal. The only difference is when the kernel +makes entries in the top (PGD) level. In addition to setting the +entry in the main kernel PGD, a copy of the entry is made in the +userspace page tables' PGD. + +This sharing at the PGD level also inherently shares all the lower +layers of the page tables. This leaves a single, shared set of +userspace page tables to manage. One PTE to lock, one set of +accessed bits, dirty bits, etc... + +Overhead +======== + +Protection against side-channel attacks is important. But, +this protection comes at a cost: + +1. Increased Memory Use + a. Each process now needs an order-1 PGD instead of order-0. + (Consumes an additional 4k per process). + b. The 'cpu_entry_area' structure must be 2MB in size and 2MB + aligned so that it can be mapped by setting a single PMD + entry. This consumes nearly 2MB of RAM once the kernel + is decompressed, but no space in the kernel image itself. + +2. Runtime Cost + a. CR3 manipulation to switch between the page table copies + must be done at interrupt, syscall, and exception entry + and exit (it can be skipped when the kernel is interrupted, + though.) Moves to CR3 are on the order of a hundred + cycles, and are required at every entry and exit. + b. A "trampoline" must be used for SYSCALL entry. This + trampoline depends on a smaller set of resources than the + non-PTI SYSCALL entry code, so requires mapping fewer + things into the userspace page tables. The downside is + that stacks must be switched at entry time. + d. Global pages are disabled for all kernel structures not + mapped into both kernel and userspace page tables. This + feature of the MMU allows different processes to share TLB + entries mapping the kernel. Losing the feature means more + TLB misses after a context switch. The actual loss of + performance is very small, however, never exceeding 1%. + d. Process Context IDentifiers (PCID) is a CPU feature that + allows us to skip flushing the entire TLB when switching page + tables by setting a special bit in CR3 when the page tables + are changed. This makes switching the page tables (at context + switch, or kernel entry/exit) cheaper. But, on systems with + PCID support, the context switch code must flush both the user + and kernel entries out of the TLB. The user PCID TLB flush is + deferred until the exit to userspace, minimizing the cost. + See intel.com/sdm for the gory PCID/INVPCID details. + e. The userspace page tables must be populated for each new + process. Even without PTI, the shared kernel mappings + are created by copying top-level (PGD) entries into each + new process. But, with PTI, there are now *two* kernel + mappings: one in the kernel page tables that maps everything + and one for the entry/exit structures. At fork(), we need to + copy both. + f. In addition to the fork()-time copying, there must also + be an update to the userspace PGD any time a set_pgd() is done + on a PGD used to map userspace. This ensures that the kernel + and userspace copies always map the same userspace + memory. + g. On systems without PCID support, each CR3 write flushes + the entire TLB. That means that each syscall, interrupt + or exception flushes the TLB. + h. INVPCID is a TLB-flushing instruction which allows flushing + of TLB entries for non-current PCIDs. Some systems support + PCIDs, but do not support INVPCID. On these systems, addresses + can only be flushed from the TLB for the current PCID. When + flushing a kernel address, we need to flush all PCIDs, so a + single kernel address flush will require a TLB-flushing CR3 + write upon the next use of every PCID. + +Possible Future Work +==================== +1. We can be more careful about not actually writing to CR3 + unless its value is actually changed. +2. Allow PTI to be enabled/disabled at runtime in addition to the + boot-time switching. + +Testing +======== + +To test stability of PTI, the following test procedure is recommended, +ideally doing all of these in parallel: + +1. Set CONFIG_DEBUG_ENTRY=y +2. Run several copies of all of the tools/testing/selftests/x86/ tests + (excluding MPX and protection_keys) in a loop on multiple CPUs for + several minutes. These tests frequently uncover corner cases in the + kernel entry code. In general, old kernels might cause these tests + themselves to crash, but they should never crash the kernel. +3. Run the 'perf' tool in a mode (top or record) that generates many + frequent performance monitoring non-maskable interrupts (see "NMI" + in /proc/interrupts). This exercises the NMI entry/exit code which + is known to trigger bugs in code paths that did not expect to be + interrupted, including nested NMIs. Using "-c" boosts the rate of + NMIs, and using two -c with separate counters encourages nested NMIs + and less deterministic behavior. + + while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done + +4. Launch a KVM virtual machine. +5. Run 32-bit binaries on systems supporting the SYSCALL instruction. + This has been a lightly-tested code path and needs extra scrutiny. + +Debugging +========= + +Bugs in PTI cause a few different signatures of crashes +that are worth noting here. + + * Failures of the selftests/x86 code. Usually a bug in one of the + more obscure corners of entry_64.S + * Crashes in early boot, especially around CPU bringup. Bugs + in the trampoline code or mappings cause these. + * Crashes at the first interrupt. Caused by bugs in entry_64.S, + like screwing up a page table switch. Also caused by + incorrectly mapping the IRQ handler entry code. + * Crashes at the first NMI. The NMI code is separate from main + interrupt handlers and can have bugs that do not affect + normal interrupts. Also caused by incorrectly mapping NMI + code. NMIs that interrupt the entry code must be very + careful and can be the cause of crashes that show up when + running perf. + * Kernel crashes at the first exit to userspace. entry_64.S + bugs, or failing to map some of the exit code. + * Crashes at first interrupt that interrupts userspace. The paths + in entry_64.S that return to userspace are sometimes separate + from the ones that return to the kernel. + * Double faults: overflowing the kernel stack because of page + faults upon page faults. Caused by touching non-pti-mapped + data in the entry code, or forgetting to switch to kernel + CR3 before calling into C functions which are not pti-mapped. + * Userspace segfaults early in boot, sometimes manifesting + as mount(8) failing to mount the rootfs. These have + tended to be TLB invalidation issues. Usually invalidating + the wrong PCID, or otherwise missing an invalidation. + +1. https://gruss.cc/files/kaiser.pdf +2. https://meltdownattack.com/meltdown.pdf -- cgit v1.2.3 From 99c6fa2511d8a683e61468be91b83f85452115fa Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 6 Jan 2018 11:49:23 +0000 Subject: x86/cpufeatures: Add X86_BUG_SPECTRE_V[12] Add the bug bits for spectre v1/2 and force them unconditionally for all cpus. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kernel/cpu/common.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 21ac898df2d8..1641c2f96363 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -342,5 +342,7 @@ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2d3bd2215e5b..372ba3fb400f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -902,6 +902,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_AMD) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + fpu__init_system(c); #ifdef CONFIG_X86_32 -- cgit v1.2.3 From e2d5915293ffdff977ddcfc12b817b08c53ffa7a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 8 Jan 2018 14:54:32 +1100 Subject: powerpc/pseries: Make RAS IRQ explicitly dependent on DLPAR WQ The hotplug code uses its own workqueue to handle IRQ requests (pseries_hp_wq), however that workqueue is initialized after init_ras_IRQ(). That can lead to a kernel panic if any hotplug interrupts fire after init_ras_IRQ() but before pseries_hp_wq is initialised. eg: UDP-Lite hash table entries: 2048 (order: 0, 65536 bytes) NET: Registered protocol family 1 Unpacking initramfs... (qemu) object_add memory-backend-ram,id=mem1,size=10G (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 Unable to handle kernel paging request for data at address 0xf94d03007c421378 Faulting instruction address: 0xc00000000012d744 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.15.0-rc2-ziviani+ #26 task: (ptrval) task.stack: (ptrval) NIP: c00000000012d744 LR: c00000000012d744 CTR: 0000000000000000 REGS: (ptrval) TRAP: 0380 Not tainted (4.15.0-rc2-ziviani+) MSR: 8000000000009033 CR: 28088042 XER: 20040000 CFAR: c00000000012d3c4 SOFTE: 0 ... NIP [c00000000012d744] __queue_work+0xd4/0x5c0 LR [c00000000012d744] __queue_work+0xd4/0x5c0 Call Trace: [c0000000fffefb90] [c00000000012d744] __queue_work+0xd4/0x5c0 (unreliable) [c0000000fffefc70] [c00000000012dce4] queue_work_on+0xb4/0xf0 This commit makes the RAS IRQ registration explicitly dependent on the creation of the pseries_hp_wq. Reported-by: Min Deng Reported-by: Daniel Henrique Barboza Tested-by: Jose Ricardo Ziviani Signed-off-by: Michael Ellerman Reviewed-by: David Gibson --- arch/powerpc/platforms/pseries/dlpar.c | 21 ++++++++++++++++++--- arch/powerpc/platforms/pseries/pseries.h | 2 ++ arch/powerpc/platforms/pseries/ras.c | 3 ++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 6e35780c5962..a0b20c03f078 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -574,11 +574,26 @@ static ssize_t dlpar_show(struct class *class, struct class_attribute *attr, static CLASS_ATTR_RW(dlpar); -static int __init pseries_dlpar_init(void) +int __init dlpar_workqueue_init(void) { + if (pseries_hp_wq) + return 0; + pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue", - WQ_UNBOUND, 1); + WQ_UNBOUND, 1); + + return pseries_hp_wq ? 0 : -ENOMEM; +} + +static int __init dlpar_sysfs_init(void) +{ + int rc; + + rc = dlpar_workqueue_init(); + if (rc) + return rc; + return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr); } -machine_device_initcall(pseries, pseries_dlpar_init); +machine_device_initcall(pseries, dlpar_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 4470a3194311..1ae1d9f4dbe9 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -98,4 +98,6 @@ static inline unsigned long cmo_get_page_size(void) return CMO_PageSize; } +int dlpar_workqueue_init(void); + #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 4923ffe230cf..81d8614e7379 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -69,7 +69,8 @@ static int __init init_ras_IRQ(void) /* Hotplug Events */ np = of_find_node_by_path("/event-sources/hot-plug-events"); if (np != NULL) { - request_event_sources_irqs(np, ras_hotplug_interrupt, + if (dlpar_workqueue_init() == 0) + request_event_sources_irqs(np, ras_hotplug_interrupt, "RAS_HOTPLUG"); of_node_put(np); } -- cgit v1.2.3 From 65e7439204b57b7a7f6e4694f9e2a9adde5e77ed Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 21 Dec 2017 10:29:32 +0800 Subject: drm/i915/gvt: Fix stack-out-of-bounds bug in cmd parser for_each_set_bit() only accepts variable of type unsigned long, and we can not cast it from smaller types. [ 16.499365] ================================================================== [ 16.506655] BUG: KASAN: stack-out-of-bounds in find_first_bit+0x1d/0x70 [ 16.513313] Read of size 8 at addr ffff8803616cf510 by task systemd-udevd/180 [ 16.521998] CPU: 0 PID: 180 Comm: systemd-udevd Tainted: G U O 4.15.0-rc3+ #14 [ 16.530317] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.2.8 01/26/2016 [ 16.537760] Call Trace: [ 16.540230] dump_stack+0x7c/0xbb [ 16.543569] print_address_description+0x6b/0x290 [ 16.548306] kasan_report+0x28a/0x370 [ 16.551993] ? find_first_bit+0x1d/0x70 [ 16.555858] find_first_bit+0x1d/0x70 [ 16.559625] intel_gvt_init_cmd_parser+0x127/0x3c0 [i915] [ 16.565060] ? __lock_is_held+0x8f/0xf0 [ 16.568990] ? intel_gvt_clean_cmd_parser+0x10/0x10 [i915] [ 16.574514] ? __hrtimer_init+0x5d/0xb0 [ 16.578445] intel_gvt_init_device+0x2c3/0x690 [i915] [ 16.583537] ? unregister_module_notifier+0x20/0x20 [ 16.588515] intel_gvt_init+0x89/0x100 [i915] [ 16.592962] i915_driver_load+0x1992/0x1c70 [i915] [ 16.597846] ? __i915_printk+0x210/0x210 [i915] [ 16.602410] ? wait_for_completion+0x280/0x280 [ 16.606883] ? lock_downgrade+0x2c0/0x2c0 [ 16.610923] ? __pm_runtime_resume+0x46/0x90 [ 16.615238] ? acpi_dev_found+0x76/0x80 [ 16.619162] ? i915_pci_remove+0x30/0x30 [i915] [ 16.623733] local_pci_probe+0x74/0xe0 [ 16.627518] pci_device_probe+0x208/0x310 [ 16.631561] ? pci_device_remove+0x100/0x100 [ 16.635871] ? __list_add_valid+0x29/0xa0 [ 16.639919] driver_probe_device+0x40b/0x6b0 [ 16.644223] ? driver_probe_device+0x6b0/0x6b0 [ 16.648696] __driver_attach+0x11d/0x130 [ 16.652649] bus_for_each_dev+0xe7/0x160 [ 16.656600] ? subsys_dev_iter_exit+0x10/0x10 [ 16.660987] ? __list_add_valid+0x29/0xa0 [ 16.665028] bus_add_driver+0x31d/0x3a0 [ 16.668893] driver_register+0xc6/0x170 [ 16.672758] ? 0xffffffffc0ad8000 [ 16.676108] do_one_initcall+0x9c/0x206 [ 16.679984] ? initcall_blacklisted+0x150/0x150 [ 16.684545] ? do_init_module+0x35/0x33b [ 16.688494] ? kasan_unpoison_shadow+0x31/0x40 [ 16.692968] ? kasan_kmalloc+0xa6/0xd0 [ 16.696743] ? do_init_module+0x35/0x33b [ 16.700694] ? kasan_unpoison_shadow+0x31/0x40 [ 16.705168] ? __asan_register_globals+0x82/0xa0 [ 16.709819] do_init_module+0xe7/0x33b [ 16.713597] load_module+0x4481/0x4ce0 [ 16.717397] ? module_frob_arch_sections+0x20/0x20 [ 16.722228] ? vfs_read+0x13b/0x190 [ 16.725742] ? kernel_read+0x74/0xa0 [ 16.729351] ? get_user_arg_ptr.isra.17+0x70/0x70 [ 16.734099] ? SYSC_finit_module+0x175/0x1b0 [ 16.738399] SYSC_finit_module+0x175/0x1b0 [ 16.742524] ? SYSC_init_module+0x1e0/0x1e0 [ 16.746741] ? __fget+0x157/0x240 [ 16.750090] ? trace_hardirqs_on_thunk+0x1a/0x1c [ 16.754747] entry_SYSCALL_64_fastpath+0x23/0x9a [ 16.759397] RIP: 0033:0x7f8fbc837499 [ 16.762996] RSP: 002b:00007ffead76c138 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 16.770618] RAX: ffffffffffffffda RBX: 0000000000000012 RCX: 00007f8fbc837499 [ 16.777800] RDX: 0000000000000000 RSI: 000056484e67b080 RDI: 0000000000000012 [ 16.784979] RBP: 00007ffead76b140 R08: 0000000000000000 R09: 0000000000000021 [ 16.792164] R10: 0000000000000012 R11: 0000000000000246 R12: 000056484e67b460 [ 16.799345] R13: 00007ffead76b120 R14: 0000000000000005 R15: 0000000000000000 [ 16.808052] The buggy address belongs to the page: [ 16.812876] page:00000000dc4b8c1e count:0 mapcount:0 mapping: (null) index:0x0 [ 16.820934] flags: 0x17ffffc0000000() [ 16.824621] raw: 0017ffffc0000000 0000000000000000 0000000000000000 00000000ffffffff [ 16.832416] raw: ffffea000d85b3e0 ffffea000d85b3e0 0000000000000000 0000000000000000 [ 16.840208] page dumped because: kasan: bad access detected [ 16.847318] Memory state around the buggy address: [ 16.852143] ffff8803616cf400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 16.859427] ffff8803616cf480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 [ 16.866708] >ffff8803616cf500: f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00 00 00 00 00 00 [ 16.873988] ^ [ 16.877770] ffff8803616cf580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 16.885042] ffff8803616cf600: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 [ 16.892312] ================================================================== Signed-off-by: Changbin Du Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/cmd_parser.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index 701a3c6f1669..9d12090939e3 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -2777,12 +2777,12 @@ int intel_gvt_scan_and_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx) } static struct cmd_info *find_cmd_entry_any_ring(struct intel_gvt *gvt, - unsigned int opcode, int rings) + unsigned int opcode, unsigned long rings) { struct cmd_info *info = NULL; unsigned int ring; - for_each_set_bit(ring, (unsigned long *)&rings, I915_NUM_ENGINES) { + for_each_set_bit(ring, &rings, I915_NUM_ENGINES) { info = find_cmd_entry(gvt, opcode, ring); if (info) break; -- cgit v1.2.3 From bcfd09f7837f5240c30fd2f52ee7293516641faa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 5 Jan 2018 22:12:32 +1100 Subject: xfrm: Return error on unknown encap_type in init_state Currently esp will happily create an xfrm state with an unknown encap type for IPv4, without setting the necessary state parameters. This patch fixes it by returning -EINVAL. There is a similar problem in IPv6 where if the mode is unknown we will skip initialisation while returning zero. However, this is harmless as the mode has already been checked further up the stack. This patch removes this anomaly by aligning the IPv6 behaviour with IPv4 and treating unknown modes (which cannot actually happen) as transport mode. Fixes: 38320c70d282 ("[IPSEC]: Use crypto_aead and authenc in ESP") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 1 + net/ipv6/esp6.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d57aa64fa7c7..61fe6e4d23fc 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -981,6 +981,7 @@ static int esp_init_state(struct xfrm_state *x) switch (encap->encap_type) { default: + err = -EINVAL; goto error; case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a902ff8f59be..1a7f00cd4803 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -890,13 +890,12 @@ static int esp6_init_state(struct xfrm_state *x) x->props.header_len += IPV4_BEET_PHMAXLEN + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); break; + default: case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct ipv6hdr); break; - default: - goto error; } align = ALIGN(crypto_aead_blocksize(aead), 4); -- cgit v1.2.3 From b1bdcb59b64f806ef08d25a85c39ffb3ad841ce6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 6 Jan 2018 01:13:08 +0100 Subject: xfrm: don't call xfrm_policy_cache_flush while holding spinlock xfrm_policy_cache_flush can sleep, so it cannot be called while holding a spinlock. We could release the lock first, but I don't see why we need to invoke this function here in first place, the packet path won't reuse an xdst entry unless its still valid. While at it, add an annotation to xfrm_policy_cache_flush, it would have probably caught this bug sooner. Fixes: ec30d78c14a813 ("xfrm: add xdst pcpu cache") Reported-by: syzbot+e149f7d1328c26f9c12f@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2ef6db98e9ba..bc5eae12fb09 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -975,8 +975,6 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) } if (!cnt) err = -ESRCH; - else - xfrm_policy_cache_flush(); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; @@ -1744,6 +1742,8 @@ void xfrm_policy_cache_flush(void) bool found = 0; int cpu; + might_sleep(); + local_bh_disable(); rcu_read_lock(); for_each_possible_cpu(cpu) { -- cgit v1.2.3 From 6b018235b4daabae96d855219fae59c3fb8be417 Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Fri, 5 Jan 2018 12:44:06 -0500 Subject: nvme-fabrics: initialize default host->id in nvmf_host_default() The field was uninitialized before use. Signed-off-by: Ewan D. Milne Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 76b4fe6816a0..894c2ccb3891 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -74,6 +74,7 @@ static struct nvmf_host *nvmf_host_default(void) return NULL; kref_init(&host->ref); + uuid_gen(&host->id); snprintf(host->nqn, NVMF_NQN_SIZE, "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id); -- cgit v1.2.3 From 87590ce6e373d1a5401f6539f0c59ef92dd924a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 7 Jan 2018 22:48:00 +0100 Subject: sysfs/cpu: Add vulnerability folder As the meltdown/spectre problem affects several CPU architectures, it makes sense to have common way to express whether a system is affected by a particular vulnerability or not. If affected the way to express the mitigation should be common as well. Create /sys/devices/system/cpu/vulnerabilities folder and files for meltdown, spectre_v1 and spectre_v2. Allow architectures to override the show function. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Will Deacon Cc: Dave Hansen Cc: Linus Torvalds Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180107214913.096657732@linutronix.de --- Documentation/ABI/testing/sysfs-devices-system-cpu | 16 ++++++++ drivers/base/Kconfig | 3 ++ drivers/base/cpu.c | 48 ++++++++++++++++++++++ include/linux/cpu.h | 7 ++++ 4 files changed, 74 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index f3d5817c4ef0..bd3a88e16d8b 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -373,3 +373,19 @@ Contact: Linux kernel mailing list Description: information about CPUs heterogeneity. cpu_capacity: capacity of cpu#. + +What: /sys/devices/system/cpu/vulnerabilities + /sys/devices/system/cpu/vulnerabilities/meltdown + /sys/devices/system/cpu/vulnerabilities/spectre_v1 + /sys/devices/system/cpu/vulnerabilities/spectre_v2 +Date: Januar 2018 +Contact: Linux kernel mailing list +Description: Information about CPU vulnerabilities + + The files are named after the code names of CPU + vulnerabilities. The output of those files reflects the + state of the CPUs in the system. Possible output values: + + "Not affected" CPU is not affected by the vulnerability + "Vulnerable" CPU is affected and no mitigation in effect + "Mitigation: $M" CPU is affetcted and mitigation $M is in effect diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 2f6614c9a229..37a71fd9043f 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -235,6 +235,9 @@ config GENERIC_CPU_DEVICES config GENERIC_CPU_AUTOPROBE bool +config GENERIC_CPU_VULNERABILITIES + bool + config SOC_BUS bool select GLOB diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 321cd7b4d817..825964efda1d 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -501,10 +501,58 @@ static void __init cpu_dev_register_generic(void) #endif } +#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES + +ssize_t __weak cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +ssize_t __weak cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +ssize_t __weak cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); +static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); +static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); + +static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_meltdown.attr, + &dev_attr_spectre_v1.attr, + &dev_attr_spectre_v2.attr, + NULL +}; + +static const struct attribute_group cpu_root_vulnerabilities_group = { + .name = "vulnerabilities", + .attrs = cpu_root_vulnerabilities_attrs, +}; + +static void __init cpu_register_vulnerabilities(void) +{ + if (sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_vulnerabilities_group)) + pr_err("Unable to register CPU vulnerabilities\n"); +} + +#else +static inline void cpu_register_vulnerabilities(void) { } +#endif + void __init cpu_dev_init(void) { if (subsys_system_register(&cpu_subsys, cpu_root_attr_groups)) panic("Failed to register CPU subsystem"); cpu_dev_register_generic(); + cpu_register_vulnerabilities(); } diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 938ea8ae0ba4..c816e6f2730c 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -47,6 +47,13 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr); extern int cpu_add_dev_attr_group(struct attribute_group *attrs); extern void cpu_remove_dev_attr_group(struct attribute_group *attrs); +extern ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf); + extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, const struct attribute_group **groups, -- cgit v1.2.3 From 61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 7 Jan 2018 22:48:01 +0100 Subject: x86/cpu: Implement CPU vulnerabilites sysfs functions Implement the CPU vulnerabilty show functions for meltdown, spectre_v1 and spectre_v2. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Will Deacon Cc: Dave Hansen Cc: Linus Torvalds Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180107214913.177414879@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/bugs.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd5199de231e..e23d21ac745a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -89,6 +89,7 @@ config X86 select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ba0b2424c9b0..76ad6cb44b40 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -60,3 +61,31 @@ void __init check_bugs(void) set_memory_4k((unsigned long)__va(0), 1); #endif } + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return sprintf(buf, "Not affected\n"); + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} +#endif -- cgit v1.2.3 From 0dd6d272d39c7c1fe2f4253197b505f2b66538ee Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 23 Dec 2017 21:50:13 -0500 Subject: x86/xen/time: fix section mismatch for xen_init_time_ops() The header declares this function as __init but is defined in __ref section. Signed-off-by: Nick Desaulniers Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/xen-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f96dbedb33d4..1a7a9469e5a7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -71,7 +71,7 @@ u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); void xen_save_time_memory_area(void); void xen_restore_time_memory_area(void); -void __init xen_init_time_ops(void); +void __ref xen_init_time_ops(void); void __init xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit v1.2.3 From 66a640e7823da803fdb68d5d88f7a8fbd11c29e6 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 6 Jan 2018 13:39:48 -0800 Subject: x86: xen: remove the use of VLAIS Variable Length Arrays In Structs (VLAIS) is not supported by Clang, and frowned upon by others. https://lkml.org/lkml/2013/9/23/500 Here, the VLAIS was used because the size of the bitmap returned from xen_mc_entry() depended on possibly (based on kernel configuration) runtime sized data. Rather than declaring args as a VLAIS then calling sizeof on *args, we calculate the appropriate sizeof args manually. Further, we can get rid of the #ifdef's and rely on num_possible_cpus() (thanks to a helpful checkpatch warning from an earlier version of this patch). Suggested-by: Juergen Gross Signed-off-by: Nick Desaulniers Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu_pv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 7118f776cd49..aa701d2a5023 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1339,20 +1339,18 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, { struct { struct mmuext_op op; -#ifdef CONFIG_SMP - DECLARE_BITMAP(mask, num_processors); -#else DECLARE_BITMAP(mask, NR_CPUS); -#endif } *args; struct multicall_space mcs; + const size_t mc_entry_size = sizeof(args->op) + + sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); if (cpumask_empty(cpus)) return; /* nothing to do */ - mcs = xen_mc_entry(sizeof(*args)); + mcs = xen_mc_entry(mc_entry_size); args = mcs.args; args->op.arg2.vcpumask = to_cpumask(args->mask); -- cgit v1.2.3 From dba04eb76df982703fefc021a4d278347b6176a9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 8 Jan 2018 16:27:31 +0100 Subject: locking/Documentation: Remove stale crossrelease_fullstack parameter The cross-release lockdep functionality has been removed in: e966eaeeb623: ("locking/lockdep: Remove the cross-release locking checks") ... leaving the kernel parameter docs behind. The code handling the parameter does not exist so this is a plain documentation change. Signed-off-by: David Sterba Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: byungchul.park@lge.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20180108152731.27613-1-dsterba@suse.com Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index af7104aaffd9..a626465dd877 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -713,9 +713,6 @@ It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G. - crossrelease_fullstack - [KNL] Allow to record full stack trace in cross-release - cryptomgr.notests [KNL] Disable crypto self-tests -- cgit v1.2.3 From 262b6b30087246abf09d6275eb0c0dc421bcbe38 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 6 Jan 2018 18:41:14 +0100 Subject: x86/tboot: Unbreak tboot with PTI enabled This is another case similar to what EFI does: create a new set of page tables, map some code at a low address, and jump to it. PTI mistakes this low address for userspace and mistakenly marks it non-executable in an effort to make it unusable for userspace. Undo the poison to allow execution. Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") Signed-off-by: Dave Hansen Signed-off-by: Andrea Arcangeli Signed-off-by: Thomas Gleixner Cc: Alan Cox Cc: Tim Chen Cc: Jon Masters Cc: Dave Hansen Cc: Andi Kleen Cc: Jeff Law Cc: Paolo Bonzini Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David" Cc: Nick Clifton Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com --- arch/x86/kernel/tboot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a4eb27918ceb..75869a4b6c41 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, p4d = p4d_alloc(&tboot_mm, pgd, vaddr); if (!p4d) return -1; + pgd->pgd &= ~_PAGE_NX; pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; -- cgit v1.2.3 From 527187d28569e39c5d489d6306d3b79605cf85a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jan 2018 17:27:19 +0100 Subject: locking/lockdep: Remove cross-release leftovers There's two cross-release leftover facilities: - the crossrelease_hist_*() irq-tracing callbacks (NOPs currently) - the complete_release_commit() callback (NOP as well) Remove them. Cc: David Sterba Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/completion.h | 1 - include/linux/irqflags.h | 4 ---- include/linux/lockdep.h | 2 -- kernel/sched/completion.c | 5 ----- 4 files changed, 12 deletions(-) diff --git a/include/linux/completion.h b/include/linux/completion.h index 94a59ba7d422..519e94915d18 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -32,7 +32,6 @@ struct completion { #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} -static inline void complete_release_commit(struct completion *x) {} #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 46cb57d5eb13..1b3996ff3f16 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -27,22 +27,18 @@ # define trace_hardirq_enter() \ do { \ current->hardirq_context++; \ - crossrelease_hist_start(XHLOCK_HARD); \ } while (0) # define trace_hardirq_exit() \ do { \ current->hardirq_context--; \ - crossrelease_hist_end(XHLOCK_HARD); \ } while (0) # define lockdep_softirq_enter() \ do { \ current->softirq_context++; \ - crossrelease_hist_start(XHLOCK_SOFT); \ } while (0) # define lockdep_softirq_exit() \ do { \ current->softirq_context--; \ - crossrelease_hist_end(XHLOCK_SOFT); \ } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 2e75dc34bff5..3251d9c0d313 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -475,8 +475,6 @@ enum xhlock_context_t { #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } -static inline void crossrelease_hist_start(enum xhlock_context_t c) {} -static inline void crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); -- cgit v1.2.3 From 8d56eff266f3e41a6c39926269c4c3f58f881a8e Mon Sep 17 00:00:00 2001 From: Jike Song Date: Tue, 9 Jan 2018 00:03:41 +0800 Subject: x86/mm/pti: Remove dead logic in pti_user_pagetable_walk*() The following code contains dead logic: 162 if (pgd_none(*pgd)) { 163 unsigned long new_p4d_page = __get_free_page(gfp); 164 if (!new_p4d_page) 165 return NULL; 166 167 if (pgd_none(*pgd)) { 168 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); 169 new_p4d_page = 0; 170 } 171 if (new_p4d_page) 172 free_page(new_p4d_page); 173 } There can't be any difference between two pgd_none(*pgd) at L162 and L167, so it's always false at L171. Dave Hansen explained: Yes, the double-test was part of an optimization where we attempted to avoid using a global spinlock in the fork() path. We would check for unallocated mid-level page tables without the lock. The lock was only taken when we needed to *make* an entry to avoid collisions. Now that it is all single-threaded, there is no chance of a collision, no need for a lock, and no need for the re-check. As all these functions are only called during init, mark them __init as well. Fixes: 03f4424f348e ("x86/mm/pti: Add functions to clone kernel PMDs") Signed-off-by: Jike Song Signed-off-by: Thomas Gleixner Cc: Alan Cox Cc: Andi Kleen Cc: Tom Lendacky Cc: Peter Zijlstra Cc: Tim Chen Cc: Jiri Koshina Cc: Dave Hansen Cc: Borislav Petkov Cc: Kees Cook Cc: Andi Lutomirski Cc: Linus Torvalds Cc: Greg KH Cc: David Woodhouse Cc: Paul Turner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180108160341.3461-1-albcamus@gmail.com --- arch/x86/mm/pti.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 43d4a4a29037..ce38f165489b 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -149,7 +149,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) * * Returns a pointer to a P4D on success, or NULL on failure. */ -static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); @@ -164,12 +164,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) if (!new_p4d_page) return NULL; - if (pgd_none(*pgd)) { - set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); - new_p4d_page = 0; - } - if (new_p4d_page) - free_page(new_p4d_page); + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } BUILD_BUG_ON(pgd_large(*pgd) != 0); @@ -182,7 +177,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) * * Returns a pointer to a PMD on success, or NULL on failure. */ -static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d = pti_user_pagetable_walk_p4d(address); @@ -194,12 +189,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!new_pud_page) return NULL; - if (p4d_none(*p4d)) { - set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); - new_pud_page = 0; - } - if (new_pud_page) - free_page(new_pud_page); + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); } pud = pud_offset(p4d, address); @@ -213,12 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!new_pmd_page) return NULL; - if (pud_none(*pud)) { - set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); - new_pmd_page = 0; - } - if (new_pmd_page) - free_page(new_pmd_page); + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); } return pmd_offset(pud, address); @@ -251,12 +236,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) if (!new_pte_page) return NULL; - if (pmd_none(*pmd)) { - set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); - new_pte_page = 0; - } - if (new_pte_page) - free_page(new_pte_page); + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); } pte = pte_offset_kernel(pmd, address); -- cgit v1.2.3 From 9d0513d82f1a8fe17b41f113ac5922fa57dbaf5c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 28 Dec 2017 14:25:23 +0200 Subject: x86/platform/intel-mid: Revert "Make 'bt_sfi_data' const" So one of the constification patches unearthed a type casting fragility of the underlying code: 276c87054751 ("x86/platform/intel-mid: Make 'bt_sfi_data' const") converted the struct to be const while it is also used as a temporary container for important data that is used to fill 'parent' and 'name' fields in struct platform_device_info. The compiler doesn't notice this due to an explicit type cast that loses the const - which fragility will be fixed separately. This type cast turned a seemingly trivial const propagation patch into a hard to debug data corruptor and crasher bug. Signed-off-by: Andy Shevchenko Cc: Bhumika Goyal Cc: Darren Hart Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: julia.lawall@lip6.fr Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20171228122523.21802-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/device_libs/platform_bt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index dc036e511f48..5a0483e7bf66 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) return 0; } -static const struct bt_sfi_data tng_bt_sfi_data __initdata = { +static struct bt_sfi_data tng_bt_sfi_data __initdata = { .setup = tng_bt_sfi_setup, }; -- cgit v1.2.3 From 414a2dc138838642d28938506e31ad461648b898 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 2 Jan 2018 12:13:10 +0100 Subject: sched/isolation: Make CONFIG_CPU_ISOLATION=y depend on SMP or COMPILE_TEST On uniprocessor systems, critical and non-critical tasks cannot be isolated, as there is only a single CPU core. Hence enabling CPU isolation by default on such systems does not make much sense. Instead of changing the default for !SMP, fix this by making the feature depend on SMP, with an override for compile-testing. Note that its sole selector (NO_HZ_FULL) already depends on SMP. This decreases kernel size for a default uniprocessor kernel by ca. 1 KiB. Signed-off-by: Geert Uytterhoeven Acked-by: Nicolas Pitre Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2c43838c99d9d23f ("sched/isolation: Enable CONFIG_CPU_ISOLATION=y by default") Link: http://lkml.kernel.org/r/1514891590-20782-1-git-send-email-geert@linux-m68k.org Signed-off-by: Ingo Molnar --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index 690a381adee0..c1221332e128 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -461,6 +461,7 @@ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION bool "CPU isolation" + depends on SMP || COMPILE_TEST default y help Make sure that CPUs running critical tasks are not disturbed by -- cgit v1.2.3 From f328299e54a94998b31baf788d2b33d8122a4acb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 13:53:03 -0600 Subject: locking/refcounts: Remove stale comment from the ARCH_HAS_REFCOUNT Kconfig entry ARCH_HAS_REFCOUNT is no longer marked as broken ('if BROKEN'), so remove the stale comment regarding it being broken. Signed-off-by: Eric Biggers Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171229195303.17781-1-ebiggers3@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d4fc98c50378..ff4e9cd99854 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,7 +55,6 @@ config X86 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 - # Causing hangs/crashes, see the commit that added this change for details. select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY -- cgit v1.2.3 From e4d0e84e490790798691aaa0f2e598637f1867ec Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:21 -0600 Subject: x86/cpu/AMD: Make LFENCE a serializing instruction To aid in speculation control, make LFENCE a serializing instruction since it has less overhead than MFENCE. This is done by setting bit 1 of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not have this MSR. For these families, the LFENCE instruction is already serializing. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ab022618a50a..1e7d710fef43 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -352,6 +352,8 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c +#define MSR_F10H_DECFG 0xc0011029 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bcb75dc97d44..5b438d81beb2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -829,6 +829,16 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has(c, X86_FEATURE_XMM2)) { + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } -- cgit v1.2.3 From 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:32 -0600 Subject: x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference to MFENCE_RDTSC. However, since the kernel could be running under a hypervisor that does not support writing that MSR, read the MSR back and verify that the bit has been set successfully. If the MSR can be read and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the MFENCE_RDTSC feature. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/amd.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1e7d710fef43..fa11fb1fa570 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -354,6 +354,7 @@ #define MSR_FAM10H_NODE_ID 0xc001100c #define MSR_F10H_DECFG 0xc0011029 #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5b438d81beb2..ea831c858195 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -829,6 +829,9 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has(c, X86_FEATURE_XMM2)) { + unsigned long long val; + int ret; + /* * A serializing LFENCE has less overhead than MFENCE, so * use it for execution serialization. On families which @@ -839,8 +842,19 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + /* + * Verify that the MSR write was successful (could be running + * under a hypervisor) and only then assume that LFENCE is + * serializing. + */ + ret = rdmsrl_safe(MSR_F10H_DECFG, &val); + if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } else { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } } /* -- cgit v1.2.3 From 1b5c7ef3d0d0610bda9b63263f7c5b7178d11015 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 6 Jan 2018 10:59:41 -0500 Subject: drm/nouveau/disp/gf119: add missing drive vfunc ptr Fixes broken dp on GF119: Call Trace: ? nvkm_dp_train_drive+0x183/0x2c0 [nouveau] nvkm_dp_acquire+0x4f3/0xcd0 [nouveau] nv50_disp_super_2_2+0x5d/0x470 [nouveau] ? nvkm_devinit_pll_set+0xf/0x20 [nouveau] gf119_disp_super+0x19c/0x2f0 [nouveau] process_one_work+0x193/0x3c0 worker_thread+0x35/0x3b0 kthread+0x125/0x140 ? process_one_work+0x3c0/0x3c0 ? kthread_park+0x60/0x60 ret_from_fork+0x25/0x30 Code: Bad RIP value. RIP: (null) RSP: ffffb1e243e4bc38 CR2: 0000000000000000 Fixes: af85389c614a drm/nouveau/disp: shuffle functions around Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103421 Signed-off-by: Rob Clark Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c index a2978a37b4f3..700fc754f28a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c @@ -174,6 +174,7 @@ gf119_sor = { .links = gf119_sor_dp_links, .power = g94_sor_dp_power, .pattern = gf119_sor_dp_pattern, + .drive = gf119_sor_dp_drive, .vcpi = gf119_sor_dp_vcpi, .audio = gf119_sor_dp_audio, .audio_sym = gf119_sor_dp_audio_sym, -- cgit v1.2.3 From 374d1b5a81f7f9cc5e7f095ac3d5aff3f6600376 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 5 Jan 2018 08:35:47 +0100 Subject: esp: Fix GRO when the headers not fully in the linear part of the skb. The GRO layer does not necessarily pull the complete headers into the linear part of the skb, a part may remain on the first page fragment. This can lead to a crash if we try to pull the headers, so make sure we have them on the linear part before pulling. Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Reported-by: syzbot+82bbd65569c49c6c0c4d@syzkaller.appspotmail.com Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 ++- net/ipv6/esp6_offload.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index f8b918c766b0..b1338e576d00 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -38,7 +38,8 @@ static struct sk_buff **esp4_gro_receive(struct sk_buff **head, __be32 spi; int err; - skb_pull(skb, offset); + if (!pskb_pull(skb, offset)) + return NULL; if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 333a478aa161..dd9627490c7c 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -60,7 +60,8 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head, int nhoff; int err; - skb_pull(skb, offset); + if (!pskb_pull(skb, offset)) + return NULL; if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; -- cgit v1.2.3 From aa1f10e85b0ab53dee85d8e293c8159d18d293a8 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 29 Dec 2017 00:22:54 +0100 Subject: mux: core: fix double get_device() class_find_device already does a get_device on the returned device. So the device returned by of_find_mux_chip_by_node is already referenced and we should not reference it again (and unref it on error). Signed-off-by: Hans de Goede Signed-off-by: Peter Rosin Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/mux/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mux/core.c b/drivers/mux/core.c index 2260063b0ea8..6e5cf9d9cd99 100644 --- a/drivers/mux/core.c +++ b/drivers/mux/core.c @@ -413,6 +413,7 @@ static int of_dev_node_match(struct device *dev, const void *data) return dev->of_node == data; } +/* Note this function returns a reference to the mux_chip dev. */ static struct mux_chip *of_find_mux_chip_by_node(struct device_node *np) { struct device *dev; @@ -466,6 +467,7 @@ struct mux_control *mux_control_get(struct device *dev, const char *mux_name) (!args.args_count && (mux_chip->controllers > 1))) { dev_err(dev, "%pOF: wrong #mux-control-cells for %pOF\n", np, args.np); + put_device(&mux_chip->dev); return ERR_PTR(-EINVAL); } @@ -476,10 +478,10 @@ struct mux_control *mux_control_get(struct device *dev, const char *mux_name) if (controller >= mux_chip->controllers) { dev_err(dev, "%pOF: bad mux controller %u specified in %pOF\n", np, controller, args.np); + put_device(&mux_chip->dev); return ERR_PTR(-EINVAL); } - get_device(&mux_chip->dev); return &mux_chip->mux[controller]; } EXPORT_SYMBOL_GPL(mux_control_get); -- cgit v1.2.3 From 443064cb0b1fb4569fe0a71209da7625129fb760 Mon Sep 17 00:00:00 2001 From: Viktor Slavkovic Date: Mon, 8 Jan 2018 10:43:03 -0800 Subject: staging: android: ashmem: fix a race condition in ASHMEM_SET_SIZE ioctl A lock-unlock is missing in ASHMEM_SET_SIZE ioctl which can result in a race condition when mmap is called. After the !asma->file check, before setting asma->size, asma->file can be set in mmap. That would result in having different asma->size than the mapped memory size. Combined with ASHMEM_UNPIN ioctl and shrinker invocation, this can result in memory corruption. Signed-off-by: Viktor Slavkovic Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/ashmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 0f695df14c9d..372ce9913e6d 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -765,10 +765,12 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case ASHMEM_SET_SIZE: ret = -EINVAL; + mutex_lock(&ashmem_mutex); if (!asma->file) { ret = 0; asma->size = (size_t)arg; } + mutex_unlock(&ashmem_mutex); break; case ASHMEM_GET_SIZE: ret = asma->size; -- cgit v1.2.3 From 98648ae6ef6bdcdcb88c46cad963906ab452e96d Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Tue, 9 Jan 2018 15:33:42 +0100 Subject: drm/vmwgfx: Don't cache framebuffer maps Buffer objects need to be either pinned or reserved while a map is active, that's not the case here, so avoid caching the framebuffer map. This will cause increasing mapping activity mainly when we don't do page flipping. This fixes occasional garbage filled screens when the framebuffer has been evicted after the map. Since in-kernel mapping of whole buffer objects is error-prone on 32-bit architectures and also quite inefficient, we will revisit this later. Signed-off-by: Thomas Hellstrom Reviewed-by: Sinclair Yeh Cc: --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 6 ------ drivers/gpu/drm/vmwgfx/vmwgfx_kms.h | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 41 +++++++++++------------------------- 3 files changed, 13 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 0545740b3724..641294aef165 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -697,7 +697,6 @@ vmw_du_plane_duplicate_state(struct drm_plane *plane) vps->pinned = 0; /* Mapping is managed by prepare_fb/cleanup_fb */ - memset(&vps->guest_map, 0, sizeof(vps->guest_map)); memset(&vps->host_map, 0, sizeof(vps->host_map)); vps->cpp = 0; @@ -760,11 +759,6 @@ vmw_du_plane_destroy_state(struct drm_plane *plane, /* Should have been freed by cleanup_fb */ - if (vps->guest_map.virtual) { - DRM_ERROR("Guest mapping not freed\n"); - ttm_bo_kunmap(&vps->guest_map); - } - if (vps->host_map.virtual) { DRM_ERROR("Host mapping not freed\n"); ttm_bo_kunmap(&vps->host_map); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h index ff9c8389ff21..cd9da2dd79af 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h @@ -175,7 +175,7 @@ struct vmw_plane_state { int pinned; /* For CPU Blit */ - struct ttm_bo_kmap_obj host_map, guest_map; + struct ttm_bo_kmap_obj host_map; unsigned int cpp; }; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index 90b5437fd787..b68d74888ab1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -114,7 +114,7 @@ struct vmw_screen_target_display_unit { bool defined; /* For CPU Blit */ - struct ttm_bo_kmap_obj host_map, guest_map; + struct ttm_bo_kmap_obj host_map; unsigned int cpp; }; @@ -695,7 +695,8 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty) s32 src_pitch, dst_pitch; u8 *src, *dst; bool not_used; - + struct ttm_bo_kmap_obj guest_map; + int ret; if (!dirty->num_hits) return; @@ -706,6 +707,13 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty) if (width == 0 || height == 0) return; + ret = ttm_bo_kmap(&ddirty->buf->base, 0, ddirty->buf->base.num_pages, + &guest_map); + if (ret) { + DRM_ERROR("Failed mapping framebuffer for blit: %d\n", + ret); + goto out_cleanup; + } /* Assume we are blitting from Host (display_srf) to Guest (dmabuf) */ src_pitch = stdu->display_srf->base_size.width * stdu->cpp; @@ -713,7 +721,7 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty) src += ddirty->top * src_pitch + ddirty->left * stdu->cpp; dst_pitch = ddirty->pitch; - dst = ttm_kmap_obj_virtual(&stdu->guest_map, ¬_used); + dst = ttm_kmap_obj_virtual(&guest_map, ¬_used); dst += ddirty->fb_top * dst_pitch + ddirty->fb_left * stdu->cpp; @@ -772,6 +780,7 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty) vmw_fifo_commit(dev_priv, sizeof(*cmd)); } + ttm_bo_kunmap(&guest_map); out_cleanup: ddirty->left = ddirty->top = ddirty->fb_left = ddirty->fb_top = S32_MAX; ddirty->right = ddirty->bottom = S32_MIN; @@ -1109,9 +1118,6 @@ vmw_stdu_primary_plane_cleanup_fb(struct drm_plane *plane, { struct vmw_plane_state *vps = vmw_plane_state_to_vps(old_state); - if (vps->guest_map.virtual) - ttm_bo_kunmap(&vps->guest_map); - if (vps->host_map.virtual) ttm_bo_kunmap(&vps->host_map); @@ -1277,33 +1283,11 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane, */ if (vps->content_fb_type == SEPARATE_DMA && !(dev_priv->capabilities & SVGA_CAP_3D)) { - - struct vmw_framebuffer_dmabuf *new_vfbd; - - new_vfbd = vmw_framebuffer_to_vfbd(new_fb); - - ret = ttm_bo_reserve(&new_vfbd->buffer->base, false, false, - NULL); - if (ret) - goto out_srf_unpin; - - ret = ttm_bo_kmap(&new_vfbd->buffer->base, 0, - new_vfbd->buffer->base.num_pages, - &vps->guest_map); - - ttm_bo_unreserve(&new_vfbd->buffer->base); - - if (ret) { - DRM_ERROR("Failed to map content buffer to CPU\n"); - goto out_srf_unpin; - } - ret = ttm_bo_kmap(&vps->surf->res.backup->base, 0, vps->surf->res.backup->base.num_pages, &vps->host_map); if (ret) { DRM_ERROR("Failed to map display buffer to CPU\n"); - ttm_bo_kunmap(&vps->guest_map); goto out_srf_unpin; } @@ -1350,7 +1334,6 @@ vmw_stdu_primary_plane_atomic_update(struct drm_plane *plane, stdu->display_srf = vps->surf; stdu->content_fb_type = vps->content_fb_type; stdu->cpp = vps->cpp; - memcpy(&stdu->guest_map, &vps->guest_map, sizeof(vps->guest_map)); memcpy(&stdu->host_map, &vps->host_map, sizeof(vps->host_map)); if (!stdu->defined) -- cgit v1.2.3 From 191eccb1580939fb0d47deb405b82a85b0379070 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 9 Jan 2018 03:52:05 +1100 Subject: powerpc/pseries: Add H_GET_CPU_CHARACTERISTICS flags & wrapper A new hypervisor call has been defined to communicate various characteristics of the CPU to guests. Add definitions for the hcall number, flags and a wrapper function. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hvcall.h | 17 +++++++++++++++++ arch/powerpc/include/asm/plpar_wrappers.h | 14 ++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index a409177be8bd..f0461618bf7b 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -241,6 +241,7 @@ #define H_GET_HCA_INFO 0x1B8 #define H_GET_PERF_COUNT 0x1BC #define H_MANAGE_TRACE 0x1C0 +#define H_GET_CPU_CHARACTERISTICS 0x1C8 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4 #define H_QUERY_INT_STATE 0x1E4 #define H_POLL_PENDING 0x1D8 @@ -330,6 +331,17 @@ #define H_SIGNAL_SYS_RESET_ALL_OTHERS -2 /* >= 0 values are CPU number */ +/* H_GET_CPU_CHARACTERISTICS return values */ +#define H_CPU_CHAR_SPEC_BAR_ORI31 (1ull << 63) // IBM bit 0 +#define H_CPU_CHAR_BCCTRL_SERIALISED (1ull << 62) // IBM bit 1 +#define H_CPU_CHAR_L1D_FLUSH_ORI30 (1ull << 61) // IBM bit 2 +#define H_CPU_CHAR_L1D_FLUSH_TRIG2 (1ull << 60) // IBM bit 3 +#define H_CPU_CHAR_L1D_THREAD_PRIV (1ull << 59) // IBM bit 4 + +#define H_CPU_BEHAV_FAVOUR_SECURITY (1ull << 63) // IBM bit 0 +#define H_CPU_BEHAV_L1D_FLUSH_PR (1ull << 62) // IBM bit 1 +#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2 + /* Flag values used in H_REGISTER_PROC_TBL hcall */ #define PROC_TABLE_OP_MASK 0x18 #define PROC_TABLE_DEREG 0x10 @@ -436,6 +448,11 @@ static inline unsigned int get_longbusy_msecs(int longbusy_rc) } } +struct h_cpu_char_result { + u64 character; + u64 behaviour; +}; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 7f01b22fa6cb..55eddf50d149 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -326,4 +326,18 @@ static inline long plapr_signal_sys_reset(long cpu) return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu); } +static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf); + if (rc == H_SUCCESS) { + p->character = retbuf[0]; + p->behaviour = retbuf[1]; + } + + return rc; +} + #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ -- cgit v1.2.3 From 46eb14a6e1585d99c1b9f58d0e7389082a5f466b Mon Sep 17 00:00:00 2001 From: Pete Zaitcev Date: Mon, 8 Jan 2018 15:46:41 -0600 Subject: USB: fix usbmon BUG trigger Automated tests triggered this by opening usbmon and accessing the mmap while simultaneously resizing the buffers. This bug was with us since 2006, because typically applications only size the buffers once and thus avoid racing. Reported by Kirill A. Shutemov. Reported-by: Signed-off-by: Pete Zaitcev Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/mon/mon_bin.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index f6ae753ab99b..f932f40302df 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1004,7 +1004,9 @@ static long mon_bin_ioctl(struct file *file, unsigned int cmd, unsigned long arg break; case MON_IOCQ_RING_SIZE: + mutex_lock(&rp->fetch_lock); ret = rp->b_size; + mutex_unlock(&rp->fetch_lock); break; case MON_IOCT_RING_SIZE: @@ -1231,12 +1233,16 @@ static int mon_bin_vma_fault(struct vm_fault *vmf) unsigned long offset, chunk_idx; struct page *pageptr; + mutex_lock(&rp->fetch_lock); offset = vmf->pgoff << PAGE_SHIFT; - if (offset >= rp->b_size) + if (offset >= rp->b_size) { + mutex_unlock(&rp->fetch_lock); return VM_FAULT_SIGBUS; + } chunk_idx = offset / CHUNK_SIZE; pageptr = rp->b_vec[chunk_idx].pg; get_page(pageptr); + mutex_unlock(&rp->fetch_lock); vmf->page = pageptr; return 0; } -- cgit v1.2.3 From 7ae2c3c280db183ca9ada2675c34ec2f7378abfa Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 3 Jan 2018 12:51:51 -0500 Subject: USB: UDC core: fix double-free in usb_add_gadget_udc_release The error-handling pathways in usb_add_gadget_udc_release() are messed up. Aside from the uninformative statement labels, they can deallocate the udc structure after calling put_device(), which is a double-free. This was observed by KASAN in automatic testing. This patch cleans up the routine. It preserves the requirement that when any failure occurs, we call put_device(&gadget->dev). Signed-off-by: Alan Stern Reported-by: Fengguang Wu CC: Reviewed-by: Peter Chen Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 93eff7dec2f5..1b3efb14aec7 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1147,11 +1147,7 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, udc = kzalloc(sizeof(*udc), GFP_KERNEL); if (!udc) - goto err1; - - ret = device_add(&gadget->dev); - if (ret) - goto err2; + goto err_put_gadget; device_initialize(&udc->dev); udc->dev.release = usb_udc_release; @@ -1160,7 +1156,11 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, udc->dev.parent = parent; ret = dev_set_name(&udc->dev, "%s", kobject_name(&parent->kobj)); if (ret) - goto err3; + goto err_put_udc; + + ret = device_add(&gadget->dev); + if (ret) + goto err_put_udc; udc->gadget = gadget; gadget->udc = udc; @@ -1170,7 +1170,7 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, ret = device_add(&udc->dev); if (ret) - goto err4; + goto err_unlist_udc; usb_gadget_set_state(gadget, USB_STATE_NOTATTACHED); udc->vbus = true; @@ -1178,27 +1178,25 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, /* pick up one of pending gadget drivers */ ret = check_pending_gadget_drivers(udc); if (ret) - goto err5; + goto err_del_udc; mutex_unlock(&udc_lock); return 0; -err5: + err_del_udc: device_del(&udc->dev); -err4: + err_unlist_udc: list_del(&udc->list); mutex_unlock(&udc_lock); -err3: - put_device(&udc->dev); device_del(&gadget->dev); -err2: - kfree(udc); + err_put_udc: + put_device(&udc->dev); -err1: + err_put_gadget: put_device(&gadget->dev); return ret; } -- cgit v1.2.3 From 9ecccfaa7cb5249bd31bdceb93fcf5bedb8a24d8 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 9 Jan 2018 15:02:51 +0000 Subject: sysfs/cpu: Fix typos in vulnerability documentation Fixes: 87590ce6e ("sysfs/cpu: Add vulnerability folder") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner --- Documentation/ABI/testing/sysfs-devices-system-cpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index bd3a88e16d8b..258902db14bf 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -378,7 +378,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/meltdown /sys/devices/system/cpu/vulnerabilities/spectre_v1 /sys/devices/system/cpu/vulnerabilities/spectre_v2 -Date: Januar 2018 +Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities @@ -388,4 +388,4 @@ Description: Information about CPU vulnerabilities "Not affected" CPU is not affected by the vulnerability "Vulnerable" CPU is affected and no mitigation in effect - "Mitigation: $M" CPU is affetcted and mitigation $M is in effect + "Mitigation: $M" CPU is affected and mitigation $M is in effect -- cgit v1.2.3 From 50e51c13b3822d14ff6df4279423e4b7b2269bc3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Add macros for annotating the destination of rfid/hrfid The rfid/hrfid ((Hypervisor) Return From Interrupt) instruction is used for switching from the kernel to userspace, and from the hypervisor to the guest kernel. However it can and is also used for other transitions, eg. from real mode kernel code to virtual mode kernel code, and it's not always clear from the code what the destination context is. To make it clearer when reading the code, add macros which encode the expected destination context. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64e.h | 6 ++++++ arch/powerpc/include/asm/exception-64s.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index a703452d67b6..555e22d5e07f 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -209,5 +209,11 @@ exc_##label##_book3e: ori r3,r3,vector_offset@l; \ mtspr SPRN_IVOR##vector_number,r3; +#define RFI_TO_KERNEL \ + rfi + +#define RFI_TO_USER \ + rfi + #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index b27205297e1d..1af427a3c74f 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -74,6 +74,35 @@ */ #define EX_R3 EX_DAR +/* Macros for annotating the expected destination of (h)rfid */ + +#define RFI_TO_KERNEL \ + rfid + +#define RFI_TO_USER \ + rfid + +#define RFI_TO_USER_OR_KERNEL \ + rfid + +#define RFI_TO_GUEST \ + rfid + +#define HRFI_TO_KERNEL \ + hrfid + +#define HRFI_TO_USER \ + hrfid + +#define HRFI_TO_USER_OR_KERNEL \ + hrfid + +#define HRFI_TO_GUEST \ + hrfid + +#define HRFI_TO_UNKNOWN \ + hrfid + #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ -- cgit v1.2.3 From 222f20f140623ef6033491d0103ee0875fe87d35 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Simple RFI macro conversions This commit does simple conversions of rfi/rfid to the new macros that include the expected destination context. By simple we mean cases where there is a single well known destination context, and it's simply a matter of substituting the instruction for the appropriate macro. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 4 ++-- arch/powerpc/kernel/entry_64.S | 14 +++++++++----- arch/powerpc/kernel/exceptions-64s.S | 24 ++++++++++++------------ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 ++++----- arch/powerpc/kvm/book3s_rmhandlers.S | 7 +++++-- arch/powerpc/kvm/book3s_segment.S | 4 ++-- 6 files changed, 34 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 1af427a3c74f..dfc56daed98b 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -247,7 +247,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1(label, h) \ __EXCEPTION_PROLOG_PSERIES_1(label, h) @@ -261,7 +261,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h) \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 3320bcac7192..e68faa4d1b13 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -37,6 +37,11 @@ #include #include #include +#ifdef CONFIG_PPC_BOOK3S +#include +#else +#include +#endif /* * System calls. @@ -397,8 +402,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mtmsrd r10, 1 mtspr SPRN_SRR0, r11 mtspr SPRN_SRR1, r12 - - rfid + RFI_TO_USER b . /* prevent speculative execution */ #endif _ASM_NOKPROBE_SYMBOL(system_call_common); @@ -1073,7 +1077,7 @@ __enter_rtas: mtspr SPRN_SRR0,r5 mtspr SPRN_SRR1,r6 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ rtas_return_loc: @@ -1098,7 +1102,7 @@ rtas_return_loc: mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ _ASM_NOKPROBE_SYMBOL(__enter_rtas) _ASM_NOKPROBE_SYMBOL(rtas_return_loc) @@ -1171,7 +1175,7 @@ _GLOBAL(enter_prom) LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE) andc r11,r11,r12 mtsrr1 r11 - rfid + RFI_TO_KERNEL #endif /* CONFIG_PPC_BOOK3E */ 1: /* Return from OF */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e441b469dc8f..5502b0147c4e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -256,7 +256,7 @@ BEGIN_FTR_SECTION LOAD_HANDLER(r12, machine_check_handle_early) 1: mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r11 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ 2: /* Stack overflow. Stay on emergency stack and panic. @@ -445,7 +445,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) li r3,MSR_ME andc r10,r10,r3 /* Turn off MSR_ME */ mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 2: /* @@ -463,7 +463,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) */ bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP - rfid + RFI_TO_USER_OR_KERNEL 9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP @@ -651,7 +651,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 8: std r3,PACA_EXSLB+EX_DAR(r13) @@ -662,7 +662,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . EXC_COMMON_BEGIN(unrecov_slb) @@ -901,7 +901,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) mtspr SPRN_SRR0,r10 ; \ ld r10,PACAKMSR(r13) ; \ mtspr SPRN_SRR1,r10 ; \ - rfid ; \ + RFI_TO_KERNEL ; \ b . ; /* prevent speculative execution */ #ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH @@ -917,7 +917,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ mr r13,r9 ; \ - rfid ; /* return to userspace */ \ + RFI_TO_USER ; /* return to userspace */ \ b . ; /* prevent speculative execution */ #else #define SYSCALL_FASTENDIAN_TEST @@ -1063,7 +1063,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early) mtcr r11 REST_GPR(11, r1) ld r1,GPR1(r1) - hrfid + HRFI_TO_USER_OR_KERNEL 1: mtcr r11 REST_GPR(11, r1) @@ -1314,7 +1314,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r11,PACA_EXGEN+EX_R11(r13) ld r12,PACA_EXGEN+EX_R12(r13) ld r13,PACA_EXGEN+EX_R13(r13) - HRFID + HRFI_TO_UNKNOWN b . #endif @@ -1418,7 +1418,7 @@ masked_##_H##interrupt: \ ld r10,PACA_EXGEN+EX_R10(r13); \ ld r11,PACA_EXGEN+EX_R11(r13); \ /* returns to kernel where r13 must be set up, so don't restore it */ \ - ##_H##rfid; \ + ##_H##RFI_TO_KERNEL; \ b .; \ MASKED_DEC_HANDLER(_H) @@ -1441,7 +1441,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_interrupt) addi r13, r13, 4 mtspr SPRN_SRR0, r13 GET_SCRATCH0(r13) - rfid + RFI_TO_KERNEL b . TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) @@ -1453,7 +1453,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) addi r13, r13, 4 mtspr SPRN_HSRR0, r13 GET_SCRATCH0(r13) - hrfid + HRFI_TO_KERNEL b . #endif diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b8..9c61f736c75b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -79,7 +79,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) mtmsrd r0,1 /* clear RI in MSR */ mtsrr0 r5 mtsrr1 r6 - RFI + RFI_TO_KERNEL kvmppc_call_hv_entry: BEGIN_FTR_SECTION @@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - RFI + RFI_TO_KERNEL /* Virtual-mode return */ .Lvirt_return: @@ -1167,8 +1167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) - - hrfid + HRFI_TO_GUEST b . secondary_too_late: @@ -3320,7 +3319,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) ld r4, PACAKMSR(r13) mtspr SPRN_SRR0, r3 mtspr SPRN_SRR1, r4 - rfid + RFI_TO_KERNEL 9: addi r3, r1, STACK_FRAME_OVERHEAD bl kvmppc_bad_interrupt b 9b diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 42a4b237df5f..34a5adeff084 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -46,6 +46,9 @@ #define FUNC(name) name +#define RFI_TO_KERNEL RFI +#define RFI_TO_GUEST RFI + .macro INTERRUPT_TRAMPOLINE intno .global kvmppc_trampoline_\intno @@ -141,7 +144,7 @@ kvmppc_handler_skip_ins: GET_SCRATCH0(r13) /* And get back into the code */ - RFI + RFI_TO_KERNEL #endif /* @@ -164,6 +167,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline) ori r5, r5, MSR_EE mtsrr0 r7 mtsrr1 r6 - RFI + RFI_TO_KERNEL #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 2a2b96d53999..93a180ceefad 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -156,7 +156,7 @@ no_dcbz32_on: PPC_LL r9, SVCPU_R9(r3) PPC_LL r3, (SVCPU_R3)(r3) - RFI + RFI_TO_GUEST kvmppc_handler_trampoline_enter_end: @@ -407,5 +407,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) cmpwi r12, BOOK3S_INTERRUPT_DOORBELL beqa BOOK3S_INTERRUPT_DOORBELL - RFI + RFI_TO_KERNEL kvmppc_handler_trampoline_exit_end: -- cgit v1.2.3 From b8e90cb7bc04a509e821e82ab6ed7a8ef11ba333 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Convert the syscall exit path to use RFI_TO_USER/KERNEL In the syscall exit path we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e68faa4d1b13..724733b74744 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -267,13 +267,23 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ + ld r2,GPR2(r1) + ld r1,GPR1(r1) + mtlr r4 + mtcr r5 + mtspr SPRN_SRR0,r7 + mtspr SPRN_SRR1,r8 + RFI_TO_USER + b . /* prevent speculative execution */ + + /* exit to kernel */ 1: ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - RFI + RFI_TO_KERNEL b . /* prevent speculative execution */ .Lsyscall_error: -- cgit v1.2.3 From a08f828cf47e6c605af21d2cdec68f84e799c318 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Convert fast_exception_return to use RFI_TO_USER/KERNEL Similar to the syscall return path, in fast_exception_return we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 724733b74744..2748584b767d 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -892,7 +892,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) -1: + mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -905,8 +905,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r3,GPR3(r1) ld r4,GPR4(r1) ld r1,GPR1(r1) + RFI_TO_USER + b . /* prevent speculative execution */ + +1: mtspr SPRN_SRR1,r3 + + ld r2,_CCR(r1) + mtcrf 0xFF,r2 + ld r2,_NIP(r1) + mtspr SPRN_SRR0,r2 - rfid + ld r0,GPR0(r1) + ld r2,GPR2(r1) + ld r3,GPR3(r1) + ld r4,GPR4(r1) + ld r1,GPR1(r1) + RFI_TO_KERNEL b . /* prevent speculative execution */ #endif /* CONFIG_PPC_BOOK3E */ -- cgit v1.2.3 From c7305645eb0c1621351cfc104038831ae87c0053 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Convert slb_miss_common to use RFI_TO_USER/KERNEL In the SLB miss handler we may be returning to user or kernel. We need to add a check early on and save the result in the cr4 register, and then we bifurcate the return path based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5502b0147c4e..ed356194f09c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -598,6 +598,9 @@ EXC_COMMON_BEGIN(slb_miss_common) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + andi. r9,r11,MSR_PR // Check for exception from userspace + cmpdi cr4,r9,MSR_PR // And save the result in CR4 for later + /* * Test MSR_RI before calling slb_allocate_realmode, because the * MSR in r11 gets clobbered. However we still want to allocate @@ -624,9 +627,32 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) /* All done -- return from exception. */ + bne cr4,1f /* returning to kernel */ + +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ + mtcrf 0x02,r9 /* I/D indication is in cr6 */ + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + + RESTORE_CTR(r9, PACA_EXSLB) + RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + RFI_TO_USER + b . /* prevent speculative execution */ +1: .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ @@ -640,9 +666,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) ld r11,PACA_EXSLB+EX_R11(r13) ld r12,PACA_EXSLB+EX_R12(r13) ld r13,PACA_EXSLB+EX_R13(r13) - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ + 2: std r3,PACA_EXSLB+EX_DAR(r13) mr r3,r12 mfspr r11,SPRN_SRR0 -- cgit v1.2.3 From 928afc85270753657b5543e052cc270c279a3fe9 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sat, 6 Jan 2018 00:56:44 +0800 Subject: uas: ignore UAS for Norelsys NS1068(X) chips The UAS mode of Norelsys NS1068(X) is reported to fail to work on several platforms with the following error message: xhci-hcd xhci-hcd.0.auto: ERROR Transfer event for unknown stream ring slot 1 ep 8 xhci-hcd xhci-hcd.0.auto: @00000000bf04a400 00000000 00000000 1b000000 01098001 And when trying to mount a partition on the disk the disk will disconnect from the USB controller, then after re-connecting the device will be offlined and not working at all. Falling back to USB mass storage can solve this problem, so ignore UAS function of this chip. Cc: stable@vger.kernel.org Signed-off-by: Icenowy Zheng Acked-by: Hans de Goede Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_uas.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index e6127fb21c12..a7d08ae0adad 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -143,6 +143,13 @@ UNUSUAL_DEV(0x2109, 0x0711, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_NO_ATA_1X), +/* Reported-by: Icenowy Zheng */ +UNUSUAL_DEV(0x2537, 0x1068, 0x0000, 0x9999, + "Norelsys", + "NS1068X", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_IGNORE_UAS), + /* Reported-by: Takeo Nakayama */ UNUSUAL_DEV(0x357d, 0x7788, 0x0000, 0x9999, "JMicron", -- cgit v1.2.3 From 541676078b52f365f53d46ee5517d305cd1b6350 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 15 Dec 2017 14:23:10 -0500 Subject: membarrier: Disable preemption when calling smp_call_function_many() smp_call_function_many() requires disabling preemption around the call. Signed-off-by: Mathieu Desnoyers Cc: # v4.14+ Cc: Andrea Parri Cc: Andrew Hunter Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171215192310.25293-1-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd7908743dab..9bcbacba82a8 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -89,7 +89,9 @@ static int membarrier_private_expedited(void) rcu_read_unlock(); } if (!fallback) { + preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); free_cpumask_var(tmpmask); } cpus_read_unlock(); -- cgit v1.2.3 From 1e532d2b49645e7cb76d5af6cb5bc4ec93d861ae Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 10 Jan 2018 09:33:26 +0100 Subject: af_key: Fix memory leak in key_notify_policy. We leak the allocated out_skb in case pfkey_xfrm_policy2msg() fails. Fix this by freeing it on error. Reported-by: Dmitry Vyukov Signed-off-by: Steffen Klassert --- net/key/af_key.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index d40861a048fe..7e2e7188e7f4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2202,8 +2202,10 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); - if (err < 0) + if (err < 0) { + kfree_skb(out_skb); return err; + } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; -- cgit v1.2.3 From 6c7d47c33ed323f14f2a3b8de925e831dbaa4e69 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Nov 2017 14:42:21 +1100 Subject: KVM: PPC: Book3S PR: Fix WIMG handling under pHyp Commit 96df226 ("KVM: PPC: Book3S PR: Preserve storage control bits") added code to preserve WIMG bits but it missed 2 special cases: - a magic page in kvmppc_mmu_book3s_64_xlate() and - guest real mode in kvmppc_handle_pagefault(). For these ptes, WIMG was 0 and pHyp failed on these causing a guest to stop in the very beginning at NIP=0x100 (due to bd9166ffe "KVM: PPC: Book3S PR: Exit KVM on failed mapping"). According to LoPAPR v1.1 14.5.4.1.2 H_ENTER: The hypervisor checks that the WIMG bits within the PTE are appropriate for the physical page number else H_Parameter return. (For System Memory pages WIMG=0010, or, 1110 if the SAO option is enabled, and for IO pages WIMG=01**.) This hence initializes WIMG to non-zero value HPTE_R_M (0x10), as expected by pHyp. [paulus@ozlabs.org - fix compile for 32-bit] Cc: stable@vger.kernel.org # v4.11+ Fixes: 96df226 "KVM: PPC: Book3S PR: Preserve storage control bits" Signed-off-by: Alexey Kardashevskiy Tested-by: Ruediger Oertel Reviewed-by: Greg Kurz Tested-by: Greg Kurz Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu.c | 1 + arch/powerpc/kvm/book3s_pr.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 29ebe2fd5867..a93d719edc90 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -235,6 +235,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_read = true; gpte->may_write = true; gpte->page_size = MMU_PAGE_4K; + gpte->wimg = HPTE_R_M; return 0; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d0dc8624198f..7deaeeb14b93 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -60,6 +60,7 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE +#define HPTE_R_M _PAGE_COHERENT #endif static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu) @@ -557,6 +558,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.eaddr = eaddr; pte.vpage = eaddr >> 12; pte.page_size = MMU_PAGE_64K; + pte.wimg = HPTE_R_M; } switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { -- cgit v1.2.3 From ecba8297aafd50db6ae867e90844eead1611ef1c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 10 Jan 2018 17:04:39 +1100 Subject: KVM: PPC: Book3S HV: Always flush TLB in kvmppc_alloc_reset_hpt() The KVM_PPC_ALLOCATE_HTAB ioctl(), implemented by kvmppc_alloc_reset_hpt() is supposed to completely clear and reset a guest's Hashed Page Table (HPT) allocating or re-allocating it if necessary. In the case where an HPT of the right size already exists and it just zeroes it, it forces a TLB flush on all guest CPUs, to remove any stale TLB entries loaded from the old HPT. However, that situation can arise when the HPT is resizing as well - or even when switching from an RPT to HPT - so those cases need a TLB flush as well. So, move the TLB flush to trigger in all cases except for errors. Cc: stable@vger.kernel.org # v4.10+ Fixes: f98a8bf9ee20 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size") Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8355398f0bb6..b73dbc9e797d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -165,8 +165,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) * Reset all the reverse-mapping chains for all memslots */ kvmppc_rmap_reset(kvm); - /* Ensure that each vcpu will flush its TLB on next entry. */ - cpumask_setall(&kvm->arch.need_tlb_flush); err = 0; goto out; } @@ -182,6 +180,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) kvmppc_set_hpt(kvm, &info); out: + if (err == 0) + /* Ensure that each vcpu will flush its TLB on next entry. */ + cpumask_setall(&kvm->arch.need_tlb_flush); + mutex_unlock(&kvm->lock); return err; } -- cgit v1.2.3 From e4c9fd10eb21376f44723c40ad12395089251c28 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 10 Jan 2018 08:34:28 +0100 Subject: ALSA: hda - Apply headphone noise quirk for another Dell XPS 13 variant There is another Dell XPS 13 variant (SSID 1028:082a) that requires the existing fixup for reducing the headphone noise. This patch adds the quirk entry for that. BugLink: http://lkml.kernel.org/r/CAHXyb9ZCZJzVisuBARa+UORcjRERV8yokez=DP1_5O5isTz0ZA@mail.gmail.com Reported-and-tested-by: Francisco G. Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8fd2d9c62c96..9aafc6c86132 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6196,6 +6196,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x1028, 0x0798, "Dell Inspiron 17 7000 Gaming", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER), + SND_PCI_QUIRK(0x1028, 0x082a, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), -- cgit v1.2.3 From 031f335cda879450095873003abb03ae8ed3b74a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 10 Jan 2018 10:53:18 +0100 Subject: ALSA: hda - Apply the existing quirk to iMac 14,1 iMac 14,1 requires the same quirk as iMac 12,2, using GPIO 2 and 3 for headphone and speaker output amps. Add the codec SSID quirk entry (106b:0600) accordingly. BugLink: http://lkml.kernel.org/r/CAEw6Zyteav09VGHRfD5QwsfuWv5a43r0tFBNbfcHXoNrxVz7ew@mail.gmail.com Reported-by: Freaky Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_cirrus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c index 80bbadc83721..d6e079f4ec09 100644 --- a/sound/pci/hda/patch_cirrus.c +++ b/sound/pci/hda/patch_cirrus.c @@ -408,6 +408,7 @@ static const struct snd_pci_quirk cs420x_fixup_tbl[] = { /*SND_PCI_QUIRK(0x8086, 0x7270, "IMac 27 Inch", CS420X_IMAC27),*/ /* codec SSID */ + SND_PCI_QUIRK(0x106b, 0x0600, "iMac 14,1", CS420X_IMAC27_122), SND_PCI_QUIRK(0x106b, 0x1c00, "MacBookPro 8,1", CS420X_MBP81), SND_PCI_QUIRK(0x106b, 0x2000, "iMac 12,2", CS420X_IMAC27_122), SND_PCI_QUIRK(0x106b, 0x2800, "MacBookPro 10,1", CS420X_MBP101), -- cgit v1.2.3 From aa8a5e0062ac940f7659394f4817c948dc8c0667 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Add support for RFI flush of L1-D cache On some CPUs we can prevent the Meltdown vulnerability by flushing the L1-D cache on exit from kernel to user mode, and from hypervisor to guest. This is known to be the case on at least Power7, Power8 and Power9. At this time we do not know the status of the vulnerability on other CPUs such as the 970 (Apple G5), pasemi CPUs (AmigaOne X1000) or Freescale CPUs. As more information comes to light we can enable this, or other mechanisms on those CPUs. The vulnerability occurs when the load of an architecturally inaccessible memory region (eg. userspace load of kernel memory) is speculatively executed to the point where its result can influence the address of a subsequent speculatively executed load. In order for that to happen, the first load must hit in the L1, because before the load is sent to the L2 the permission check is performed. Therefore if no kernel addresses hit in the L1 the vulnerability can not occur. We can ensure that is the case by flushing the L1 whenever we return to userspace. Similarly for hypervisor vs guest. In order to flush the L1-D cache on exit, we add a section of nops at each (h)rfi location that returns to a lower privileged context, and patch that with some sequence. Newer firmwares are able to advertise to us that there is a special nop instruction that flushes the L1-D. If we do not see that advertised, we fall back to doing a displacement flush in software. For guest kernels we support migration between some CPU versions, and different CPUs may use different flush instructions. So that we are prepared to migrate to a machine with a different flush instruction activated, we may have to patch more than one flush instruction at boot if the hypervisor tells us to. In the end this patch is mostly the work of Nicholas Piggin and Michael Ellerman. However a cast of thousands contributed to analysis of the issue, earlier versions of the patch, back ports testing etc. Many thanks to all of them. Tested-by: Jon Masters Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 40 ++++++++++++--- arch/powerpc/include/asm/feature-fixups.h | 13 +++++ arch/powerpc/include/asm/paca.h | 10 ++++ arch/powerpc/include/asm/setup.h | 13 +++++ arch/powerpc/kernel/asm-offsets.c | 5 ++ arch/powerpc/kernel/exceptions-64s.S | 84 +++++++++++++++++++++++++++++++ arch/powerpc/kernel/setup_64.c | 79 +++++++++++++++++++++++++++++ arch/powerpc/kernel/vmlinux.lds.S | 9 ++++ arch/powerpc/lib/feature-fixups.c | 41 +++++++++++++++ 9 files changed, 286 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index dfc56daed98b..7197b179c1b1 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -74,34 +74,58 @@ */ #define EX_R3 EX_DAR -/* Macros for annotating the expected destination of (h)rfid */ +/* + * Macros for annotating the expected destination of (h)rfid + * + * The nop instructions allow us to insert one or more instructions to flush the + * L1-D cache when returning to userspace or a guest. + */ +#define RFI_FLUSH_SLOT \ + RFI_FLUSH_FIXUP_SECTION; \ + nop; \ + nop; \ + nop #define RFI_TO_KERNEL \ rfid #define RFI_TO_USER \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_USER_OR_KERNEL \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_GUEST \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define HRFI_TO_KERNEL \ hrfid #define HRFI_TO_USER \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_USER_OR_KERNEL \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_GUEST \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_UNKNOWN \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 8f88f771cc55..1e82eb3caabd 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -187,7 +187,20 @@ label##3: \ FTR_ENTRY_OFFSET label##1b-label##3b; \ .popsection; +#define RFI_FLUSH_FIXUP_SECTION \ +951: \ + .pushsection __rfi_flush_fixup,"a"; \ + .align 2; \ +952: \ + FTR_ENTRY_OFFSET 951b-952b; \ + .popsection; + + #ifndef __ASSEMBLY__ +#include + +extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; + void apply_feature_fixups(void); void setup_feature_keys(void); #endif diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 3892db93b837..23ac7fc0af23 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -232,6 +232,16 @@ struct paca_struct { struct sibling_subcore_state *sibling_subcore_state; #endif #endif +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * rfi fallback flush must be in its own cacheline to prevent + * other paca data leaking into the L1d + */ + u64 exrfi[EX_SIZE] __aligned(0x80); + void *rfi_flush_fallback_area; + u64 l1d_flush_congruence; + u64 l1d_flush_sets; +#endif }; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index cf00ec26303a..469b7fdc9be4 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -39,6 +39,19 @@ static inline void pseries_big_endian_exceptions(void) {} static inline void pseries_little_endian_exceptions(void) {} #endif /* CONFIG_PPC_PSERIES */ +void rfi_flush_enable(bool enable); + +/* These are bit flags */ +enum l1d_flush_type { + L1D_FLUSH_NONE = 0x1, + L1D_FLUSH_FALLBACK = 0x2, + L1D_FLUSH_ORI = 0x4, + L1D_FLUSH_MTTRIG = 0x8, +}; + +void __init setup_rfi_flush(enum l1d_flush_type, bool enable); +void do_rfi_flush_fixups(enum l1d_flush_type types); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6b958414b4e0..f390d57cf2e1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -237,6 +237,11 @@ int main(void) OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp); OFFSET(PACA_IN_MCE, paca_struct, in_mce); OFFSET(PACA_IN_NMI, paca_struct, in_nmi); + OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area); + OFFSET(PACA_EXRFI, paca_struct, exrfi); + OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence); + OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets); + #endif OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id); OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ed356194f09c..2dc10bf646b8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1449,6 +1449,90 @@ masked_##_H##interrupt: \ b .; \ MASKED_DEC_HANDLER(_H) +TRAMP_REAL_BEGIN(rfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + rfid + +TRAMP_REAL_BEGIN(hrfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + hrfid + /* * Real mode exceptions actually use this too, but alternate * instruction code patches (which end up in the common .text area) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 8956a9856604..96163f4c3673 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -801,3 +801,82 @@ static int __init disable_hardlockup_detector(void) return 0; } early_initcall(disable_hardlockup_detector); + +#ifdef CONFIG_PPC_BOOK3S_64 +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +bool rfi_flush; + +static void do_nothing(void *unused) +{ + /* + * We don't need to do the flush explicitly, just enter+exit kernel is + * sufficient, the RFI exit handlers will do the right thing. + */ +} + +void rfi_flush_enable(bool enable) +{ + if (rfi_flush == enable) + return; + + if (enable) { + do_rfi_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else + do_rfi_flush_fixups(L1D_FLUSH_NONE); + + rfi_flush = enable; +} + +static void init_fallback_flush(void) +{ + u64 l1d_size, limit; + int cpu; + + l1d_size = ppc64_caches.l1d.size; + limit = min(safe_stack_limit(), ppc64_rma_size); + + /* + * Align to L1d size, and size it at 2x L1d size, to catch possible + * hardware prefetch runoff. We don't have a recipe for load patterns to + * reliably avoid the prefetcher. + */ + l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit)); + memset(l1d_flush_fallback_area, 0, l1d_size * 2); + + for_each_possible_cpu(cpu) { + /* + * The fallback flush is currently coded for 8-way + * associativity. Different associativity is possible, but it + * will be treated as 8-way and may not evict the lines as + * effectively. + * + * 128 byte lines are mandatory. + */ + u64 c = l1d_size / 8; + + paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area; + paca[cpu].l1d_flush_congruence = c; + paca[cpu].l1d_flush_sets = c / 128; + } +} + +void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ + if (types & L1D_FLUSH_FALLBACK) { + pr_info("rfi-flush: Using fallback displacement flush\n"); + init_fallback_flush(); + } + + if (types & L1D_FLUSH_ORI) + pr_info("rfi-flush: Using ori type flush\n"); + + if (types & L1D_FLUSH_MTTRIG) + pr_info("rfi-flush: Using mttrig type flush\n"); + + enabled_flush_types = types; + + rfi_flush_enable(enable); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 0494e1566ee2..307843d23682 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -132,6 +132,15 @@ SECTIONS /* Read-only data */ RO_DATA(PAGE_SIZE) +#ifdef CONFIG_PPC64 + . = ALIGN(8); + __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) { + __start___rfi_flush_fixup = .; + *(__rfi_flush_fixup) + __stop___rfi_flush_fixup = .; + } +#endif + EXCEPTION_TABLE(0) NOTES :kernel :notes diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 41cf5ae273cf..a95ea007d654 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -116,6 +116,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) } } +#ifdef CONFIG_PPC_BOOK3S_64 +void do_rfi_flush_fixups(enum l1d_flush_type types) +{ + unsigned int instrs[3], *dest; + long *start, *end; + int i; + + start = PTRRELOC(&__start___rfi_flush_fixup), + end = PTRRELOC(&__stop___rfi_flush_fixup); + + instrs[0] = 0x60000000; /* nop */ + instrs[1] = 0x60000000; /* nop */ + instrs[2] = 0x60000000; /* nop */ + + if (types & L1D_FLUSH_FALLBACK) + /* b .+16 to fallback flush */ + instrs[0] = 0x48000010; + + i = 0; + if (types & L1D_FLUSH_ORI) { + instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */ + instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + + for (i = 0; start < end; start++, i++) { + dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + patch_instruction(dest, instrs[0]); + patch_instruction(dest + 1, instrs[1]); + patch_instruction(dest + 2, instrs[2]); + } + + printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { long *start, *end; -- cgit v1.2.3 From bc9c9304a45480797e13a8e1df96ffcf44fb62fe Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Support disabling RFI flush with no_rfi_flush and nopti Because there may be some performance overhead of the RFI flush, add kernel command line options to disable it. We add a sensibly named 'no_rfi_flush' option, but we also hijack the x86 option 'nopti'. The RFI flush is not the same as KPTI, but if we see 'nopti' we can guess that the user is trying to avoid any overhead of Meltdown mitigations, and it means we don't have to educate every one about a different command line option. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 96163f4c3673..491be4179ddd 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -805,8 +805,29 @@ early_initcall(disable_hardlockup_detector); #ifdef CONFIG_PPC_BOOK3S_64 static enum l1d_flush_type enabled_flush_types; static void *l1d_flush_fallback_area; +static bool no_rfi_flush; bool rfi_flush; +static int __init handle_no_rfi_flush(char *p) +{ + pr_info("rfi-flush: disabled on command line."); + no_rfi_flush = true; + return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. + */ +static int __init handle_no_pti(char *p) +{ + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); + handle_no_rfi_flush(NULL); + return 0; +} +early_param("nopti", handle_no_pti); + static void do_nothing(void *unused) { /* @@ -877,6 +898,7 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) enabled_flush_types = types; - rfi_flush_enable(enable); + if (!no_rfi_flush) + rfi_flush_enable(enable); } #endif /* CONFIG_PPC_BOOK3S_64 */ -- cgit v1.2.3 From 8989d56878a7735dfdb234707a2fee6faf631085 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/pseries: Query hypervisor for RFI flush settings A new hypervisor call is available which tells the guest settings related to the RFI flush. Use it to query the appropriate flush instruction(s), and whether the flush is required. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/setup.c | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index a8531e012658..ae4f596273b5 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -459,6 +459,39 @@ static void __init find_and_init_phbs(void) of_pci_check_probe_only(); } +static void pseries_setup_rfi_flush(void) +{ + struct h_cpu_char_result result; + enum l1d_flush_type types; + bool enable; + long rc; + + /* Enable by default */ + enable = true; + + rc = plpar_get_cpu_characteristics(&result); + if (rc == H_SUCCESS) { + types = L1D_FLUSH_NONE; + + if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2) + types |= L1D_FLUSH_MTTRIG; + if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30) + types |= L1D_FLUSH_ORI; + + /* Use fallback if nothing set in hcall */ + if (types == L1D_FLUSH_NONE) + types = L1D_FLUSH_FALLBACK; + + if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) + enable = false; + } else { + /* Default to fallback if case hcall is not available */ + types = L1D_FLUSH_FALLBACK; + } + + setup_rfi_flush(types, enable); +} + static void __init pSeries_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); @@ -476,6 +509,8 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); + pseries_setup_rfi_flush(); + /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); -- cgit v1.2.3 From 6e032b350cd1fdb830f18f8320ef0e13b4e24094 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/powernv: Check device-tree for RFI flush settings New device-tree properties are available which tell the hypervisor settings related to the RFI flush. Use them to determine the appropriate flush instruction to use, and whether the flush is required. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 49 ++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 1edfbc1e40f4..4fb21e17504a 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -37,13 +37,62 @@ #include #include #include +#include #include "powernv.h" +static void pnv_setup_rfi_flush(void) +{ + struct device_node *np, *fw_features; + enum l1d_flush_type type; + int enable; + + /* Default to fallback in case fw-features are not available */ + type = L1D_FLUSH_FALLBACK; + enable = 1; + + np = of_find_node_by_name(NULL, "ibm,opal"); + fw_features = of_get_child_by_name(np, "fw-features"); + of_node_put(np); + + if (fw_features) { + np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_MTTRIG; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_ORI; + + of_node_put(np); + + /* Enable unless firmware says NOT to */ + enable = 2; + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + of_node_put(fw_features); + } + + setup_rfi_flush(type, enable > 0); +} + static void __init pnv_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + pnv_setup_rfi_flush(); + /* Initialize SMP */ pnv_smp_init(); -- cgit v1.2.3 From 76a4201191814a0061cb5c861fafb9ecaa764846 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 10 Jan 2018 12:14:28 +0100 Subject: xfrm: Fix a race in the xdst pcpu cache. We need to run xfrm_resolve_and_create_bundle() with bottom halves off. Otherwise we may reuse an already released dst_enty when the xfrm lookup functions are called from process context. Fixes: c30d78c14a813db39a647b6a348b428 ("xfrm: add xdst pcpu cache") Reported-by: Darius Ski Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index bc5eae12fb09..bd6b0e7a0ee4 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2063,8 +2063,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (num_xfrms <= 0) goto make_dummy_bundle; + local_bh_disable(); xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, - xflo->dst_orig); + xflo->dst_orig); + local_bh_enable(); + if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err != -EAGAIN) @@ -2151,9 +2154,12 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, goto no_transform; } + local_bh_disable(); xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); + local_bh_enable(); + if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); -- cgit v1.2.3 From d780537f9b49e9d714a454e5ed989d909beab8ec Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 10 Jan 2018 13:04:58 +0100 Subject: drm/tegra: sor: Fix hang on Tegra124 eDP The SOR0 found on Tegra124 and Tegra210 only supports eDP and LVDS and therefore has a slightly different clock tree than the SOR1 which does not support eDP, but HDMI and DP instead. Commit e1335e2f0cfc ("drm/tegra: sor: Reimplement pad clock") breaks setups with eDP because the sor->clk_out clock is uninitialized and therefore setting the parent clock (either the safe clock or either of the display PLLs) fails, which can cause hangs later on since there is no clock driving the module. Fix this by falling back to the module clock for sor->clk_out on those setups. This guarantees that the module will always be clocked by an enabled clock and hence prevents those hangs. Fixes: e1335e2f0cfc ("drm/tegra: sor: Reimplement pad clock") Reported-by: Guillaume Tucker Tested-by: Jon Hunter Signed-off-by: Thierry Reding --- drivers/gpu/drm/tegra/sor.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c index b0a1dedac802..476079f1255f 100644 --- a/drivers/gpu/drm/tegra/sor.c +++ b/drivers/gpu/drm/tegra/sor.c @@ -2656,6 +2656,9 @@ static int tegra_sor_probe(struct platform_device *pdev) name, err); goto remove; } + } else { + /* fall back to the module clock on SOR0 (eDP/LVDS only) */ + sor->clk_out = sor->clk; } sor->clk_parent = devm_clk_get(&pdev->dev, "parent"); -- cgit v1.2.3 From 951a010233625b77cde3430b4b8785a9a22968d1 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 9 Jan 2018 12:10:21 +0000 Subject: xen/gntdev: Fix off-by-one error when unmapping with holes If the requested range has a hole, the calculation of the number of pages to unmap is off by one. Fix it. Signed-off-by: Ross Lagerwall Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/gntdev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 57efbd3b053b..d3391a1e3796 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -380,10 +380,8 @@ static int unmap_grant_pages(struct grant_map *map, int offset, int pages) } range = 0; while (range < pages) { - if (map->unmap_ops[offset+range].handle == -1) { - range--; + if (map->unmap_ops[offset+range].handle == -1) break; - } range++; } err = __unmap_grant_pages(map, offset, range); -- cgit v1.2.3 From cf2acf66ad43abb39735568f55e1f85f9844e990 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 9 Jan 2018 12:10:22 +0000 Subject: xen/gntdev: Fix partial gntdev_mmap() cleanup When cleaning up after a partially successful gntdev_mmap(), unmap the successfully mapped grant pages otherwise Xen will kill the domain if in debug mode (Attempt to implicitly unmap a granted PTE) or Linux will kill the process and emit "BUG: Bad page map in process" if Xen is in release mode. This is only needed when use_ptemod is true because gntdev_put_map() will unmap grant pages itself when use_ptemod is false. Signed-off-by: Ross Lagerwall Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/gntdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index d3391a1e3796..bd56653b9bbc 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1071,8 +1071,10 @@ unlock_out: out_unlock_put: mutex_unlock(&priv->lock); out_put_map: - if (use_ptemod) + if (use_ptemod) { map->vma = NULL; + unmap_grant_pages(map, 0, map->count); + } gntdev_put_map(priv, map); return err; } -- cgit v1.2.3 From 0d9cac0ca0429830c40fe1a4e50e60f6221fd7b6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Jan 2018 12:40:04 +0300 Subject: drm/vmwgfx: Potential off by one in vmw_view_add() The vmw_view_cmd_to_type() function returns vmw_view_max (3) on error. It's one element beyond the end of the vmw_view_cotables[] table. My read on this is that it's possible to hit this failure. header->id comes from vmw_cmd_check() and it's a user controlled number between 1040 and 1225 so we can hit that error. But I don't have the hardware to test this code. Fixes: d80efd5cb3de ("drm/vmwgfx: Initial DX support") Signed-off-by: Dan Carpenter Reviewed-by: Thomas Hellstrom Cc: --- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index 21c62a34e558..87e8af5776a3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -2731,6 +2731,8 @@ static int vmw_cmd_dx_view_define(struct vmw_private *dev_priv, } view_type = vmw_view_cmd_to_type(header->id); + if (view_type == vmw_view_max) + return -EINVAL; cmd = container_of(header, typeof(*cmd), header); ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, user_surface_converter, -- cgit v1.2.3 From 612e8e9350fd19cae6900cf36ea0c6892d1a0dca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Jan 2018 12:28:16 +0100 Subject: x86/alternatives: Fix optimize_nops() checking The alternatives code checks only the first byte whether it is a NOP, but with NOPs in front of the payload and having actual instructions after it breaks the "optimized' test. Make sure to scan all bytes before deciding to optimize the NOPs in there. Reported-by: David Woodhouse Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Tom Lendacky Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Dave Hansen Cc: Andi Kleen Cc: Andrew Lutomirski Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/20180110112815.mgciyf5acwacphkq@pd.tnic --- arch/x86/kernel/alternative.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3344d3382e91..e0b97e4d1db5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -344,9 +344,12 @@ done: static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; + int i; - if (instr[0] != 0x90) - return; + for (i = 0; i < a->padlen; i++) { + if (instr[i] != 0x90) + return; + } local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); -- cgit v1.2.3 From cd52cb26e7ead5093635e98e07e221e4df482d34 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 26 Nov 2017 15:31:04 +0200 Subject: iser-target: Fix possible use-after-free in connection establishment error In case we fail to establish the connection we must drain our pre-posted login recieve work request before continuing safely with connection teardown. Fixes: a060b5629ab0 ("IB/core: generic RDMA READ/WRITE API") Cc: # 4.7+ Reported-by: Amrani, Ram Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/isert/ib_isert.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 720dfb3a1ac2..1b02283ce20e 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -741,6 +741,7 @@ isert_connect_error(struct rdma_cm_id *cma_id) { struct isert_conn *isert_conn = cma_id->qp->qp_context; + ib_drain_qp(isert_conn->qp); list_del_init(&isert_conn->node); isert_conn->cm_id = NULL; isert_put_conn(isert_conn); -- cgit v1.2.3 From 57194fa763bfa1a0908f30d4c77835beaa118fcb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 9 Jan 2018 23:03:46 +0300 Subject: IB/hfi1: Prevent a NULL dereference In the original code, we set "fd->uctxt" to NULL and then dereference it which will cause an Oops. Fixes: f2a3bc00a03c ("IB/hfi1: Protect context array set/clear with spinlock") Cc: # 4.14.x Signed-off-by: Dan Carpenter Reviewed-by: Michael J. Ruhl Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 7750a9c38b06..1df7da47f431 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -763,11 +763,11 @@ static int complete_subctxt(struct hfi1_filedata *fd) } if (ret) { - hfi1_rcd_put(fd->uctxt); - fd->uctxt = NULL; spin_lock_irqsave(&fd->dd->uctxt_lock, flags); __clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts); spin_unlock_irqrestore(&fd->dd->uctxt_lock, flags); + hfi1_rcd_put(fd->uctxt); + fd->uctxt = NULL; } return ret; -- cgit v1.2.3 From 40950343932879247861ae152dcb55e4555afdff Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 10 Jan 2018 09:20:54 +0000 Subject: bpf: fix spelling mistake: "obusing" -> "abusing" Trivial fix to spelling mistake in error message text. Signed-off-by: Colin Ian King Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b414d6b2d470..96ab165c873c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4472,7 +4472,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON) { - verbose(env, "tail_call obusing map_ptr\n"); + verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } if (!map_ptr->unpriv_array) -- cgit v1.2.3 From 7891a87efc7116590eaba57acc3c422487802c6f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Jan 2018 20:04:37 +0100 Subject: bpf: arsh is not supported in 32 bit alu thus reject it The following snippet was throwing an 'unknown opcode cc' warning in BPF interpreter: 0: (18) r0 = 0x0 2: (7b) *(u64 *)(r10 -16) = r0 3: (cc) (u32) r0 s>>= (u32) r0 4: (95) exit Although a number of JITs do support BPF_ALU | BPF_ARSH | BPF_{K,X} generation, not all of them do and interpreter does neither. We can leave existing ones and implement it later in bpf-next for the remaining ones, but reject this properly in verifier for the time being. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Reported-by: syzbot+93c4904c5c70348a6890@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++++ tools/testing/selftests/bpf/test_verifier.c | 40 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 96ab165c873c..20eb04fd155e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2493,6 +2493,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b51017404c62..6bafa5456568 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -272,6 +272,46 @@ static struct bpf_test tests[] = { .errstr = "invalid bpf_ld_imm64 insn", .result = REJECT, }, + { + "arsh32 on imm", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 5), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "BPF_ARSH not supported for 32 bit ALU", + }, + { + "arsh32 on reg", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "BPF_ARSH not supported for 32 bit ALU", + }, + { + "arsh64 on imm", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_0, 5), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "arsh64 on reg", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_ALU64_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, { "no bpf_exit", .insns = { -- cgit v1.2.3 From bbeb6e4323dad9b5e0ee9f60c223dd532e2403b1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Jan 2018 23:25:05 +0100 Subject: bpf, array: fix overflow in max_entries and undefined behavior in index_mask syzkaller tried to alloc a map with 0xfffffffd entries out of a userns, and thus unprivileged. With the recently added logic in b2157399cc98 ("bpf: prevent out-of-bounds speculation") we round this up to the next power of two value for max_entries for unprivileged such that we can apply proper masking into potentially zeroed out map slots. However, this will generate an index_mask of 0xffffffff, and therefore a + 1 will let this overflow into new max_entries of 0. This will pass allocation, etc, and later on map access we still enforce on the original attr->max_entries value which was 0xfffffffd, therefore triggering GPF all over the place. Thus bail out on overflow in such case. Moreover, on 32 bit archs roundup_pow_of_two() can also not be used, since fls_long(max_entries - 1) can result in 32 and 1UL << 32 in 32 bit space is undefined. Therefore, do this by hand in a 64 bit variable. This fixes all the issues triggered by syzkaller's reproducers. Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: syzbot+b0efb8e572d01bce1ae0@syzkaller.appspotmail.com Reported-by: syzbot+6c15e9744f75f2364773@syzkaller.appspotmail.com Reported-by: syzbot+d2f5524fb46fd3b312ee@syzkaller.appspotmail.com Reported-by: syzbot+61d23c95395cc90dbc2b@syzkaller.appspotmail.com Reported-by: syzbot+0d363c942452cca68c01@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index aaa319848e7d..ab94d304a634 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; - u64 array_size; + u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -74,13 +74,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; - index_mask = roundup_pow_of_two(max_entries) - 1; - if (unpriv) + /* On 32 bit archs roundup_pow_of_two() with max_entries that has + * upper most bit set in u32 space is undefined behavior due to + * resulting 1U << 32, so do it manually here in u64 space. + */ + mask64 = fls_long(max_entries - 1); + mask64 = 1ULL << mask64; + mask64 -= 1; + + index_mask = mask64; + if (unpriv) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ max_entries = index_mask + 1; + /* Check for overflows. */ + if (max_entries < attr->max_entries) + return ERR_PTR(-E2BIG); + } array_size = sizeof(*array); if (percpu) -- cgit v1.2.3 From 23b19b7b50fe1867da8d431eea9cd3e4b6328c2c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 10 Jan 2018 23:48:05 +0100 Subject: ALSA: pcm: Remove yet superfluous WARN_ON() muldiv32() contains a snd_BUG_ON() (which is morphed as WARN_ON() with debug option) for checking the case of 0 / 0. This would be helpful if this happens only as a logical error; however, since the hw refine is performed with any data set provided by user, the inconsistent values that can trigger such a condition might be passed easily. Actually, syzbot caught this by passing some zero'ed old hw_params ioctl. So, having snd_BUG_ON() there is simply superfluous and rather harmful to give unnecessary confusions. Let's get rid of it. Reported-by: syzbot+7e6ee55011deeebce15d@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai --- sound/core/pcm_lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index db7894bb028c..faa67861cbc1 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -560,7 +560,6 @@ static inline unsigned int muldiv32(unsigned int a, unsigned int b, { u_int64_t n = (u_int64_t) a * b; if (c == 0) { - snd_BUG_ON(!n); *r = 0; return UINT_MAX; } -- cgit v1.2.3 From 4636bda86aa1f34f45c629477476a0dcfa04e597 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 5 Jan 2018 00:59:05 -0800 Subject: drm/i915: Whitelist SLICE_COMMON_ECO_CHICKEN1 on Geminilake. Geminilake requires the 3D driver to select whether barriers are intended for compute shaders, or tessellation control shaders, by whacking a "Barrier Mode" bit in SLICE_COMMON_ECO_CHICKEN1 when switching pipelines. Failure to do this properly can result in GPU hangs. Unfortunately, this means it needs to switch mid-batch, so only userspace can properly set it. To facilitate this, the kernel needs to whitelist the register. The workarounds page currently tags this as applying to Broxton only, but that doesn't make sense. The documentation for the register it references says the bit userspace is supposed to toggle only exists on Geminilake. Empirically, the Mesa patch to toggle this bit appears to fix intermittent GPU hangs in tessellation control shader barrier tests on Geminilake; we haven't seen those hangs on Broxton. v2: Mention WA #0862 in the comment (it doesn't have a name). Signed-off-by: Kenneth Graunke Acked-by: Rodrigo Vivi Cc: stable@vger.kernel.org Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180105085905.9298-1-kenneth@whitecape.org (cherry picked from commit ab062639edb0412daf6de540725276b9a5d217f9) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_reg.h | 2 ++ drivers/gpu/drm/i915/intel_engine_cs.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 333f40bc03bb..7923dfd9963c 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7027,6 +7027,8 @@ enum { #define GEN9_SLICE_COMMON_ECO_CHICKEN0 _MMIO(0x7308) #define DISABLE_PIXEL_MASK_CAMMING (1<<14) +#define GEN9_SLICE_COMMON_ECO_CHICKEN1 _MMIO(0x731c) + #define GEN7_L3SQCREG1 _MMIO(0xB010) #define VLV_B0_WA_L3SQCREG1_VALUE 0x00D30000 diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index ab5bf4e2e28e..6074e04dc99f 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -1390,6 +1390,11 @@ static int glk_init_workarounds(struct intel_engine_cs *engine) if (ret) return ret; + /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */ + ret = wa_ring_whitelist_reg(engine, GEN9_SLICE_COMMON_ECO_CHICKEN1); + if (ret) + return ret; + /* WaToEnableHwFixForPushConstHWBug:glk */ WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); -- cgit v1.2.3 From 5005c8514285ae4f28e862f8d91faaa2015e03a3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 6 Jan 2018 10:56:18 +0000 Subject: drm/i915: Don't adjust priority on an already signaled fence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we retire a signaled fence, we free the dependency tree. However, we skip clearing the list so that if we then try to adjust the priority of the signaled fence, we may walk the list of freed dependencies. [ 3083.156757] ================================================================== [ 3083.156806] BUG: KASAN: use-after-free in execlists_schedule+0x199/0x660 [i915] [ 3083.156810] Read of size 8 at addr ffff8806bf20f400 by task Xorg/831 [ 3083.156815] CPU: 0 PID: 831 Comm: Xorg Not tainted 4.15.0-rc6-no-psn+ #1 [ 3083.156817] Hardware name: Notebook N24_25BU/N24_25BU, BIOS 5.12 02/17/2017 [ 3083.156818] Call Trace: [ 3083.156823] dump_stack+0x5c/0x7a [ 3083.156827] print_address_description+0x6b/0x290 [ 3083.156830] kasan_report+0x28f/0x380 [ 3083.156872] ? execlists_schedule+0x199/0x660 [i915] [ 3083.156914] execlists_schedule+0x199/0x660 [i915] [ 3083.156956] ? intel_crtc_atomic_check+0x146/0x4e0 [i915] [ 3083.156997] ? execlists_submit_request+0xe0/0xe0 [i915] [ 3083.157038] ? i915_vma_misplaced.part.4+0x25/0xb0 [i915] [ 3083.157079] ? __i915_vma_do_pin+0x7c8/0xc80 [i915] [ 3083.157121] ? intel_atomic_state_alloc+0x44/0x60 [i915] [ 3083.157130] ? drm_atomic_helper_page_flip+0x3e/0xb0 [drm_kms_helper] [ 3083.157145] ? drm_mode_page_flip_ioctl+0x7d2/0x850 [drm] [ 3083.157159] ? drm_ioctl_kernel+0xa7/0xf0 [drm] [ 3083.157172] ? drm_ioctl+0x45b/0x560 [drm] [ 3083.157211] i915_gem_object_wait_priority+0x14c/0x2c0 [i915] [ 3083.157251] ? i915_gem_get_aperture_ioctl+0x150/0x150 [i915] [ 3083.157290] ? i915_vma_pin_fence+0x1d8/0x320 [i915] [ 3083.157331] ? intel_pin_and_fence_fb_obj+0x175/0x250 [i915] [ 3083.157372] ? intel_rotation_info_size+0x60/0x60 [i915] [ 3083.157413] ? intel_link_compute_m_n+0x80/0x80 [i915] [ 3083.157428] ? drm_dev_printk+0x1b0/0x1b0 [drm] [ 3083.157443] ? drm_dev_printk+0x1b0/0x1b0 [drm] [ 3083.157485] intel_prepare_plane_fb+0x2f8/0x5a0 [i915] [ 3083.157527] ? intel_crtc_get_vblank_counter+0x80/0x80 [i915] [ 3083.157536] drm_atomic_helper_prepare_planes+0xa0/0x1c0 [drm_kms_helper] [ 3083.157587] intel_atomic_commit+0x12e/0x4e0 [i915] [ 3083.157605] drm_atomic_helper_page_flip+0xa2/0xb0 [drm_kms_helper] [ 3083.157621] drm_mode_page_flip_ioctl+0x7d2/0x850 [drm] [ 3083.157638] ? drm_mode_cursor2_ioctl+0x10/0x10 [drm] [ 3083.157652] ? drm_lease_owner+0x1a/0x30 [drm] [ 3083.157668] ? drm_mode_cursor2_ioctl+0x10/0x10 [drm] [ 3083.157681] drm_ioctl_kernel+0xa7/0xf0 [drm] [ 3083.157696] drm_ioctl+0x45b/0x560 [drm] [ 3083.157711] ? drm_mode_cursor2_ioctl+0x10/0x10 [drm] [ 3083.157725] ? drm_getstats+0x20/0x20 [drm] [ 3083.157729] ? timerqueue_del+0x49/0x80 [ 3083.157732] ? __remove_hrtimer+0x62/0xb0 [ 3083.157735] ? hrtimer_try_to_cancel+0x173/0x210 [ 3083.157738] do_vfs_ioctl+0x13b/0x880 [ 3083.157741] ? ioctl_preallocate+0x140/0x140 [ 3083.157744] ? _raw_spin_unlock_irq+0xe/0x30 [ 3083.157746] ? do_setitimer+0x234/0x370 [ 3083.157750] ? SyS_setitimer+0x19e/0x1b0 [ 3083.157752] ? SyS_alarm+0x140/0x140 [ 3083.157755] ? __rcu_read_unlock+0x66/0x80 [ 3083.157757] ? __fget+0xc4/0x100 [ 3083.157760] SyS_ioctl+0x74/0x80 [ 3083.157763] entry_SYSCALL_64_fastpath+0x1a/0x7d [ 3083.157765] RIP: 0033:0x7f6135d0c6a7 [ 3083.157767] RSP: 002b:00007fff01451888 EFLAGS: 00003246 ORIG_RAX: 0000000000000010 [ 3083.157769] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f6135d0c6a7 [ 3083.157771] RDX: 00007fff01451950 RSI: 00000000c01864b0 RDI: 000000000000000c [ 3083.157772] RBP: 00007f613076f600 R08: 0000000000000001 R09: 0000000000000000 [ 3083.157773] R10: 0000000000000060 R11: 0000000000003246 R12: 0000000000000000 [ 3083.157774] R13: 0000000000000060 R14: 000000000000001b R15: 0000000000000060 [ 3083.157779] Allocated by task 831: [ 3083.157783] kmem_cache_alloc+0xc0/0x200 [ 3083.157822] i915_gem_request_await_dma_fence+0x2c4/0x5d0 [i915] [ 3083.157861] i915_gem_request_await_object+0x321/0x370 [i915] [ 3083.157900] i915_gem_do_execbuffer+0x1165/0x19c0 [i915] [ 3083.157937] i915_gem_execbuffer2+0x1ad/0x550 [i915] [ 3083.157950] drm_ioctl_kernel+0xa7/0xf0 [drm] [ 3083.157962] drm_ioctl+0x45b/0x560 [drm] [ 3083.157964] do_vfs_ioctl+0x13b/0x880 [ 3083.157966] SyS_ioctl+0x74/0x80 [ 3083.157968] entry_SYSCALL_64_fastpath+0x1a/0x7d [ 3083.157971] Freed by task 831: [ 3083.157973] kmem_cache_free+0x77/0x220 [ 3083.158012] i915_gem_request_retire+0x72c/0xa70 [i915] [ 3083.158051] i915_gem_request_alloc+0x1e9/0x8b0 [i915] [ 3083.158089] i915_gem_do_execbuffer+0xa96/0x19c0 [i915] [ 3083.158127] i915_gem_execbuffer2+0x1ad/0x550 [i915] [ 3083.158140] drm_ioctl_kernel+0xa7/0xf0 [drm] [ 3083.158153] drm_ioctl+0x45b/0x560 [drm] [ 3083.158155] do_vfs_ioctl+0x13b/0x880 [ 3083.158156] SyS_ioctl+0x74/0x80 [ 3083.158158] entry_SYSCALL_64_fastpath+0x1a/0x7d [ 3083.158162] The buggy address belongs to the object at ffff8806bf20f400 which belongs to the cache i915_dependency of size 64 [ 3083.158166] The buggy address is located 0 bytes inside of 64-byte region [ffff8806bf20f400, ffff8806bf20f440) [ 3083.158168] The buggy address belongs to the page: [ 3083.158171] page:00000000d43decc4 count:1 mapcount:0 mapping: (null) index:0x0 [ 3083.158174] flags: 0x17ffe0000000100(slab) [ 3083.158179] raw: 017ffe0000000100 0000000000000000 0000000000000000 0000000180200020 [ 3083.158182] raw: ffffea001afc16c0 0000000500000005 ffff880731b881c0 0000000000000000 [ 3083.158184] page dumped because: kasan: bad access detected [ 3083.158187] Memory state around the buggy address: [ 3083.158190] ffff8806bf20f300: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158192] ffff8806bf20f380: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158195] >ffff8806bf20f400: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158196] ^ [ 3083.158199] ffff8806bf20f480: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158201] ffff8806bf20f500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158203] ================================================================== Reported-by: Alexandru Chirvasitu Reported-by: Mike Keehan Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104436 Fixes: 1f181225f8ec ("drm/i915/execlists: Keep request->priority for its lifetime") Signed-off-by: Chris Wilson Cc: Alexandru Chirvasitu Cc: MichaÅ‚ Winiarski Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Tested-by: Alexandru Chirvasitu Reviewed-by: MichaÅ‚ Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20180106105618.13532-1-chris@chris-wilson.co.uk (cherry picked from commit c218ee03b9315073ce43992792554dafa0626eb8) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/intel_lrc.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 18de6569d04a..5cfba89ed586 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -467,7 +467,7 @@ static void __fence_set_priority(struct dma_fence *fence, int prio) struct drm_i915_gem_request *rq; struct intel_engine_cs *engine; - if (!dma_fence_is_i915(fence)) + if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence)) return; rq = to_request(fence); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index d36e25607435..e71a8cd50498 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -974,6 +974,9 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) GEM_BUG_ON(prio == I915_PRIORITY_INVALID); + if (i915_gem_request_completed(request)) + return; + if (prio <= READ_ONCE(request->priotree.priority)) return; -- cgit v1.2.3 From 2a266f23550be997d783f27e704b9b40c4010292 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 10 Jan 2018 21:44:42 +0800 Subject: KVM MMU: check pending exception before injecting APF For example, when two APF's for page ready happen after one exit and the first one becomes pending, the second one will result in #DF. Instead, just handle the second page fault synchronously. Reported-by: Ross Zwisler Message-ID: Reported-by: Alec Blayne Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c4deb1f34faa..e577bacd4bd0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3781,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu))) + kvm_event_needs_reinjection(vcpu) || + vcpu->arch.exception.pending)) return false; if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) -- cgit v1.2.3 From b3defb791b26ea0683a93a4f49c77ec45ec96f10 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 9 Jan 2018 23:11:03 +0100 Subject: ALSA: seq: Make ioctls race-free The ALSA sequencer ioctls have no protection against racy calls while the concurrent operations may lead to interfere with each other. As reported recently, for example, the concurrent calls of setting client pool with a combination of write calls may lead to either the unkillable dead-lock or UAF. As a slightly big hammer solution, this patch introduces the mutex to make each ioctl exclusive. Although this may reduce performance via parallel ioctl calls, usually it's not demanded for sequencer usages, hence it should be negligible. Reported-by: Luo Quan Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Cc: Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 3 +++ sound/core/seq/seq_clientmgr.h | 1 + 2 files changed, 4 insertions(+) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 6e22eea72654..d01913404581 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -221,6 +221,7 @@ static struct snd_seq_client *seq_create_client1(int client_index, int poolsize) rwlock_init(&client->ports_lock); mutex_init(&client->ports_mutex); INIT_LIST_HEAD(&client->ports_list_head); + mutex_init(&client->ioctl_mutex); /* find free slot in the client table */ spin_lock_irqsave(&clients_lock, flags); @@ -2130,7 +2131,9 @@ static long snd_seq_ioctl(struct file *file, unsigned int cmd, return -EFAULT; } + mutex_lock(&client->ioctl_mutex); err = handler->func(client, &buf); + mutex_unlock(&client->ioctl_mutex); if (err >= 0) { /* Some commands includes a bug in 'dir' field. */ if (handler->cmd == SNDRV_SEQ_IOCTL_SET_QUEUE_CLIENT || diff --git a/sound/core/seq/seq_clientmgr.h b/sound/core/seq/seq_clientmgr.h index c6614254ef8a..0611e1e0ed5b 100644 --- a/sound/core/seq/seq_clientmgr.h +++ b/sound/core/seq/seq_clientmgr.h @@ -61,6 +61,7 @@ struct snd_seq_client { struct list_head ports_list_head; rwlock_t ports_lock; struct mutex ports_mutex; + struct mutex ioctl_mutex; int convert32; /* convert 32->64bit */ /* output pool */ -- cgit v1.2.3 From ab271bd4dfd568060ffcf5a21b667c7c5df7ab99 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jan 2018 17:26:59 +0100 Subject: x86: kvm: propagate register_shrinker return code Patch "mm,vmscan: mark register_shrinker() as __must_check" is queued for 4.16 in linux-mm and adds a warning about the unchecked call to register_shrinker: arch/x86/kvm/mmu.c:5485:2: warning: ignoring return value of 'register_shrinker', declared with attribute warn_unused_result [-Wunused-result] This changes the kvm_mmu_module_init() function to fail itself when the call to register_shrinker fails. Signed-off-by: Arnd Bergmann Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e577bacd4bd0..2b8eb4da4d08 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5466,30 +5466,34 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + int ret = -ENOMEM; + kvm_mmu_clear_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); if (!pte_list_desc_cache) - goto nomem; + goto out; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) - goto nomem; + goto out; if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto nomem; + goto out; - register_shrinker(&mmu_shrinker); + ret = register_shrinker(&mmu_shrinker); + if (ret) + goto out; return 0; -nomem: +out: mmu_destroy_caches(); - return -ENOMEM; + return ret; } /* -- cgit v1.2.3 From bd89525a823ce6edddcedbe9aed79faa1b9cf544 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jan 2018 16:55:24 +0100 Subject: KVM: x86: emulate #UD while in guest mode This reverts commits ae1f57670703656cc9f293722c3b8b6782f8ab3f and ac9b305caa0df6f5b75d294e4b86c1027648991e. If the hardware doesn't support MOVBE, but L0 sets CPUID.01H:ECX.MOVBE in L1's emulated CPUID information, then L1 is likely to pass that CPUID bit through to L2. L2 will expect MOVBE to work, but if L1 doesn't intercept #UD, then any MOVBE instruction executed in L2 will raise #UD, and the exception will be delivered in L2. Commit ac9b305caa0df6f5b75d294e4b86c1027648991e is a better and more complete version of ae1f57670703 ("KVM: nVMX: Do not emulate #UD while in guest mode"); however, neither considers the above case. Suggested-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 9 +-------- arch/x86/kvm/vmx.c | 5 +---- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bb31c801f1fc..3158dac87f82 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -361,7 +361,6 @@ static void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; struct nested_state *g; - u32 h_intercept_exceptions; mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -372,14 +371,9 @@ static void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested; - /* No need to intercept #UD if L1 doesn't intercept it */ - h_intercept_exceptions = - h->intercept_exceptions & ~(1U << UD_VECTOR); - c->intercept_cr = h->intercept_cr | g->intercept_cr; c->intercept_dr = h->intercept_dr | g->intercept_dr; - c->intercept_exceptions = - h_intercept_exceptions | g->intercept_exceptions; + c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; c->intercept = h->intercept | g->intercept; } @@ -2202,7 +2196,6 @@ static int ud_interception(struct vcpu_svm *svm) { int er; - WARN_ON_ONCE(is_guest_mode(&svm->vcpu)); er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); if (er == EMULATE_USER_EXIT) return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5c14d65f676a..427fd3200dd8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1887,7 +1887,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << MC_VECTOR) | + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == @@ -1905,8 +1905,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; - else - eb |= 1u << UD_VECTOR; vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -5917,7 +5915,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 1; /* already handled by vmx_vcpu_run() */ if (is_invalid_opcode(intr_info)) { - WARN_ON_ONCE(is_guest_mode(vcpu)); er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); if (er == EMULATE_USER_EXIT) return 0; -- cgit v1.2.3 From 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Wed, 10 Jan 2018 10:12:03 -0800 Subject: KVM: x86: Add memory barrier on vmcs field lookup This adds a memory barrier when performing a lookup into the vmcs_field_to_offset_table. This is related to CVE-2017-5753. Signed-off-by: Andrew Honig Reviewed-by: Jim Mattson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a6f4f095f8f4..7f8fcc5ce664 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -884,8 +884,16 @@ static inline short vmcs_field_to_offset(unsigned long field) { BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || - vmcs_field_to_offset_table[field] == 0) + if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + return -ENOENT; + + /* + * FIXME: Mitigation for CVE-2017-5753. To be replaced with a + * generic mechanism. + */ + asm("lfence"); + + if (vmcs_field_to_offset_table[field] == 0) return -ENOENT; return vmcs_field_to_offset_table[field]; -- cgit v1.2.3 From f32ab7547161b9fa7ebfbc4f18ea1eb3fd49fe25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FChristian=3D20K=3DC3=3DB6nig=3F=3D?= Date: Thu, 11 Jan 2018 14:23:29 +0100 Subject: x86/PCI: Add "pci=big_root_window" option for AMD 64-bit windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only try to enable a 64-bit window on AMD CPUs when "pci=big_root_window" is specified. This taints the kernel because the new 64-bit window uses address space we don't know anything about, and it may contain unreported devices or memory that would conflict with the window. The pci_amd_enable_64bit_bar() quirk that enables the window is specific to AMD CPUs. The generic solution would be to have the firmware enable the window and describe it in the host bridge's _CRS method, or at least describe it in the _PRS method so the OS would have the option of enabling it. Signed-off-by: Christian König [bhelgaas: changelog, extend doc, mention taint in dmesg] Signed-off-by: Bjorn Helgaas --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/common.c | 5 +++++ arch/x86/pci/fixup.c | 7 ++++++- 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6571fbfdb2a1..619638362416 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3094,6 +3094,12 @@ pcie_scan_all Scan all possible PCIe devices. Otherwise we only look for one device below a PCIe downstream port. + big_root_window Try to add a big 64bit memory window to the PCIe + root complex on AMD CPUs. Some GFX hardware + can resize a BAR to allow access to all VRAM. + Adding the window is slightly risky (it may + conflict with unreported devices), so this + taints the kernel. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 7a5d6695abd3..eb66fa9cd0fc 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -38,6 +38,7 @@ do { \ #define PCI_NOASSIGN_ROMS 0x80000 #define PCI_ROOT_NO_CRS 0x100000 #define PCI_NOASSIGN_BARS 0x200000 +#define PCI_BIG_ROOT_WINDOW 0x400000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7a5350d08cef..563049c483a1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -594,6 +594,11 @@ char *__init pcibios_setup(char *str) } else if (!strcmp(str, "nocrs")) { pci_probe |= PCI_ROOT_NO_CRS; return NULL; +#ifdef CONFIG_PHYS_ADDR_T_64BIT + } else if (!strcmp(str, "big_root_window")) { + pci_probe |= PCI_BIG_ROOT_WINDOW; + return NULL; +#endif } else if (!strcmp(str, "earlydump")) { pci_early_dump_regs = 1; return NULL; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e663d6bf1328..8bad19c7473d 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -667,6 +667,9 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) struct resource *res, *conflict; struct pci_dev *other; + if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) + return; + /* Check that we are the only device of that type */ other = pci_get_device(dev->vendor, dev->device, NULL); if (other != dev || @@ -714,7 +717,9 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) res->start = conflict->end + 1; } - dev_info(&dev->dev, "adding root bus resource %pR\n", res); + dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", + res); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) | AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK; -- cgit v1.2.3 From b8626f1dc29d3eee444bfaa92146ec7b291ef41c Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 11 Jan 2018 14:47:40 +0100 Subject: usb: misc: usb3503: make sure reset is low for at least 100us When using a GPIO which is high by default, and initialize the driver in USB Hub mode, initialization fails with: [ 111.757794] usb3503 0-0008: SP_ILOCK failed (-5) The reason seems to be that the chip is not properly reset. Probe does initialize reset low, however some lines later the code already set it back high, which is not long enouth. Make sure reset is asserted for at least 100us by inserting a delay after initializing the reset pin during probe. Signed-off-by: Stefan Agner Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/usb3503.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/misc/usb3503.c b/drivers/usb/misc/usb3503.c index 465dbf68b463..f723f7b8c9ac 100644 --- a/drivers/usb/misc/usb3503.c +++ b/drivers/usb/misc/usb3503.c @@ -279,6 +279,8 @@ static int usb3503_probe(struct usb3503 *hub) if (gpio_is_valid(hub->gpio_reset)) { err = devm_gpio_request_one(dev, hub->gpio_reset, GPIOF_OUT_INIT_LOW, "usb3503 reset"); + /* Datasheet defines a hardware reset to be at least 100us */ + usleep_range(100, 10000); if (err) { dev_err(dev, "unable to request GPIO %d as reset pin (%d)\n", -- cgit v1.2.3 From 1a2e91e795def04e15fac87b8e16b635691d0b82 Mon Sep 17 00:00:00 2001 From: Bin Liu Date: Tue, 9 Jan 2018 13:27:17 -0600 Subject: Documentation: usb: fix typo in UVC gadgetfs config command This seems to be a copy&paste error. With the fix the uvc gadget now can be created by following the instrucitons. Signed-off-by: Bin Liu Signed-off-by: Greg Kroah-Hartman --- Documentation/usb/gadget-testing.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/usb/gadget-testing.txt b/Documentation/usb/gadget-testing.txt index 441a4b9b666f..5908a21fddb6 100644 --- a/Documentation/usb/gadget-testing.txt +++ b/Documentation/usb/gadget-testing.txt @@ -693,7 +693,7 @@ such specification consists of a number of lines with an inverval value in each line. The rules stated above are best illustrated with an example: # mkdir functions/uvc.usb0/control/header/h -# cd functions/uvc.usb0/control/header/h +# cd functions/uvc.usb0/control/ # ln -s header/h class/fs # ln -s header/h class/ss # mkdir -p functions/uvc.usb0/streaming/uncompressed/u/360p -- cgit v1.2.3 From 03a551734cfc2b93f83950a595974e3c9cbd82fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FChristian=3D20K=3DC3=3DB6nig=3F=3D?= Date: Thu, 11 Jan 2018 14:23:30 +0100 Subject: x86/PCI: Move and shrink AMD 64-bit window to avoid conflict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid problems with BIOS implementations which don't report all used resources to the OS by only allocating a 256GB window directly below the hardware limit (from the BKDG, sec 2.4.6). Fixes a silent reboot loop reported by Aaro Koskinen on an AMD-based MSI MS-7699/760GA-P43(FX) system. This was apparently caused by RAM or other unreported hardware that conflicted with the new window. Link: https://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf Link: https://lkml.kernel.org/r/20180105220412.fzpwqe4zljdawr36@darkstar.musicnaut.iki.fi Fixes: fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") Reported-by: Aaro Koskinen Signed-off-by: Christian König [bhelgaas: changelog, comment, Fixes:] Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 8bad19c7473d..f6a26e3cb476 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -662,10 +662,11 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); */ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) { - unsigned i; u32 base, limit, high; - struct resource *res, *conflict; struct pci_dev *other; + struct resource *res; + unsigned i; + int r; if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) return; @@ -702,19 +703,20 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) if (!res) return; + /* + * Allocate a 256GB window directly below the 0xfd00000000 hardware + * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6). + */ res->name = "PCI Bus 0000:00"; res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_WINDOW; - res->start = 0x100000000ull; + res->start = 0xbd00000000ull; res->end = 0xfd00000000ull - 1; - /* Just grab the free area behind system memory for this */ - while ((conflict = request_resource_conflict(&iomem_resource, res))) { - if (conflict->end >= res->end) { - kfree(res); - return; - } - res->start = conflict->end + 1; + r = request_resource(&iomem_resource, res); + if (r) { + kfree(res); + return; } dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", -- cgit v1.2.3 From 445b69e3b75e42362a5bdc13c8b8f61599e2228a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 10 Jan 2018 14:49:39 -0800 Subject: x86/pti: Make unpoison of pgd for trusted boot work for real The inital fix for trusted boot and PTI potentially misses the pgd clearing if pud_alloc() sets a PGD. It probably works in *practice* because for two adjacent calls to map_tboot_page() that share a PGD entry, the first will clear NX, *then* allocate and set the PGD (without NX clear). The second call will *not* allocate but will clear the NX bit. Defer the NX clearing to a point after it is known that all top-level allocations have occurred. Add a comment to clarify why. [ tglx: Massaged changelog ] Fixes: 262b6b30087 ("x86/tboot: Unbreak tboot with PTI enabled") Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Andrea Arcangeli Cc: Jon Masters Cc: "Tim Chen" Cc: gnomes@lxorguk.ukuu.org.uk Cc: peterz@infradead.org Cc: ning.sun@intel.com Cc: tboot-devel@lists.sourceforge.net Cc: andi@firstfloor.org Cc: luto@kernel.org Cc: law@redhat.com Cc: pbonzini@redhat.com Cc: torvalds@linux-foundation.org Cc: gregkh@linux-foundation.org Cc: dwmw@amazon.co.uk Cc: nickc@redhat.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180110224939.2695CD47@viggo.jf.intel.com --- arch/x86/kernel/tboot.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 75869a4b6c41..a2486f444073 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -127,7 +127,6 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, p4d = p4d_alloc(&tboot_mm, pgd, vaddr); if (!p4d) return -1; - pgd->pgd &= ~_PAGE_NX; pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; @@ -139,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); pte_unmap(pte); + + /* + * PTI poisons low addresses in the kernel page tables in the + * name of making them unusable for userspace. To execute + * code at such a low address, the poison must be cleared. + * + * Note: 'pgd' actually gets set in p4d_alloc() _or_ + * pud_alloc() depending on 4/5-level paging. + */ + pgd->pgd &= ~_PAGE_NX; + return 0; } -- cgit v1.2.3 From 8978cc921fc7fad3f4d6f91f1da01352aeeeff25 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 9 Jan 2018 11:41:10 +0200 Subject: {net,ib}/mlx5: Don't disable local loopback multicast traffic when needed There are systems platform information management interfaces (such as HOST2BMC) for which we cannot disable local loopback multicast traffic. Separate disable_local_lb_mc and disable_local_lb_uc capability bits so driver will not disable multicast loopback traffic if not supported. (It is expected that Firmware will not set disable_local_lb_mc if HOST2BMC is running for example.) Function mlx5_nic_vport_update_local_lb will do best effort to disable/enable UC/MC loopback traffic and return success only in case it succeeded to changed all allowed by Firmware. Adapt mlx5_ib and mlx5e to support the new cap bits. Fixes: 2c43c5a036be ("net/mlx5e: Enable local loopback in loopback selftest") Fixes: c85023e153e3 ("IB/mlx5: Add raw ethernet local loopback support") Fixes: bded747bb432 ("net/mlx5: Add raw ethernet local loopback firmware command") Signed-off-by: Eran Ben Elisha Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 9 +++++--- .../net/ethernet/mellanox/mlx5/core/en_selftest.c | 27 ++++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 22 +++++++++++++----- include/linux/mlx5/mlx5_ifc.h | 5 ++-- 5 files changed, 44 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8ac50de2b242..00cb184fa027 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1324,7 +1324,8 @@ static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) return err; if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || - !MLX5_CAP_GEN(dev->mdev, disable_local_lb)) + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return err; mutex_lock(&dev->lb_mutex); @@ -1342,7 +1343,8 @@ static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) mlx5_core_dealloc_transport_domain(dev->mdev, tdn); if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || - !MLX5_CAP_GEN(dev->mdev, disable_local_lb)) + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return; mutex_lock(&dev->lb_mutex); @@ -4187,7 +4189,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) } if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && - MLX5_CAP_GEN(mdev, disable_local_lb)) + (MLX5_CAP_GEN(mdev, disable_local_lb_uc) || + MLX5_CAP_GEN(mdev, disable_local_lb_mc))) mutex_init(&dev->lb_mutex); dev->ib_active = true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 1f1f8af87d4d..5a4608281f38 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -238,15 +238,19 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, int err = 0; /* Temporarily enable local_lb */ - if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { - mlx5_nic_vport_query_local_lb(priv->mdev, &lbtp->local_lb); - if (!lbtp->local_lb) - mlx5_nic_vport_update_local_lb(priv->mdev, true); + err = mlx5_nic_vport_query_local_lb(priv->mdev, &lbtp->local_lb); + if (err) + return err; + + if (!lbtp->local_lb) { + err = mlx5_nic_vport_update_local_lb(priv->mdev, true); + if (err) + return err; } err = mlx5e_refresh_tirs(priv, true); if (err) - return err; + goto out; lbtp->loopback_ok = false; init_completion(&lbtp->comp); @@ -256,16 +260,21 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, lbtp->pt.dev = priv->netdev; lbtp->pt.af_packet_priv = lbtp; dev_add_pack(&lbtp->pt); + + return 0; + +out: + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, false); + return err; } static void mlx5e_test_loopback_cleanup(struct mlx5e_priv *priv, struct mlx5e_lbt_priv *lbtp) { - if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { - if (!lbtp->local_lb) - mlx5_nic_vport_update_local_lb(priv->mdev, false); - } + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, false); dev_remove_pack(&lbtp->pt); mlx5e_refresh_tirs(priv, false); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 8a89c7e8cd63..95e188d0883e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -578,8 +578,7 @@ static int mlx5_core_set_hca_defaults(struct mlx5_core_dev *dev) int ret = 0; /* Disable local_lb by default */ - if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && - MLX5_CAP_GEN(dev, disable_local_lb)) + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) ret = mlx5_nic_vport_update_local_lb(dev, false); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index d653b0025b13..a1296a62497d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -908,23 +908,33 @@ int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) void *in; int err; - mlx5_core_dbg(mdev, "%s local_lb\n", enable ? "enable" : "disable"); + if (!MLX5_CAP_GEN(mdev, disable_local_lb_mc) && + !MLX5_CAP_GEN(mdev, disable_local_lb_uc)) + return 0; + in = kvzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; - MLX5_SET(modify_nic_vport_context_in, in, - field_select.disable_mc_local_lb, 1); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.disable_mc_local_lb, !enable); - - MLX5_SET(modify_nic_vport_context_in, in, - field_select.disable_uc_local_lb, 1); MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.disable_uc_local_lb, !enable); + if (MLX5_CAP_GEN(mdev, disable_local_lb_mc)) + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_mc_local_lb, 1); + + if (MLX5_CAP_GEN(mdev, disable_local_lb_uc)) + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_uc_local_lb, 1); + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + if (!err) + mlx5_core_dbg(mdev, "%s local_lb\n", + enable ? "enable" : "disable"); + kvfree(in); return err; } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d44ec5f41d4a..1391a82da98e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1027,8 +1027,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_wq_sz[0x5]; u8 nic_vport_change_event[0x1]; - u8 disable_local_lb[0x1]; - u8 reserved_at_3e2[0x9]; + u8 disable_local_lb_uc[0x1]; + u8 disable_local_lb_mc[0x1]; + u8 reserved_at_3e3[0x8]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; -- cgit v1.2.3 From 39b735332cb8b33a27c28592d969e4016c86c3ea Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 11 Jan 2018 21:46:23 +0000 Subject: objtool: Detect jumps to retpoline thunks A direct jump to a retpoline thunk is really an indirect jump in disguise. Change the objtool instruction type accordingly. Objtool needs to know where indirect branches are so it can detect switch statement jump tables. This fixes a bunch of warnings with CONFIG_RETPOLINE like: arch/x86/events/intel/uncore_nhmex.o: warning: objtool: nhmex_rbox_msr_enable_event()+0x44: sibling call from callable instruction with modified stack frame kernel/signal.o: warning: objtool: copy_siginfo_to_user()+0x91: sibling call from callable instruction with modified stack frame ... Signed-off-by: Josh Poimboeuf Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-2-git-send-email-dwmw@amazon.co.uk --- tools/objtool/check.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 9b341584eb1b..de053fb7049b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -456,6 +456,13 @@ static int add_jump_destinations(struct objtool_file *file) } else if (rela->sym->sec->idx) { dest_sec = rela->sym->sec; dest_off = rela->sym->sym.st_value + rela->addend + 4; + } else if (strstr(rela->sym->name, "_indirect_thunk_")) { + /* + * Retpoline jumps are really dynamic jumps in + * disguise, so convert them accordingly. + */ + insn->type = INSN_JUMP_DYNAMIC; + continue; } else { /* sibling call */ insn->jump_dest = 0; -- cgit v1.2.3 From 258c76059cece01bebae098e81bacb1af2edad17 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 11 Jan 2018 21:46:24 +0000 Subject: objtool: Allow alternatives to be ignored Getting objtool to understand retpolines is going to be a bit of a challenge. For now, take advantage of the fact that retpolines are patched in with alternatives. Just read the original (sane) non-alternative instruction, and ignore the patched-in retpoline. This allows objtool to understand the control flow *around* the retpoline, even if it can't yet follow what's inside. This means the ORC unwinder will fail to unwind from inside a retpoline, but will work fine otherwise. Signed-off-by: Josh Poimboeuf Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-3-git-send-email-dwmw@amazon.co.uk --- tools/objtool/check.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++----- tools/objtool/check.h | 2 +- 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index de053fb7049b..f40d46e24bcc 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -427,6 +427,40 @@ static void add_ignores(struct objtool_file *file) } } +/* + * FIXME: For now, just ignore any alternatives which add retpolines. This is + * a temporary hack, as it doesn't allow ORC to unwind from inside a retpoline. + * But it at least allows objtool to understand the control flow *around* the + * retpoline. + */ +static int add_nospec_ignores(struct objtool_file *file) +{ + struct section *sec; + struct rela *rela; + struct instruction *insn; + + sec = find_section_by_name(file->elf, ".rela.discard.nospec"); + if (!sec) + return 0; + + list_for_each_entry(rela, &sec->rela_list, list) { + if (rela->sym->type != STT_SECTION) { + WARN("unexpected relocation symbol type in %s", sec->name); + return -1; + } + + insn = find_insn(file, rela->sym->sec, rela->addend); + if (!insn) { + WARN("bad .discard.nospec entry"); + return -1; + } + + insn->ignore_alts = true; + } + + return 0; +} + /* * Find the destination instructions for all jumps. */ @@ -509,11 +543,18 @@ static int add_call_destinations(struct objtool_file *file) dest_off = insn->offset + insn->len + insn->immediate; insn->call_dest = find_symbol_by_offset(insn->sec, dest_off); + /* + * FIXME: Thanks to retpolines, it's now considered + * normal for a function to call within itself. So + * disable this warning for now. + */ +#if 0 if (!insn->call_dest) { WARN_FUNC("can't find call dest symbol at offset 0x%lx", insn->sec, insn->offset, dest_off); return -1; } +#endif } else if (rela->sym->type == STT_SECTION) { insn->call_dest = find_symbol_by_offset(rela->sym->sec, rela->addend+4); @@ -678,12 +719,6 @@ static int add_special_section_alts(struct objtool_file *file) return ret; list_for_each_entry_safe(special_alt, tmp, &special_alts, list) { - alt = malloc(sizeof(*alt)); - if (!alt) { - WARN("malloc failed"); - ret = -1; - goto out; - } orig_insn = find_insn(file, special_alt->orig_sec, special_alt->orig_off); @@ -694,6 +729,10 @@ static int add_special_section_alts(struct objtool_file *file) goto out; } + /* Ignore retpoline alternatives. */ + if (orig_insn->ignore_alts) + continue; + new_insn = NULL; if (!special_alt->group || special_alt->new_len) { new_insn = find_insn(file, special_alt->new_sec, @@ -719,6 +758,13 @@ static int add_special_section_alts(struct objtool_file *file) goto out; } + alt = malloc(sizeof(*alt)); + if (!alt) { + WARN("malloc failed"); + ret = -1; + goto out; + } + alt->insn = new_insn; list_add_tail(&alt->list, &orig_insn->alts); @@ -1035,6 +1081,10 @@ static int decode_sections(struct objtool_file *file) add_ignores(file); + ret = add_nospec_ignores(file); + if (ret) + return ret; + ret = add_jump_destinations(file); if (ret) return ret; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 47d9ea70a83d..dbadb304a410 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -44,7 +44,7 @@ struct instruction { unsigned int len; unsigned char type; unsigned long immediate; - bool alt_group, visited, dead_end, ignore, hint, save, restore; + bool alt_group, visited, dead_end, ignore, hint, save, restore, ignore_alts; struct symbol *call_dest; struct instruction *jump_dest; struct list_head alts; -- cgit v1.2.3 From 76b043848fd22dbf7f8bf3a1452f8c70d557b860 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:25 +0000 Subject: x86/retpoline: Add initial retpoline support Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide the corresponding thunks. Provide assembler macros for invoking the thunks in the same way that GCC does, from native and inline assembler. This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In some circumstances, IBRS microcode features may be used instead, and the retpoline can be disabled. On AMD CPUs if lfence is serialising, the retpoline can be dramatically simplified to a simple "lfence; jmp *\reg". A future patch, after it has been verified that lfence really is serialising in all circumstances, can enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition to X86_FEATURE_RETPOLINE. Do not align the retpoline in the altinstr section, because there is no guarantee that it stays aligned when it's copied over the oldinstr during alternative patching. [ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks] [ tglx: Put actual function CALL/JMP in front of the macros, convert to symbolic labels ] [ dwmw2: Convert back to numeric labels, merge objtool fixes ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-4-git-send-email-dwmw@amazon.co.uk --- arch/x86/Kconfig | 13 ++++ arch/x86/Makefile | 10 +++ arch/x86/include/asm/asm-prototypes.h | 25 +++++++ arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/nospec-branch.h | 128 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 4 ++ arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 48 +++++++++++++ 8 files changed, 231 insertions(+) create mode 100644 arch/x86/include/asm/nospec-branch.h create mode 100644 arch/x86/lib/retpoline.S diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e23d21ac745a..d1819161cc6c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -429,6 +429,19 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH +config RETPOLINE + bool "Avoid speculative indirect branches in kernel" + default y + help + Compile kernel with the retpoline compiler options to guard against + kernel-to-user data leaks by avoiding speculative indirect + branches. Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. + + Without compiler support, at least indirect branches in assembler + code are eliminated. Since this includes the syscall entry path, + it is not entirely pointless. + config INTEL_RDT bool "Intel Resource Director Technology support" default n diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a20eacd9c7e9..974c61864978 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -235,6 +235,16 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +# Avoid indirect branches in kernel to deal with Spectre +ifdef CONFIG_RETPOLINE + RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) + ifneq ($(RETPOLINE_CFLAGS),) + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE + else + $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) + endif +endif + archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index ff700d81e91e..0927cdc4f946 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -11,7 +11,32 @@ #include #include #include +#include #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); #endif + +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_32 +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); +#else +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); +INDIRECT_THUNK(8) +INDIRECT_THUNK(9) +INDIRECT_THUNK(10) +INDIRECT_THUNK(11) +INDIRECT_THUNK(12) +INDIRECT_THUNK(13) +INDIRECT_THUNK(14) +INDIRECT_THUNK(15) +#endif +INDIRECT_THUNK(ax) +INDIRECT_THUNK(bx) +INDIRECT_THUNK(cx) +INDIRECT_THUNK(dx) +INDIRECT_THUNK(si) +INDIRECT_THUNK(di) +INDIRECT_THUNK(bp) +INDIRECT_THUNK(sp) +#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1641c2f96363..f275447862f4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -203,6 +203,8 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h new file mode 100644 index 000000000000..e20e92ef2ca8 --- /dev/null +++ b/arch/x86/include/asm/nospec-branch.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NOSPEC_BRANCH_H__ +#define __NOSPEC_BRANCH_H__ + +#include +#include +#include + +#ifdef __ASSEMBLY__ + +/* + * This should be used immediately before a retpoline alternative. It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +.macro ANNOTATE_NOSPEC_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.nospec + .long .Lannotate_\@ - . + .popsection +.endm + +/* + * These are the bare retpoline primitives for indirect jmp and call. + * Do not use these directly; they only exist to make the ALTERNATIVE + * invocation below less ugly. + */ +.macro RETPOLINE_JMP reg:req + call .Ldo_rop_\@ +.Lspec_trap_\@: + pause + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov \reg, (%_ASM_SP) + ret +.endm + +/* + * This is a wrapper around RETPOLINE_JMP so the called function in reg + * returns to the instruction after the macro. + */ +.macro RETPOLINE_CALL reg:req + jmp .Ldo_call_\@ +.Ldo_retpoline_jmp_\@: + RETPOLINE_JMP \reg +.Ldo_call_\@: + call .Ldo_retpoline_jmp_\@ +.endm + +/* + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple + * indirect jmp/call which may be susceptible to the Spectre variant 2 + * attack. + */ +.macro JMP_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE_2 __stringify(jmp *\reg), \ + __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD +#else + jmp *\reg +#endif +.endm + +.macro CALL_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE_2 __stringify(call *\reg), \ + __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD +#else + call *\reg +#endif +.endm + +#else /* __ASSEMBLY__ */ + +#define ANNOTATE_NOSPEC_ALTERNATIVE \ + "999:\n\t" \ + ".pushsection .discard.nospec\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" + +#if defined(CONFIG_X86_64) && defined(RETPOLINE) + +/* + * Since the inline asm uses the %V modifier which is only in newer GCC, + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + */ +# define CALL_NOSPEC \ + ANNOTATE_NOSPEC_ALTERNATIVE \ + ALTERNATIVE( \ + "call *%[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ + X86_FEATURE_RETPOLINE) +# define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +/* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. + */ +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ + " jmp 904f;\n" \ + " .align 16\n" \ + "901: call 903f;\n" \ + "902: pause;\n" \ + " jmp 902b;\n" \ + " .align 16\n" \ + "903: addl $4, %%esp;\n" \ + " pushl %[thunk_target];\n" \ + " ret;\n" \ + " .align 16\n" \ + "904: call 901b;\n", \ + X86_FEATURE_RETPOLINE) + +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#else /* No retpoline */ +# define CALL_NOSPEC "call *%[thunk_target]\n" +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 372ba3fb400f..7a671d1ae3cb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -905,6 +905,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); +#ifdef CONFIG_RETPOLINE + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +#endif + fpu__init_system(c); #ifdef CONFIG_X86_32 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 457f681ef379..d435c89875c1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +lib-$(CONFIG_RETPOLINE) += retpoline.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S new file mode 100644 index 000000000000..cb45c6cb465f --- /dev/null +++ b/arch/x86/lib/retpoline.S @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include +#include +#include +#include + +.macro THUNK reg + .section .text.__x86.indirect_thunk.\reg + +ENTRY(__x86_indirect_thunk_\reg) + CFI_STARTPROC + JMP_NOSPEC %\reg + CFI_ENDPROC +ENDPROC(__x86_indirect_thunk_\reg) +.endm + +/* + * Despite being an assembler file we can't just use .irp here + * because __KSYM_DEPS__ only uses the C preprocessor and would + * only see one instance of "__x86_indirect_thunk_\reg" rather + * than one per register with the correct names. So we do it + * the simple and nasty way... + */ +#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) + +GENERATE_THUNK(_ASM_AX) +GENERATE_THUNK(_ASM_BX) +GENERATE_THUNK(_ASM_CX) +GENERATE_THUNK(_ASM_DX) +GENERATE_THUNK(_ASM_SI) +GENERATE_THUNK(_ASM_DI) +GENERATE_THUNK(_ASM_BP) +GENERATE_THUNK(_ASM_SP) +#ifdef CONFIG_64BIT +GENERATE_THUNK(r8) +GENERATE_THUNK(r9) +GENERATE_THUNK(r10) +GENERATE_THUNK(r11) +GENERATE_THUNK(r12) +GENERATE_THUNK(r13) +GENERATE_THUNK(r14) +GENERATE_THUNK(r15) +#endif -- cgit v1.2.3 From da285121560e769cc31797bba6422eea71d473e0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:26 +0000 Subject: x86/spectre: Add boot time option to select Spectre v2 mitigation Add a spectre_v2= option to select the mitigation used for the indirect branch speculation vulnerability. Currently, the only option available is retpoline, in its various forms. This will be expanded to cover the new IBRS/IBPB microcode features. The RETPOLINE_AMD feature relies on a serializing LFENCE for speculation control. For AMD hardware, only set RETPOLINE_AMD if LFENCE is a serializing instruction, which is indicated by the LFENCE_RDTSC feature. [ tglx: Folded back the LFENCE/AMD fixes and reworked it so IBRS integration becomes simple ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-5-git-send-email-dwmw@amazon.co.uk --- Documentation/admin-guide/kernel-parameters.txt | 28 +++++ arch/x86/include/asm/nospec-branch.h | 10 ++ arch/x86/kernel/cpu/bugs.c | 158 +++++++++++++++++++++++- arch/x86/kernel/cpu/common.c | 4 - 4 files changed, 195 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 905991745d26..8122b5f98ea1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2599,6 +2599,11 @@ nosmt [KNL,S390] Disable symmetric multithreading (SMT). Equivalent to smt=1. + nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2 + (indirect branch prediction) vulnerability. System may + allow data leaks with this option, which is equivalent + to spectre_v2=off. + noxsave [BUGS=X86] Disables x86 extended register state save and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state. @@ -3908,6 +3913,29 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt + spectre_v2= [X86] Control mitigation of Spectre variant 2 + (indirect branch speculation) vulnerability. + + on - unconditionally enable + off - unconditionally disable + auto - kernel detects whether your CPU model is + vulnerable + + Selecting 'on' will, and 'auto' may, choose a + mitigation method at run time according to the + CPU, the available microcode, the setting of the + CONFIG_RETPOLINE configuration option, and the + compiler with which the kernel was built. + + Specific mitigations can also be selected manually: + + retpoline - replace indirect branches + retpoline,generic - google's original retpoline + retpoline,amd - AMD-specific minimal thunk + + Not specifying this option is equivalent to + spectre_v2=auto. + spia_io_base= [HW,MTD] spia_fio_base= spia_pedr= diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e20e92ef2ca8..ea034fa6e261 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -124,5 +124,15 @@ # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif +/* The Spectre V2 mitigation variants */ +enum spectre_v2_mitigation { + SPECTRE_V2_NONE, + SPECTRE_V2_RETPOLINE_MINIMAL, + SPECTRE_V2_RETPOLINE_MINIMAL_AMD, + SPECTRE_V2_RETPOLINE_GENERIC, + SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS, +}; + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 76ad6cb44b40..e4dc26185aa7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,6 +11,9 @@ #include #include #include + +#include +#include #include #include #include @@ -21,6 +24,8 @@ #include #include +static void __init spectre_v2_select_mitigation(void); + void __init check_bugs(void) { identify_boot_cpu(); @@ -30,6 +35,9 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* Select the proper spectre mitigation before patching alternatives */ + spectre_v2_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -62,6 +70,153 @@ void __init check_bugs(void) #endif } +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +static void __init spec2_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static inline bool retp_compiler(void) +{ + return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + char arg[20]; + int ret; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, + sizeof(arg)); + if (ret > 0) { + if (match_option(arg, ret, "off")) { + goto disable; + } else if (match_option(arg, ret, "on")) { + spec2_print_if_secure("force enabled on command line."); + return SPECTRE_V2_CMD_FORCE; + } else if (match_option(arg, ret, "retpoline")) { + spec2_print_if_insecure("retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE; + } else if (match_option(arg, ret, "retpoline,amd")) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + spec2_print_if_insecure("AMD retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_AMD; + } else if (match_option(arg, ret, "retpoline,generic")) { + spec2_print_if_insecure("generic retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (match_option(arg, ret, "auto")) { + return SPECTRE_V2_CMD_AUTO; + } + } + + if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_AUTO; +disable: + spec2_print_if_insecure("disabled on command line."); + return SPECTRE_V2_CMD_NONE; +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. + */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + /* FALLTRHU */ + case SPECTRE_V2_CMD_AUTO: + goto retpoline_auto; + + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; + break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_generic; + break; + case SPECTRE_V2_CMD_RETPOLINE: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; + } + pr_err("kernel not compiled with retpoline; no mitigation available!"); + return; + +retpoline_auto: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + retpoline_amd: + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing. Switching to generic retpoline\n"); + goto retpoline_generic; + } + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : + SPECTRE_V2_RETPOLINE_MINIMAL_AMD; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } else { + retpoline_generic: + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : + SPECTRE_V2_RETPOLINE_MINIMAL; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); +} + +#undef pr_fmt + #ifdef CONFIG_SYSFS ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) @@ -86,6 +241,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Vulnerable\n"); + + return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7a671d1ae3cb..372ba3fb400f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -905,10 +905,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); -#ifdef CONFIG_RETPOLINE - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); -#endif - fpu__init_system(c); #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 9697fa39efd3fc3692f2949d4045f393ec58450b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:27 +0000 Subject: x86/retpoline/crypto: Convert crypto assembler indirect jumps Convert all indirect jumps in crypto assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-6-git-send-email-dwmw@amazon.co.uk --- arch/x86/crypto/aesni-intel_asm.S | 5 +++-- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 3 ++- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 3 ++- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 16627fec80b2..3d09e3aca18d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,6 +32,7 @@ #include #include #include +#include /* * The following macros are used to move an (un)aligned 16 byte value to/from @@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8) _aesni_gf128mul_x_ble() movups IV, (IVP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index f7c495e2863c..a14af6eb09cb 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -17,6 +17,7 @@ #include #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way: vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index eee5b3982cfd..b66bbfa62f50 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -12,6 +12,7 @@ #include #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way: vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 7a7de27c6f41..d9b734d0c8cc 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -45,6 +45,7 @@ #include #include +#include ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction @@ -172,7 +173,7 @@ continue_block: movzxw (bufp, %rax, 2), len lea crc_array(%rip), bufp lea (bufp, len, 1), bufp - jmp *bufp + JMP_NOSPEC bufp ################################################################ ## 2a) PROCESS FULL BLOCKS: -- cgit v1.2.3 From 2641f08bb7fc63a636a2b18173221d7040a3512e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:28 +0000 Subject: x86/retpoline/entry: Convert entry assembler indirect jumps Convert indirect jumps in core 32/64bit entry assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return address after the 'call' instruction must be *precisely* at the .Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work, and the use of alternatives will mess that up unless we play horrid games to prepend with NOPs and make the variants the same length. It's not worth it; in the case where we ALTERNATIVE out the retpoline, the first instruction at __x86.indirect_thunk.rax is going to be a bare jmp *%rax anyway. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-7-git-send-email-dwmw@amazon.co.uk --- arch/x86/entry/entry_32.S | 5 +++-- arch/x86/entry/entry_64.S | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ace8f321a5a1..a1f28a54f23a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,6 +44,7 @@ #include #include #include +#include .section .entry.text, "ax" @@ -290,7 +291,7 @@ ENTRY(ret_from_fork) /* kernel thread */ 1: movl %edi, %eax - call *%ebx + CALL_NOSPEC %ebx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -919,7 +920,7 @@ common_exception: movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call *%edi + CALL_NOSPEC %edi jmp ret_from_exception END(common_exception) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ed31d00dc5ee..59874bc1aed2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "calling.h" @@ -187,7 +188,7 @@ ENTRY(entry_SYSCALL_64_trampoline) */ pushq %rdi movq $entry_SYSCALL_64_stage2, %rdi - jmp *%rdi + JMP_NOSPEC %rdi END(entry_SYSCALL_64_trampoline) .popsection @@ -266,7 +267,12 @@ entry_SYSCALL_64_fastpath: * It might end up jumping to the slow path. If it jumps, RAX * and all argument registers are clobbered. */ +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif .Lentry_SYSCALL_64_after_fastpath_call: movq %rax, RAX(%rsp) @@ -438,7 +444,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - jmp *%rax /* Called from C */ + JMP_NOSPEC %rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func @@ -517,7 +523,7 @@ ENTRY(ret_from_fork) 1: /* kernel thread */ movq %r12, %rdi - call *%rbx + CALL_NOSPEC %rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() -- cgit v1.2.3 From 9351803bd803cdbeb9b5a7850b7b6f464806e3db Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:29 +0000 Subject: x86/retpoline/ftrace: Convert ftrace assembler indirect jumps Convert all indirect jumps in ftrace assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-8-git-send-email-dwmw@amazon.co.uk --- arch/x86/kernel/ftrace_32.S | 6 ++++-- arch/x86/kernel/ftrace_64.S | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index b6c6468e10bc..4c8440de3355 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CC_USING_FENTRY # define function_hook __fentry__ @@ -197,7 +198,8 @@ ftrace_stub: movl 0x4(%ebp), %edx subl $MCOUNT_INSN_SIZE, %eax - call *ftrace_trace_function + movl ftrace_trace_function, %ecx + CALL_NOSPEC %ecx popl %edx popl %ecx @@ -241,5 +243,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - jmp *%ecx + JMP_NOSPEC %ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index c832291d948a..7cb8ba08beb9 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -7,7 +7,7 @@ #include #include #include - +#include .code64 .section .entry.text, "ax" @@ -286,8 +286,8 @@ trace: * ip and parent ip are used and the list function is called when * function tracing is enabled. */ - call *ftrace_trace_function - + movq ftrace_trace_function, %r8 + CALL_NOSPEC %r8 restore_mcount_regs jmp fgraph_trace @@ -329,5 +329,5 @@ GLOBAL(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - jmp *%rdi + JMP_NOSPEC %rdi #endif -- cgit v1.2.3 From e70e5892b28c18f517f29ab6e83bd57705104b31 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:30 +0000 Subject: x86/retpoline/hyperv: Convert assembler indirect jumps Convert all indirect jumps in hyperv inline asm code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-9-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/mshyperv.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 581bb54dd464..5119e4b555cc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@ #include #include #include +#include /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent @@ -186,10 +187,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return U64_MAX; __asm__ __volatile__("mov %4, %%r8\n" - "call *%5" + CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address), "m" (hv_hypercall_pg) + : "r" (output_address), + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); #else u32 input_address_hi = upper_32_bits(input_address); @@ -200,13 +202,13 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) if (!hv_hypercall_pg) return U64_MAX; - __asm__ __volatile__("call *%7" + __asm__ __volatile__(CALL_NOSPEC : "=A" (hv_status), "+c" (input_address_lo), ASM_CALL_CONSTRAINT : "A" (control), "b" (input_address_hi), "D"(output_address_hi), "S"(output_address_lo), - "m" (hv_hypercall_pg) + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory"); #endif /* !x86_64 */ return hv_status; @@ -227,10 +229,10 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) #ifdef CONFIG_X86_64 { - __asm__ __volatile__("call *%4" + __asm__ __volatile__(CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "m" (hv_hypercall_pg) + : THUNK_TARGET(hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); } #else @@ -238,13 +240,13 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) u32 input1_hi = upper_32_bits(input1); u32 input1_lo = lower_32_bits(input1); - __asm__ __volatile__ ("call *%5" + __asm__ __volatile__ (CALL_NOSPEC : "=A"(hv_status), "+c"(input1_lo), ASM_CALL_CONSTRAINT : "A" (control), "b" (input1_hi), - "m" (hv_hypercall_pg) + THUNK_TARGET(hv_hypercall_pg) : "cc", "edi", "esi"); } #endif -- cgit v1.2.3 From ea08816d5b185ab3d09e95e393f265af54560350 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:31 +0000 Subject: x86/retpoline/xen: Convert Xen hypercall indirect jumps Convert indirect call in Xen hypercall to use non-speculative sequence, when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Reviewed-by: Juergen Gross Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-10-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/xen/hypercall.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7cb282e9e587..bfd882617613 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -217,9 +218,9 @@ privcmd_call(unsigned call, __HYPERCALL_5ARG(a1, a2, a3, a4, a5); stac(); - asm volatile("call *%[call]" + asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM - : [call] "a" (&hypercall_page[call]) + : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); clac(); -- cgit v1.2.3 From 5096732f6f695001fa2d6f1335a2680b37912c69 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:32 +0000 Subject: x86/retpoline/checksum32: Convert assembler indirect jumps Convert all indirect jumps in 32bit checksum assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-11-git-send-email-dwmw@amazon.co.uk --- arch/x86/lib/checksum_32.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4d34bb548b41..46e71a74e612 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -29,7 +29,8 @@ #include #include #include - +#include + /* * computes a partial checksum, e.g. for TCP/UDP fragments */ @@ -156,7 +157,7 @@ ENTRY(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) -- cgit v1.2.3 From 7614e913db1f40fff819b36216484dc3808995d4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 11 Jan 2018 21:46:33 +0000 Subject: x86/retpoline/irq32: Convert assembler indirect jumps Convert all indirect jumps in 32bit irq inline asm code to use non speculative sequences. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-12-git-send-email-dwmw@amazon.co.uk --- arch/x86/kernel/irq_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a83b3346a0e1..c1bdbd3d3232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -20,6 +20,7 @@ #include #include +#include #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack); static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=b" (stack) : "0" (stack), - "D"(func) + [thunk_target] "D"(func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) call_on_stack(print_stack_overflow, isp); asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=a" (arg1), "=b" (isp) : "0" (desc), "1" (isp), - "D" (desc->handle_irq) + [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; } -- cgit v1.2.3 From 05e0cc84e00c54fb152d1f4b86bc211823a83d0c Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 4 Jan 2018 04:35:51 +0200 Subject: net/mlx5: Fix get vector affinity helper function mlx5_get_vector_affinity used to call pci_irq_get_affinity and after reverting the patch that sets the device affinity via PCI_IRQ_AFFINITY API, calling pci_irq_get_affinity becomes useless and it breaks RDMA mlx5 users. To fix this, this patch provides an alternative way to retrieve IRQ vector affinity using legacy IRQ API, following smp_affinity read procfs implementation. Fixes: 231243c82793 ("Revert mlx5: move affinity hints assignments to generic code") Fixes: a435393acafb ("mlx5: move affinity hints assignments to generic code") Cc: Sagi Grimberg Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1f509d072026..a0610427e168 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -1231,7 +1232,23 @@ enum { static inline const struct cpumask * mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector) { - return pci_irq_get_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + vector); + const struct cpumask *mask; + struct irq_desc *desc; + unsigned int irq; + int eqn; + int err; + + err = mlx5_vector2eqn(dev, vector, &eqn, &irq); + if (err) + return NULL; + + desc = irq_to_desc(irq); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + mask = irq_data_get_effective_affinity_mask(&desc->irq_data); +#else + mask = desc->irq_common_data.affinity; +#endif + return mask; } #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From b6908c296021a99ba2a83a4b4703eb9e6365e5dc Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Thu, 14 Dec 2017 19:23:50 +0200 Subject: net/mlx5: Fix memory leak in bad flow of mlx5_alloc_irq_vectors Fix a memory leak where in case that pci_alloc_irq_vectors failed, priv->irq_info was not released. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Alaa Hleihel Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 95e188d0883e..a4c82fa71aec 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -319,6 +319,7 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) struct mlx5_eq_table *table = &priv->eq_table; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; + int err; nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; @@ -328,21 +329,23 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) priv->irq_info = kcalloc(nvec, sizeof(*priv->irq_info), GFP_KERNEL); if (!priv->irq_info) - goto err_free_msix; + return -ENOMEM; nvec = pci_alloc_irq_vectors(dev->pdev, MLX5_EQ_VEC_COMP_BASE + 1, nvec, PCI_IRQ_MSIX); - if (nvec < 0) - return nvec; + if (nvec < 0) { + err = nvec; + goto err_free_irq_info; + } table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE; return 0; -err_free_msix: +err_free_irq_info: kfree(priv->irq_info); - return -ENOMEM; + return err; } static void mlx5_free_irq_vectors(struct mlx5_core_dev *dev) -- cgit v1.2.3 From 72f36be06138bdc11bdbe1f04e4a3e2637ea438d Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 20 Nov 2017 09:58:01 +0200 Subject: net/mlx5: Fix mlx5_get_uars_page to return error code Change mlx5_get_uars_page to return ERR_PTR in case of allocation failure. Change all callers accordingly to check the IS_ERR(ptr) instead of NULL. Fixes: 59211bd3b632 ("net/mlx5: Split the load/unload flow into hardware and software flows") Signed-off-by: Eran Ben Elisha Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/uar.c | 14 ++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 00cb184fa027..262c1aa2e028 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4160,7 +4160,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) goto err_cnt; dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); - if (!dev->mdev->priv.uar) + if (IS_ERR(dev->mdev->priv.uar)) goto err_cong; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a4c82fa71aec..6dffa58fb178 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1135,8 +1135,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, } dev->priv.uar = mlx5_get_uars_page(dev); - if (!dev->priv.uar) { + if (IS_ERR(dev->priv.uar)) { dev_err(&pdev->dev, "Failed allocating uar, aborting\n"); + err = PTR_ERR(dev->priv.uar); goto err_disable_msix; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index 222b25908d01..8b97066dd1f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -168,18 +168,16 @@ struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev) struct mlx5_uars_page *ret; mutex_lock(&mdev->priv.bfregs.reg_head.lock); - if (list_empty(&mdev->priv.bfregs.reg_head.list)) { - ret = alloc_uars_page(mdev, false); - if (IS_ERR(ret)) { - ret = NULL; - goto out; - } - list_add(&ret->list, &mdev->priv.bfregs.reg_head.list); - } else { + if (!list_empty(&mdev->priv.bfregs.reg_head.list)) { ret = list_first_entry(&mdev->priv.bfregs.reg_head.list, struct mlx5_uars_page, list); kref_get(&ret->ref_count); + goto out; } + ret = alloc_uars_page(mdev, false); + if (IS_ERR(ret)) + goto out; + list_add(&ret->list, &mdev->priv.bfregs.reg_head.list); out: mutex_unlock(&mdev->priv.bfregs.reg_head.lock); -- cgit v1.2.3 From 259bbc575c5322e0bc675c9a77e937250723c333 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 31 Dec 2017 11:31:34 +0200 Subject: net/mlx5: Fix error handling in load one We didn't store the result of mlx5_init_once, due to that mlx5_load_one returned success on error. Fix that. Fixes: 59211bd3b632 ("net/mlx5: Split the load/unload flow into hardware and software flows") Signed-off-by: Maor Gottlieb Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 6dffa58fb178..0f88fd30a09a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1123,9 +1123,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_poll; } - if (boot && mlx5_init_once(dev, priv)) { - dev_err(&pdev->dev, "sw objs init failed\n"); - goto err_stop_poll; + if (boot) { + err = mlx5_init_once(dev, priv); + if (err) { + dev_err(&pdev->dev, "sw objs init failed\n"); + goto err_stop_poll; + } } err = mlx5_alloc_irq_vectors(dev); -- cgit v1.2.3 From e556f6dd47eda62cbb046fa92e03265245a1537f Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 26 Dec 2017 13:44:49 +0200 Subject: net/mlx5e: Keep updating ethtool statistics when the interface is down ethtool statistics should be updated even when the interface is down since it shows more than just netdev counters, which might change while the logical link is down. One useful use case, for example, is when running RoCE traffic over the interface (while the logical link is down, but physical link is up) and examining rx_prioX_bytes. Fixes: f62b8bb8f2d3 ("net/mlx5: Extend mlx5_core to support ConnectX-4 Ethernet functionality") Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 8f05efa5c829..ea5fff2c3143 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -207,8 +207,7 @@ void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv, return; mutex_lock(&priv->state_lock); - if (test_bit(MLX5E_STATE_OPENED, &priv->state)) - mlx5e_update_stats(priv, true); + mlx5e_update_stats(priv, true); mutex_unlock(&priv->state_lock); for (i = 0; i < mlx5e_num_stats_grps; i++) -- cgit v1.2.3 From 97c8c3aa48ca8eb85d1806e08f882f90d78b1856 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 10 Oct 2017 16:51:44 +0300 Subject: net/mlx5e: Add error print in ETS init ETS initialization might fail, add a print to indicate such failures. Fixes: 08fb1dacdd76 ("net/mlx5e: Support DCBNL IEEE ETS") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 9bcf38f4123b..a5c5134f5cb2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -922,8 +922,9 @@ static void mlx5e_dcbnl_query_dcbx_mode(struct mlx5e_priv *priv, static void mlx5e_ets_init(struct mlx5e_priv *priv) { - int i; struct ieee_ets ets; + int err; + int i; if (!MLX5_CAP_GEN(priv->mdev, ets)) return; @@ -940,7 +941,10 @@ static void mlx5e_ets_init(struct mlx5e_priv *priv) ets.prio_tc[0] = 1; ets.prio_tc[1] = 0; - mlx5e_dcbnl_ieee_setets_core(priv, &ets); + err = mlx5e_dcbnl_ieee_setets_core(priv, &ets); + if (err) + netdev_err(priv->netdev, + "%s, Failed to init ETS: %d\n", __func__, err); } enum { -- cgit v1.2.3 From 4b7d4363f14a0398eca48c7e96e46120c5eb6a96 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 10 Oct 2017 16:54:30 +0300 Subject: net/mlx5e: Check support before TC swap in ETS init Should not do the following swap between TCs 0 and 1 when max num of TCs is 1: tclass[prio=0]=1, tclass[prio=1]=0, tclass[prio=i]=i (for i>1) Fixes: 08fb1dacdd76 ("net/mlx5e: Support DCBNL IEEE ETS") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index a5c5134f5cb2..3d46ef48d5b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -937,9 +937,11 @@ static void mlx5e_ets_init(struct mlx5e_priv *priv) ets.prio_tc[i] = i; } - /* tclass[prio=0]=1, tclass[prio=1]=0, tclass[prio=i]=i (for i>1) */ - ets.prio_tc[0] = 1; - ets.prio_tc[1] = 0; + if (ets.ets_cap > 1) { + /* tclass[prio=0]=1, tclass[prio=1]=0, tclass[prio=i]=i (for i>1) */ + ets.prio_tc[0] = 1; + ets.prio_tc[1] = 0; + } err = mlx5e_dcbnl_ieee_setets_core(priv, &ets); if (err) -- cgit v1.2.3 From 75b81ce719b79565eb0b39aa9954b6e11a5e73bf Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 10 Jan 2018 17:11:11 +0200 Subject: net/mlx5e: Don't override netdev features field unless in error flow Set features function sets dev->features in order to keep track of which features were successfully changed and which weren't (in case the user asks for more than one change in a single command). This breaks the logic in __netdev_update_features which assumes that dev->features is not changed on success and checks for diffs between features and dev->features (diffs that might not exist at this point because of the driver override). The solution is to keep track of successful/failed feature changes and assign them to dev->features in case of failure only. Fixes: 0e405443e803 ("net/mlx5e: Improve set features ndo resiliency") Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 43 +++++++++++++---------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d9d8227f195f..311d5ec8407c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3219,12 +3219,12 @@ static int mlx5e_set_mac(struct net_device *netdev, void *addr) return 0; } -#define MLX5E_SET_FEATURE(netdev, feature, enable) \ +#define MLX5E_SET_FEATURE(features, feature, enable) \ do { \ if (enable) \ - netdev->features |= feature; \ + *features |= feature; \ else \ - netdev->features &= ~feature; \ + *features &= ~feature; \ } while (0) typedef int (*mlx5e_feature_handler)(struct net_device *netdev, bool enable); @@ -3347,6 +3347,7 @@ static int set_feature_arfs(struct net_device *netdev, bool enable) #endif static int mlx5e_handle_feature(struct net_device *netdev, + netdev_features_t *features, netdev_features_t wanted_features, netdev_features_t feature, mlx5e_feature_handler feature_handler) @@ -3365,34 +3366,40 @@ static int mlx5e_handle_feature(struct net_device *netdev, return err; } - MLX5E_SET_FEATURE(netdev, feature, enable); + MLX5E_SET_FEATURE(features, feature, enable); return 0; } static int mlx5e_set_features(struct net_device *netdev, netdev_features_t features) { + netdev_features_t oper_features = netdev->features; int err; - err = mlx5e_handle_feature(netdev, features, NETIF_F_LRO, - set_feature_lro); - err |= mlx5e_handle_feature(netdev, features, + err = mlx5e_handle_feature(netdev, &oper_features, features, + NETIF_F_LRO, set_feature_lro); + err |= mlx5e_handle_feature(netdev, &oper_features, features, NETIF_F_HW_VLAN_CTAG_FILTER, set_feature_cvlan_filter); - err |= mlx5e_handle_feature(netdev, features, NETIF_F_HW_TC, - set_feature_tc_num_filters); - err |= mlx5e_handle_feature(netdev, features, NETIF_F_RXALL, - set_feature_rx_all); - err |= mlx5e_handle_feature(netdev, features, NETIF_F_RXFCS, - set_feature_rx_fcs); - err |= mlx5e_handle_feature(netdev, features, NETIF_F_HW_VLAN_CTAG_RX, - set_feature_rx_vlan); + err |= mlx5e_handle_feature(netdev, &oper_features, features, + NETIF_F_HW_TC, set_feature_tc_num_filters); + err |= mlx5e_handle_feature(netdev, &oper_features, features, + NETIF_F_RXALL, set_feature_rx_all); + err |= mlx5e_handle_feature(netdev, &oper_features, features, + NETIF_F_RXFCS, set_feature_rx_fcs); + err |= mlx5e_handle_feature(netdev, &oper_features, features, + NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan); #ifdef CONFIG_RFS_ACCEL - err |= mlx5e_handle_feature(netdev, features, NETIF_F_NTUPLE, - set_feature_arfs); + err |= mlx5e_handle_feature(netdev, &oper_features, features, + NETIF_F_NTUPLE, set_feature_arfs); #endif - return err ? -EINVAL : 0; + if (err) { + netdev->features = oper_features; + return -EINVAL; + } + + return 0; } static netdev_features_t mlx5e_fix_features(struct net_device *netdev, -- cgit v1.2.3 From afc98a0b46d8576a55f18092400cc518d03a79a1 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 3 Jan 2018 17:23:55 +0200 Subject: net/mlx5: Update ptp_clock_event foreach PPS event PPS event did not update ptp_clock_event fields, therefore, timestamp value was not updated correctly. This fix updates the event source and the timestamp value for each PPS event. Fixes: 7c39afb394c7 ("net/mlx5: PTP code migration to driver core section") Signed-off-by: Feras Daoud Reported-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index fa8aed62b231..5701f125e99c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -423,9 +423,13 @@ void mlx5_pps_event(struct mlx5_core_dev *mdev, switch (clock->ptp_info.pin_config[pin].func) { case PTP_PF_EXTTS: + ptp_event.index = pin; + ptp_event.timestamp = timecounter_cyc2time(&clock->tc, + be64_to_cpu(eqe->data.pps.time_stamp)); if (clock->pps_info.enabled) { ptp_event.type = PTP_CLOCK_PPSUSR; - ptp_event.pps_times.ts_real = ns_to_timespec64(eqe->data.pps.time_stamp); + ptp_event.pps_times.ts_real = + ns_to_timespec64(ptp_event.timestamp); } else { ptp_event.type = PTP_CLOCK_EXTTS; } -- cgit v1.2.3 From 237f258c42c905f71c694670fe4d9773d85c36ed Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 8 Jan 2018 10:01:04 +0200 Subject: net/mlx5e: Remove timestamp set from netdevice open flow To avoid configuration override, timestamp set call will be moved from the netdevice open flow to the init flow. By this, a close-open procedure will not override the timestamp configuration. In addition, the change will rename mlx5e_timestamp_set function to be mlx5e_timestamp_init. Fixes: ef9814deafd0 ("net/mlx5e: Add HW timestamping (TS) support") Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +++-- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 543060c305a0..c2d89bfa1a70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -895,7 +895,7 @@ int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto, u16 vid); void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv); void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv); -void mlx5e_timestamp_set(struct mlx5e_priv *priv); +void mlx5e_timestamp_init(struct mlx5e_priv *priv); struct mlx5e_redirect_rqt_param { bool is_rss; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 311d5ec8407c..d8aefeed124d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2669,7 +2669,7 @@ void mlx5e_switch_priv_channels(struct mlx5e_priv *priv, netif_carrier_on(netdev); } -void mlx5e_timestamp_set(struct mlx5e_priv *priv) +void mlx5e_timestamp_init(struct mlx5e_priv *priv) { priv->tstamp.tx_type = HWTSTAMP_TX_OFF; priv->tstamp.rx_filter = HWTSTAMP_FILTER_NONE; @@ -2690,7 +2690,6 @@ int mlx5e_open_locked(struct net_device *netdev) mlx5e_activate_priv_channels(priv); if (priv->profile->update_carrier) priv->profile->update_carrier(priv); - mlx5e_timestamp_set(priv); if (priv->profile->update_stats) queue_delayed_work(priv->wq, &priv->update_stats_work, 0); @@ -4146,6 +4145,8 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev, INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); INIT_WORK(&priv->tx_timeout_work, mlx5e_tx_timeout_work); INIT_DELAYED_WORK(&priv->update_stats_work, mlx5e_update_stats_work); + + mlx5e_timestamp_init(priv); } static void mlx5e_set_netdev_dev_addr(struct net_device *netdev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 2c43606c26b5..3409d86eb06b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -877,6 +877,8 @@ static void mlx5e_init_rep(struct mlx5_core_dev *mdev, mlx5e_build_rep_params(mdev, &priv->channels.params); mlx5e_build_rep_netdev(netdev); + + mlx5e_timestamp_init(priv); } static int mlx5e_init_rep_rx(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 8812d7208e8f..ee2f378c5030 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -86,6 +86,8 @@ void mlx5i_init(struct mlx5_core_dev *mdev, mlx5e_build_nic_params(mdev, &priv->channels.params, profile->max_nch(mdev)); mlx5i_build_nic_params(mdev, &priv->channels.params); + mlx5e_timestamp_init(priv); + /* netdev init */ netdev->hw_features |= NETIF_F_SG; netdev->hw_features |= NETIF_F_IP_CSUM; @@ -450,7 +452,6 @@ static int mlx5i_open(struct net_device *netdev) mlx5e_refresh_tirs(epriv, false); mlx5e_activate_priv_channels(epriv); - mlx5e_timestamp_set(epriv); mutex_unlock(&epriv->state_lock); return 0; -- cgit v1.2.3 From 117cc7a908c83697b0b737d15ae1eb5943afe35b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 11:11:27 +0000 Subject: x86/retpoline: Fill return stack buffer on vmexit In accordance with the Intel and AMD documentation, we need to overwrite all entries in the RSB on exiting a guest, to prevent malicious branch target predictions from affecting the host kernel. This is needed both for retpoline and for IBRS. [ak: numbers again for the RSB stuffing labels] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515755487-8524-1-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/nospec-branch.h | 78 +++++++++++++++++++++++++++++++++++- arch/x86/kvm/svm.c | 4 ++ arch/x86/kvm/vmx.c | 4 ++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ea034fa6e261..402a11c803c3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,6 +7,48 @@ #include #include +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * We define a CPP macro such that it can be used from both .S files and + * inline assembly. It's possible to do a .macro and then include that + * from C via asm(".include ") but let's not go there. + */ + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +/* + * Google experimented with loop-unrolling and this turned out to be + * the optimal version — two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr, sp) \ + mov $(nr/2), reg; \ +771: \ + call 772f; \ +773: /* speculation trap */ \ + pause; \ + jmp 773b; \ +772: \ + call 774f; \ +775: /* speculation trap */ \ + pause; \ + jmp 775b; \ +774: \ + dec reg; \ + jnz 771b; \ + add $(BITS_PER_LONG/8) * nr, sp; + #ifdef __ASSEMBLY__ /* @@ -74,6 +116,20 @@ #else call *\reg #endif +.endm + + /* + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP + * monstrosity above, manually. + */ +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ + \ftr +.Lskip_rsb_\@: +#endif .endm #else /* __ASSEMBLY__ */ @@ -119,7 +175,7 @@ X86_FEATURE_RETPOLINE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) -#else /* No retpoline */ +#else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -134,5 +190,25 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future + * CPUs with IBRS_ATT *might* it be avoided. + */ +static inline void vmexit_fill_RSB(void) +{ +#ifdef CONFIG_RETPOLINE + unsigned long loops = RSB_CLEAR_LOOPS / 2; + + asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE("jmp 910f", + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), + X86_FEATURE_RETPOLINE) + "910:" + : "=&r" (loops), ASM_CALL_CONSTRAINT + : "r" (loops) : "memory" ); +#endif +} #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0e68f0b3cbf7..2744b97345b8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "trace.h" @@ -4985,6 +4986,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 62ee4362e1c1..d1e25dba3112 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -9403,6 +9404,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (debugctlmsr) update_debugctlmsr(debugctlmsr); -- cgit v1.2.3 From 0dda0b3fb255048a221f736c8a2a24c674da8bf3 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 8 Dec 2017 17:43:18 -0800 Subject: apparmor: fix ptrace label match when matching stacked labels Given a label with a profile stack of A//&B or A//&C ... A ptrace rule should be able to specify a generic trace pattern with a rule like ptrace trace A//&**, however this is failing because while the correct label match routine is called, it is being done post label decomposition so it is always being done against a profile instead of the stacked label. To fix this refactor the cross check to pass the full peer label in to the label_match. Fixes: 290f458a4f16 ("apparmor: allow ptrace checks to be finer grained than just capability") Cc: Stable Reported-by: Matthew Garrett Tested-by: Matthew Garrett Signed-off-by: John Johansen --- security/apparmor/include/perms.h | 3 +++ security/apparmor/ipc.c | 53 +++++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 2b27bb79aec4..d7b7e7115160 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -133,6 +133,9 @@ extern struct aa_perms allperms; #define xcheck_labels_profiles(L1, L2, FN, args...) \ xcheck_ns_labels((L1), (L2), xcheck_ns_profile_label, (FN), args) +#define xcheck_labels(L1, L2, P, FN1, FN2) \ + xcheck(fn_for_each((L1), (P), (FN1)), fn_for_each((L2), (P), (FN2))) + void aa_perm_mask_to_str(char *str, const char *chrs, u32 mask); void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask); diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 7ca0032e7ba9..b40678f3c1d5 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -64,40 +64,48 @@ static void audit_ptrace_cb(struct audit_buffer *ab, void *va) FLAGS_NONE, GFP_ATOMIC); } +/* assumes check for PROFILE_MEDIATES is already done */ /* TODO: conditionals */ static int profile_ptrace_perm(struct aa_profile *profile, - struct aa_profile *peer, u32 request, - struct common_audit_data *sa) + struct aa_label *peer, u32 request, + struct common_audit_data *sa) { struct aa_perms perms = { }; - /* need because of peer in cross check */ - if (profile_unconfined(profile) || - !PROFILE_MEDIATES(profile, AA_CLASS_PTRACE)) - return 0; - - aad(sa)->peer = &peer->label; - aa_profile_match_label(profile, &peer->label, AA_CLASS_PTRACE, request, + aad(sa)->peer = peer; + aa_profile_match_label(profile, peer, AA_CLASS_PTRACE, request, &perms); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, sa, audit_ptrace_cb); } -static int cross_ptrace_perm(struct aa_profile *tracer, - struct aa_profile *tracee, u32 request, - struct common_audit_data *sa) +static int profile_tracee_perm(struct aa_profile *tracee, + struct aa_label *tracer, u32 request, + struct common_audit_data *sa) { + if (profile_unconfined(tracee) || unconfined(tracer) || + !PROFILE_MEDIATES(tracee, AA_CLASS_PTRACE)) + return 0; + + return profile_ptrace_perm(tracee, tracer, request, sa); +} + +static int profile_tracer_perm(struct aa_profile *tracer, + struct aa_label *tracee, u32 request, + struct common_audit_data *sa) +{ + if (profile_unconfined(tracer)) + return 0; + if (PROFILE_MEDIATES(tracer, AA_CLASS_PTRACE)) - return xcheck(profile_ptrace_perm(tracer, tracee, request, sa), - profile_ptrace_perm(tracee, tracer, - request << PTRACE_PERM_SHIFT, - sa)); - /* policy uses the old style capability check for ptrace */ - if (profile_unconfined(tracer) || tracer == tracee) + return profile_ptrace_perm(tracer, tracee, request, sa); + + /* profile uses the old style capability check for ptrace */ + if (&tracer->label == tracee) return 0; aad(sa)->label = &tracer->label; - aad(sa)->peer = &tracee->label; + aad(sa)->peer = tracee; aad(sa)->request = 0; aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE, 1); @@ -115,10 +123,13 @@ static int cross_ptrace_perm(struct aa_profile *tracer, int aa_may_ptrace(struct aa_label *tracer, struct aa_label *tracee, u32 request) { + struct aa_profile *profile; + u32 xrequest = request << PTRACE_PERM_SHIFT; DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_PTRACE); - return xcheck_labels_profiles(tracer, tracee, cross_ptrace_perm, - request, &sa); + return xcheck_labels(tracer, tracee, profile, + profile_tracer_perm(profile, tracee, request, &sa), + profile_tracee_perm(profile, tracer, xrequest, &sa)); } -- cgit v1.2.3 From 1a3881d305592d947ed47887306919d50112394d Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 11 Jan 2018 13:07:54 -0800 Subject: apparmor: Fix regression in profile conflict logic The intended behaviour in apparmor profile matching is to flag a conflict if two profiles match equally well. However, right now a conflict is generated if another profile has the same match length even if that profile doesn't actually match. Fix the logic so we only generate a conflict if the profiles match. Fixes: 844b8292b631 ("apparmor: ensure that undecidable profile attachments fail") Cc: Stable Signed-off-by: Matthew Garrett Signed-off-by: John Johansen --- security/apparmor/domain.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 04ba9d0718ea..6a54d2ffa840 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -330,10 +330,7 @@ static struct aa_profile *__attach_match(const char *name, continue; if (profile->xmatch) { - if (profile->xmatch_len == len) { - conflict = true; - continue; - } else if (profile->xmatch_len > len) { + if (profile->xmatch_len >= len) { unsigned int state; u32 perm; @@ -342,6 +339,10 @@ static struct aa_profile *__attach_match(const char *name, perm = dfa_user_allow(profile->xmatch, state); /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { + if (profile->xmatch_len == len) { + conflict = true; + continue; + } candidate = profile; len = profile->xmatch_len; conflict = false; -- cgit v1.2.3 From 352909b49ba0d74929b96af6dfbefc854ab6ebb5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Jan 2018 17:16:51 -0800 Subject: selftests/x86: Add test_vsyscall This tests that the vsyscall entries do what they're expected to do. It also confirms that attempts to read the vsyscall page behave as expected. If changes are made to the vsyscall code or its memory map handling, running this test in all three of vsyscall=none, vsyscall=emulate, and vsyscall=native are helpful. (Because it's easy, this also compares the vsyscall results to their vDSO equivalents.) Note to KAISER backporters: please test this under all three vsyscall modes. Also, in the emulate and native modes, make sure that test_vsyscall_64 agrees with the command line or config option as to which mode you're in. It's quite easy to mess up the kernel such that native mode accidentally emulates or vice versa. Greg, etc: please backport this to all your Meltdown-patched kernels. It'll help make sure the patches didn't regress vsyscalls. CSigned-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/2b9c5a174c1d60fd7774461d518aa75598b1d8fd.1515719552.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/test_vsyscall.c | 500 ++++++++++++++++++++++++++++ 2 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/test_vsyscall.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 7b1adeee4b0f..91fbfa8fdc15 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -7,7 +7,7 @@ include ../lib.mk TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \ - protection_keys test_vdso + protection_keys test_vdso test_vsyscall TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c new file mode 100644 index 000000000000..7a744fa7b786 --- /dev/null +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -0,0 +1,500 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __x86_64__ +# define VSYS(x) (x) +#else +# define VSYS(x) 0 +#endif + +#ifndef SYS_getcpu +# ifdef __x86_64__ +# define SYS_getcpu 309 +# else +# define SYS_getcpu 318 +# endif +#endif + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +/* vsyscalls and vDSO */ +bool should_read_vsyscall = false; + +typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); +gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000); +gtod_t vdso_gtod; + +typedef int (*vgettime_t)(clockid_t, struct timespec *); +vgettime_t vdso_gettime; + +typedef long (*time_func_t)(time_t *t); +time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400); +time_func_t vdso_time; + +typedef long (*getcpu_t)(unsigned *, unsigned *, void *); +getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800); +getcpu_t vdso_getcpu; + +static void init_vdso(void) +{ + void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) { + printf("[WARN]\tfailed to find vDSO\n"); + return; + } + + vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday"); + if (!vdso_gtod) + printf("[WARN]\tfailed to find gettimeofday in vDSO\n"); + + vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); + if (!vdso_gettime) + printf("[WARN]\tfailed to find clock_gettime in vDSO\n"); + + vdso_time = (time_func_t)dlsym(vdso, "__vdso_time"); + if (!vdso_time) + printf("[WARN]\tfailed to find time in vDSO\n"); + + vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu"); + if (!vdso_getcpu) { + /* getcpu() was never wired up in the 32-bit vDSO. */ + printf("[%s]\tfailed to find getcpu in vDSO\n", + sizeof(long) == 8 ? "WARN" : "NOTE"); + } +} + +static int init_vsys(void) +{ +#ifdef __x86_64__ + int nerrs = 0; + FILE *maps; + char line[128]; + bool found = false; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); + should_read_vsyscall = true; + return 0; + } + + while (fgets(line, sizeof(line), maps)) { + char r, x; + void *start, *end; + char name[128]; + if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", + &start, &end, &r, &x, name) != 5) + continue; + + if (strcmp(name, "[vsyscall]")) + continue; + + printf("\tvsyscall map: %s", line); + + if (start != (void *)0xffffffffff600000 || + end != (void *)0xffffffffff601000) { + printf("[FAIL]\taddress range is nonsense\n"); + nerrs++; + } + + printf("\tvsyscall permissions are %c-%c\n", r, x); + should_read_vsyscall = (r == 'r'); + if (x != 'x') { + vgtod = NULL; + vtime = NULL; + vgetcpu = NULL; + } + + found = true; + break; + } + + fclose(maps); + + if (!found) { + printf("\tno vsyscall map in /proc/self/maps\n"); + should_read_vsyscall = false; + vgtod = NULL; + vtime = NULL; + vgetcpu = NULL; + } + + return nerrs; +#else + return 0; +#endif +} + +/* syscalls */ +static inline long sys_gtod(struct timeval *tv, struct timezone *tz) +{ + return syscall(SYS_gettimeofday, tv, tz); +} + +static inline int sys_clock_gettime(clockid_t id, struct timespec *ts) +{ + return syscall(SYS_clock_gettime, id, ts); +} + +static inline long sys_time(time_t *t) +{ + return syscall(SYS_time, t); +} + +static inline long sys_getcpu(unsigned * cpu, unsigned * node, + void* cache) +{ + return syscall(SYS_getcpu, cpu, node, cache); +} + +static jmp_buf jmpbuf; + +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) +{ + siglongjmp(jmpbuf, 1); +} + +static double tv_diff(const struct timeval *a, const struct timeval *b) +{ + return (double)(a->tv_sec - b->tv_sec) + + (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6; +} + +static int check_gtod(const struct timeval *tv_sys1, + const struct timeval *tv_sys2, + const struct timezone *tz_sys, + const char *which, + const struct timeval *tv_other, + const struct timezone *tz_other) +{ + int nerrs = 0; + double d1, d2; + + if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) { + printf("[FAIL] %s tz mismatch\n", which); + nerrs++; + } + + d1 = tv_diff(tv_other, tv_sys1); + d2 = tv_diff(tv_sys2, tv_other); + printf("\t%s time offsets: %lf %lf\n", which, d1, d2); + + if (d1 < 0 || d2 < 0) { + printf("[FAIL]\t%s time was inconsistent with the syscall\n", which); + nerrs++; + } else { + printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which); + } + + return nerrs; +} + +static int test_gtod(void) +{ + struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys; + struct timezone tz_sys, tz_vdso, tz_vsys; + long ret_vdso = -1; + long ret_vsys = -1; + int nerrs = 0; + + printf("[RUN]\ttest gettimeofday()\n"); + + if (sys_gtod(&tv_sys1, &tz_sys) != 0) + err(1, "syscall gettimeofday"); + if (vdso_gtod) + ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso); + if (vgtod) + ret_vsys = vgtod(&tv_vsys, &tz_vsys); + if (sys_gtod(&tv_sys2, &tz_sys) != 0) + err(1, "syscall gettimeofday"); + + if (vdso_gtod) { + if (ret_vdso == 0) { + nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); + } else { + printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso); + nerrs++; + } + } + + if (vgtod) { + if (ret_vsys == 0) { + nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); + } else { + printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys); + nerrs++; + } + } + + return nerrs; +} + +static int test_time(void) { + int nerrs = 0; + + printf("[RUN]\ttest time()\n"); + long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0; + long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1; + t_sys1 = sys_time(&t2_sys1); + if (vdso_time) + t_vdso = vdso_time(&t2_vdso); + if (vtime) + t_vsys = vtime(&t2_vsys); + t_sys2 = sys_time(&t2_sys2); + if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) { + printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2); + nerrs++; + return nerrs; + } + + if (vdso_time) { + if (t_vdso < 0 || t_vdso != t2_vdso) { + printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso); + nerrs++; + } else if (t_vdso < t_sys1 || t_vdso > t_sys2) { + printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2); + nerrs++; + } else { + printf("[OK]\tvDSO time() is okay\n"); + } + } + + if (vtime) { + if (t_vsys < 0 || t_vsys != t2_vsys) { + printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys); + nerrs++; + } else if (t_vsys < t_sys1 || t_vsys > t_sys2) { + printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2); + nerrs++; + } else { + printf("[OK]\tvsyscall time() is okay\n"); + } + } + + return nerrs; +} + +static int test_getcpu(int cpu) +{ + int nerrs = 0; + long ret_sys, ret_vdso = -1, ret_vsys = -1; + + printf("[RUN]\tgetcpu() on CPU %d\n", cpu); + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { + printf("[SKIP]\tfailed to force CPU %d\n", cpu); + return nerrs; + } + + unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; + unsigned node = 0; + bool have_node = false; + ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); + if (vdso_getcpu) + ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); + if (vgetcpu) + ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); + + if (ret_sys == 0) { + if (cpu_sys != cpu) { + printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu); + nerrs++; + } + + have_node = true; + node = node_sys; + } + + if (vdso_getcpu) { + if (ret_vdso) { + printf("[FAIL]\tvDSO getcpu() failed\n"); + nerrs++; + } else { + if (!have_node) { + have_node = true; + node = node_vdso; + } + + if (cpu_vdso != cpu) { + printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu); + nerrs++; + } else { + printf("[OK]\tvDSO reported correct CPU\n"); + } + + if (node_vdso != node) { + printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node); + nerrs++; + } else { + printf("[OK]\tvDSO reported correct node\n"); + } + } + } + + if (vgetcpu) { + if (ret_vsys) { + printf("[FAIL]\tvsyscall getcpu() failed\n"); + nerrs++; + } else { + if (!have_node) { + have_node = true; + node = node_vsys; + } + + if (cpu_vsys != cpu) { + printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu); + nerrs++; + } else { + printf("[OK]\tvsyscall reported correct CPU\n"); + } + + if (node_vsys != node) { + printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node); + nerrs++; + } else { + printf("[OK]\tvsyscall reported correct node\n"); + } + } + } + + return nerrs; +} + +static int test_vsys_r(void) +{ +#ifdef __x86_64__ + printf("[RUN]\tChecking read access to the vsyscall page\n"); + bool can_read; + if (sigsetjmp(jmpbuf, 1) == 0) { + *(volatile int *)0xffffffffff600000; + can_read = true; + } else { + can_read = false; + } + + if (can_read && !should_read_vsyscall) { + printf("[FAIL]\tWe have read access, but we shouldn't\n"); + return 1; + } else if (!can_read && should_read_vsyscall) { + printf("[FAIL]\tWe don't have read access, but we should\n"); + return 1; + } else { + printf("[OK]\tgot expected result\n"); + } +#endif + + return 0; +} + + +#ifdef __x86_64__ +#define X86_EFLAGS_TF (1UL << 8) +static volatile sig_atomic_t num_vsyscall_traps; + +static unsigned long get_eflags(void) +{ + unsigned long eflags; + asm volatile ("pushfq\n\tpopq %0" : "=rm" (eflags)); + return eflags; +} + +static void set_eflags(unsigned long eflags) +{ + asm volatile ("pushq %0\n\tpopfq" : : "rm" (eflags) : "flags"); +} + +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + unsigned long ip = ctx->uc_mcontext.gregs[REG_RIP]; + + if (((ip ^ 0xffffffffff600000UL) & ~0xfffUL) == 0) + num_vsyscall_traps++; +} + +static int test_native_vsyscall(void) +{ + time_t tmp; + bool is_native; + + if (!vtime) + return 0; + + printf("[RUN]\tchecking for native vsyscall\n"); + sethandler(SIGTRAP, sigtrap, 0); + set_eflags(get_eflags() | X86_EFLAGS_TF); + vtime(&tmp); + set_eflags(get_eflags() & ~X86_EFLAGS_TF); + + /* + * If vsyscalls are emulated, we expect a single trap in the + * vsyscall page -- the call instruction will trap with RIP + * pointing to the entry point before emulation takes over. + * In native mode, we expect two traps, since whatever code + * the vsyscall page contains will be more than just a ret + * instruction. + */ + is_native = (num_vsyscall_traps > 1); + + printf("\tvsyscalls are %s (%d instructions in vsyscall page)\n", + (is_native ? "native" : "emulated"), + (int)num_vsyscall_traps); + + return 0; +} +#endif + +int main(int argc, char **argv) +{ + int nerrs = 0; + + init_vdso(); + nerrs += init_vsys(); + + nerrs += test_gtod(); + nerrs += test_time(); + nerrs += test_getcpu(0); + nerrs += test_getcpu(1); + + sethandler(SIGSEGV, sigsegv, 0); + nerrs += test_vsys_r(); + +#ifdef __x86_64__ + nerrs += test_native_vsyscall(); +#endif + + return nerrs ? 1 : 0; +} -- cgit v1.2.3 From 36c1681678b507346e7397a235a7303dad665fc3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 11 Jan 2018 18:28:08 +0900 Subject: genksyms: drop *.hash.c from .gitignore This is a left-over of commit bb3290d91695 ("Remove gperf usage from toolchain"). We do not generate a hash function any more. Signed-off-by: Masahiro Yamada --- scripts/genksyms/.gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/genksyms/.gitignore b/scripts/genksyms/.gitignore index 86dc07a01b43..e7836b47f060 100644 --- a/scripts/genksyms/.gitignore +++ b/scripts/genksyms/.gitignore @@ -1,4 +1,3 @@ -*.hash.c *.lex.c *.tab.c *.tab.h -- cgit v1.2.3 From bed6760cf2c40778a58f2e399c8947b3b3c55518 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 12 Jan 2018 16:53:07 -0800 Subject: MAINTAINERS, nilfs2: change project home URLs The domain of NILFS project home was changed to "nilfs.sourceforge.io" to enable https access (the previous domain "nilfs.sourceforge.net" is redirected to the new one). Modify URLs of the project home to reflect this change and to replace their protocol from http to https. Link: http://lkml.kernel.org/r/1515416141-5614-1-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 4 ++-- MAINTAINERS | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index c0727dc36271..f2f3f8592a6f 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -25,8 +25,8 @@ available from the following download page. At least "mkfs.nilfs2", cleaner or garbage collector) are required. Details on the tools are described in the man pages included in the package. -Project web page: http://nilfs.sourceforge.net/ -Download page: http://nilfs.sourceforge.net/en/download.html +Project web page: https://nilfs.sourceforge.io/ +Download page: https://nilfs.sourceforge.io/en/download.html List info: http://vger.kernel.org/vger-lists.html#linux-nilfs Caveats diff --git a/MAINTAINERS b/MAINTAINERS index d76af75a653a..18994806e441 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9638,8 +9638,8 @@ F: include/uapi/linux/sunrpc/ NILFS2 FILESYSTEM M: Ryusuke Konishi L: linux-nilfs@vger.kernel.org -W: http://nilfs.sourceforge.net/ -W: http://nilfs.osdn.jp/ +W: https://nilfs.sourceforge.io/ +W: https://nilfs.osdn.jp/ T: git git://github.com/konis/nilfs2.git S: Supported F: Documentation/filesystems/nilfs2.txt -- cgit v1.2.3 From d9570ee3bd1d4f20ce63485f5ef05663866fe6c0 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 12 Jan 2018 16:53:10 -0800 Subject: kmemleak: allow to coexist with fault injection kmemleak does one slab allocation per user allocation. So if slab fault injection is enabled to any degree, kmemleak instantly fails to allocate and turns itself off. However, it's useful to use kmemleak with fault injection to find leaks on error paths. On the other hand, checking kmemleak itself is not so useful because (1) it's a debugging tool and (2) it has a very regular allocation pattern (basically a single allocation site, so it either works or not). Turn off fault injection for kmemleak allocations. Link: http://lkml.kernel.org/r/20180109192243.19316-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d73c14294f3a..f656ca27f6c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -127,7 +127,7 @@ /* GFP bitmask for kmemleak internal allocations */ #define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ - __GFP_NOWARN) + __GFP_NOWARN | __GFP_NOFAIL) /* scanning area inside a memory block */ struct kmemleak_scan_area { -- cgit v1.2.3 From a0b1280368d1e91ab72f849ef095b4f07a39bbf1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 12 Jan 2018 16:53:14 -0800 Subject: kdump: write correct address of mem_section into vmcoreinfo Depending on configuration mem_section can now be an array or a pointer to an array allocated dynamically. In most cases, we can continue to refer to it as 'mem_section' regardless of what it is. But there's one exception: '&mem_section' means "address of the array" if mem_section is an array, but if mem_section is a pointer, it would mean "address of the pointer". We've stepped onto this in kdump code. VMCOREINFO_SYMBOL(mem_section) writes down address of pointer into vmcoreinfo, not array as we wanted. Let's introduce VMCOREINFO_SYMBOL_ARRAY() that would handle the situation correctly for both cases. Link: http://lkml.kernel.org/r/20180112162532.35896-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") Acked-by: Baoquan He Acked-by: Dave Young Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 2 ++ kernel/crash_core.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 06097ef30449..b511f6d24b42 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -42,6 +42,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("PAGESIZE=%ld\n", value) #define VMCOREINFO_SYMBOL(name) \ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) #define VMCOREINFO_SIZE(name) \ vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ (unsigned long)sizeof(name)) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b3663896278e..4f63597c824d 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); -- cgit v1.2.3 From 0f908ccbeca99ddf0ad60afa710e72aded4a5ea7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 12 Jan 2018 16:53:17 -0800 Subject: tools/objtool/Makefile: don't assume sync-check.sh is executable patch(1) loses the x bit. So if a user follows our patching instructions in Documentation/admin-guide/README.rst, their kernel will not compile. Fixes: 3bd51c5a371de ("objtool: Move kernel headers/code sync check to a script") Reported-by: Nicolas Bock Reported-by Joakim Tjernlund Cc: Ingo Molnar Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/objtool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index ae0272f9a091..e6acc281dd37 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -46,7 +46,7 @@ $(OBJTOOL_IN): fixdep FORCE @$(MAKE) $(build)=objtool $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) - @./sync-check.sh + @$(CONFIG_SHELL) ./sync-check.sh $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ -- cgit v1.2.3 From f10ee3dcc9f0aba92a5c4c064628be5200765dc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 00:23:57 +0100 Subject: x86/pti: Fix !PCID and sanitize defines The switch to the user space page tables in the low level ASM code sets unconditionally bit 12 and bit 11 of CR3. Bit 12 is switching the base address of the page directory to the user part, bit 11 is switching the PCID to the PCID associated with the user page tables. This fails on a machine which lacks PCID support because bit 11 is set in CR3. Bit 11 is reserved when PCID is inactive. While the Intel SDM claims that the reserved bits are ignored when PCID is disabled, the AMD APM states that they should be cleared. This went unnoticed as the AMD APM was not checked when the code was developed and reviewed and test systems with Intel CPUs never failed to boot. The report is against a Centos 6 host where the guest fails to boot, so it's not yet clear whether this is a virt issue or can happen on real hardware too, but thats irrelevant as the AMD APM clearly ask for clearing the reserved bits. Make sure that on non PCID machines bit 11 is not set by the page table switching code. Andy suggested to rename the related bits and masks so they are clearly describing what they should be used for, which is done as well for clarity. That split could have been done with alternatives but the macro hell is horrible and ugly. This can be done on top if someone cares to remove the extra orq. For now it's a straight forward fix. Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") Reported-by: Laura Abbott Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Willy Tarreau Cc: David Woodhouse Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801140009150.2371@nanos --- arch/x86/entry/calling.h | 36 ++++++++++++++++++---------------- arch/x86/include/asm/processor-flags.h | 2 +- arch/x86/include/asm/tlbflush.h | 6 +++--- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 45a63e00a6af..3f48f695d5e6 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -198,8 +198,11 @@ For 32-bit we have the following conventions - kernel is built with * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two * halves: */ -#define PTI_SWITCH_PGTABLES_MASK (1<= (1 << X86_CR3_PTI_SWITCH_BIT)); + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); /* * The ASID being passed in here should have respected the * MAX_ASID_AVAILABLE and thus never have the switch bit set. */ - VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); #endif /* * The dynamically-assigned ASIDs that get passed in are small @@ -112,7 +112,7 @@ static inline u16 user_pcid(u16 asid) { u16 ret = kern_pcid(asid); #ifdef CONFIG_PAGE_TABLE_ISOLATION - ret |= 1 << X86_CR3_PTI_SWITCH_BIT; + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; #endif return ret; } -- cgit v1.2.3 From a237f762681e2a394ca67f21df2feb2b76a3609b Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Fri, 12 Jan 2018 15:24:59 -0800 Subject: security/Kconfig: Correct the Documentation reference for PTI When the config option for PTI was added a reference to documentation was added as well. But the documentation did not exist at that point. The final documentation has a different file name. Fix it up to point to the proper file. Fixes: 385ce0ea ("x86/mm/pti: Add Kconfig") Signed-off-by: W. Trevor King Signed-off-by: Thomas Gleixner Cc: Dave Hansen Cc: linux-mm@kvack.org Cc: linux-security-module@vger.kernel.org Cc: James Morris Cc: "Serge E. Hallyn" Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/3009cc8ccbddcd897ec1e0cb6dda524929de0d14.1515799398.git.wking@tremily.us --- security/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/Kconfig b/security/Kconfig index 3d4debd0257e..b0cb9a5f9448 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -63,7 +63,7 @@ config PAGE_TABLE_ISOLATION ensuring that the majority of kernel addresses are not mapped into userspace. - See Documentation/x86/pagetable-isolation.txt for more details. + See Documentation/x86/pti.txt for more details. config SECURITY_INFINIBAND bool "Infiniband Security Hooks" -- cgit v1.2.3 From 99a9dc98ba52267ce5e062b52de88ea1f1b2a7d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 14 Jan 2018 11:27:13 +0100 Subject: x86,perf: Disable intel_bts when PTI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intel_bts driver does not use the 'normal' BTS buffer which is exposed through the cpu_entry_area but instead uses the memory allocated for the perf AUX buffer. This obviously comes apart when using PTI because then the kernel mapping; which includes that AUX buffer memory; disappears. Fixing this requires to expose a mapping which is visible in all context and that's not trivial. As a quick fix disable this driver when PTI is enabled to prevent malfunction. Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") Reported-by: Vince Weaver Reported-by: Robert ÅšwiÄ™cki Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: greg@kroah.com Cc: hughd@google.com Cc: luto@amacapital.net Cc: Vince Weaver Cc: torvalds@linux-foundation.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180114102713.GB6166@worktop.programming.kicks-ass.net --- arch/x86/events/intel/bts.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 141e07b06216..24ffa1e88cf9 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -582,6 +582,24 @@ static __init int bts_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) return -ENODEV; + if (boot_cpu_has(X86_FEATURE_PTI)) { + /* + * BTS hardware writes through a virtual memory map we must + * either use the kernel physical map, or the user mapping of + * the AUX buffer. + * + * However, since this driver supports per-CPU and per-task inherit + * we cannot use the user mapping since it will not be availble + * if we're not running the owning process. + * + * With PTI we can't use the kernal map either, because its not + * there when we run userspace. + * + * For now, disable this driver when using PTI. + */ + return -ENODEV; + } + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; -- cgit v1.2.3 From b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 22:13:29 +0100 Subject: x86/retpoline: Remove compile time warning Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler does not have retpoline support. Linus rationale for this is: It's wrong because it will just make people turn off RETPOLINE, and the asm updates - and return stack clearing - that are independent of the compiler are likely the most important parts because they are likely the ones easiest to target. And it's annoying because most people won't be able to do anything about it. The number of people building their own compiler? Very small. So if their distro hasn't got a compiler yet (and pretty much nobody does), the warning is just annoying crap. It is already properly reported as part of the sysfs interface. The compile-time warning only encourages bad things. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Requested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=GEcCDQg5uAtw@mail.gmail.com --- arch/x86/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 974c61864978..504b1a4535ac 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -240,8 +240,6 @@ ifdef CONFIG_RETPOLINE RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) ifneq ($(RETPOLINE_CFLAGS),) KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE - else - $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) endif endif -- cgit v1.2.3 From a8750ddca918032d6349adbf9a4b6555e7db20da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Jan 2018 15:32:30 -0800 Subject: Linux 4.15-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c4aa6210a2a4..bf5b8cbb9469 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Fearless Coyote # *DOCUMENTATION* -- cgit v1.2.3 From a48a52b7bea81c046fe1c1288f84d0eba214cba0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 09:12:05 +0100 Subject: cfg80211: fully initialize old channel for event Paul reported that he got a report about undefined behaviour that seems to me to originate in using uninitialized memory when the channel structure here is used in the event code in nl80211 later. He never reported whether this fixed it, and I wasn't able to trigger this so far, but we should do the right thing and fully initialize the on-stack structure anyway. Reported-by: Paul Menzel Signed-off-by: Johannes Berg --- net/wireless/reg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 78e71b0390be..7b42f0bacfd8 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1769,8 +1769,7 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, if (wiphy->regulatory_flags & REGULATORY_DISABLE_BEACON_HINTS) return; - chan_before.center_freq = chan->center_freq; - chan_before.flags = chan->flags; + chan_before = *chan; if (chan->flags & IEEE80211_CHAN_NO_IR) { chan->flags &= ~IEEE80211_CHAN_NO_IR; -- cgit v1.2.3 From 7a94b8c2eee7083ddccd0515830f8c81a8e44b1a Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Mon, 15 Jan 2018 08:12:15 +0100 Subject: nl80211: take RCU read lock when calling ieee80211_bss_get_ie() As ieee80211_bss_get_ie() derefences an RCU to return ssid_ie, both the call to this function and any operation on this variable need protection by the RCU read lock. Fixes: 44905265bc15 ("nl80211: don't expose wdev->ssid for most interfaces") Signed-off-by: Dominik Brodowski Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2b3dbcd40e46..ed87a97fcb0b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2618,12 +2618,13 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag const u8 *ssid_ie; if (!wdev->current_bss) break; + rcu_read_lock(); ssid_ie = ieee80211_bss_get_ie(&wdev->current_bss->pub, WLAN_EID_SSID); - if (!ssid_ie) - break; - if (nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) - goto nla_put_failure_locked; + if (ssid_ie && + nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2)) + goto nla_put_failure_rcu_locked; + rcu_read_unlock(); break; } default: @@ -2635,6 +2636,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag genlmsg_end(msg, hdr); return 0; + nla_put_failure_rcu_locked: + rcu_read_unlock(); nla_put_failure_locked: wdev_unlock(wdev); nla_put_failure: -- cgit v1.2.3 From b71d856ab536f25eb97c011a351ecddf5518de41 Mon Sep 17 00:00:00 2001 From: Benjamin Beichler Date: Wed, 10 Jan 2018 17:42:51 +0100 Subject: mac80211_hwsim: add workqueue to wait for deferred radio deletion on mod unload When closing multiple wmediumd instances with many radios and try to unload the mac80211_hwsim module, it may happen that the work items live longer than the module. To wait especially for this deletion work items, add a work queue, otherwise flush_scheduled_work would be necessary. Signed-off-by: Benjamin Beichler Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index e8189c07b41f..ccd573e53c92 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -489,6 +489,7 @@ static const struct ieee80211_iface_combination hwsim_if_comb_p2p_dev[] = { static spinlock_t hwsim_radio_lock; static LIST_HEAD(hwsim_radios); +static struct workqueue_struct *hwsim_wq; static int hwsim_radio_idx; static struct platform_driver mac80211_hwsim_driver = { @@ -3342,7 +3343,7 @@ static void remove_user_radios(u32 portid) if (entry->destroy_on_close && entry->portid == portid) { list_del(&entry->list); INIT_WORK(&entry->destroy_work, destroy_radio); - schedule_work(&entry->destroy_work); + queue_work(hwsim_wq, &entry->destroy_work); } } spin_unlock_bh(&hwsim_radio_lock); @@ -3417,7 +3418,7 @@ static void __net_exit hwsim_exit_net(struct net *net) list_del(&data->list); INIT_WORK(&data->destroy_work, destroy_radio); - schedule_work(&data->destroy_work); + queue_work(hwsim_wq, &data->destroy_work); } spin_unlock_bh(&hwsim_radio_lock); } @@ -3449,6 +3450,10 @@ static int __init init_mac80211_hwsim(void) spin_lock_init(&hwsim_radio_lock); + hwsim_wq = alloc_workqueue("hwsim_wq",WQ_MEM_RECLAIM,0); + if (!hwsim_wq) + return -ENOMEM; + err = register_pernet_device(&hwsim_net_ops); if (err) return err; @@ -3587,8 +3592,11 @@ static void __exit exit_mac80211_hwsim(void) hwsim_exit_netlink(); mac80211_hwsim_free(); + flush_workqueue(hwsim_wq); + unregister_netdev(hwsim_mon); platform_driver_unregister(&mac80211_hwsim_driver); unregister_pernet_device(&hwsim_net_ops); + destroy_workqueue(hwsim_wq); } module_exit(exit_mac80211_hwsim); -- cgit v1.2.3 From 51a1aaa631c90223888d8beac4d649dc11d2ca55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 09:32:36 +0100 Subject: mac80211_hwsim: validate number of different channels When creating a new radio on the fly, hwsim allows this to be done with an arbitrary number of channels, but cfg80211 only supports a limited number of simultaneous channels, leading to a warning. Fix this by validating the number - this requires moving the define for the maximum out to a visible header file. Reported-by: syzbot+8dd9051ff19940290931@syzkaller.appspotmail.com Fixes: b59ec8dd4394 ("mac80211_hwsim: fix number of channels in interface combinations") Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 5 +++++ include/net/cfg80211.h | 2 ++ net/wireless/core.h | 2 -- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index ccd573e53c92..f6d4a50f1bdb 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3121,6 +3121,11 @@ static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) if (info->attrs[HWSIM_ATTR_CHANNELS]) param.channels = nla_get_u32(info->attrs[HWSIM_ATTR_CHANNELS]); + if (param.channels > CFG80211_MAX_NUM_DIFFERENT_CHANNELS) { + GENL_SET_ERR_MSG(info, "too many channels specified"); + return -EINVAL; + } + if (info->attrs[HWSIM_ATTR_NO_VIF]) param.no_vif = true; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb4d92b79cd9..fb94a8bd8ab5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -815,6 +815,8 @@ struct cfg80211_csa_settings { u8 count; }; +#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 + /** * struct iface_combination_params - input parameters for interface combinations * diff --git a/net/wireless/core.h b/net/wireless/core.h index d2f7e8b8a097..eaff636169c2 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -507,8 +507,6 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); -#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 - #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) #else -- cgit v1.2.3 From 59b179b48ce2a6076448a44531242ac2b3f6cef2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 09:58:27 +0100 Subject: cfg80211: check dev_set_name() return value syzbot reported a warning from rfkill_alloc(), and after a while I think that the reason is that it was doing fault injection and the dev_set_name() failed, leaving the name NULL, and we didn't check the return value and got to rfkill_alloc() with a NULL name. Since we really don't want a NULL name, we ought to check the return value. Fixes: fb28ad35906a ("net: struct device - replace bus_id with dev_name(), dev_set_name()") Reported-by: syzbot+1ddfb3357e1d7bb5b5d3@syzkaller.appspotmail.com Signed-off-by: Johannes Berg --- net/wireless/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/wireless/core.c b/net/wireless/core.c index fdde0d98fde1..a6f3cac8c640 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -439,6 +439,8 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, if (rv) goto use_default_name; } else { + int rv; + use_default_name: /* NOTE: This is *probably* safe w/out holding rtnl because of * the restrictions on phy names. Probably this call could @@ -446,7 +448,11 @@ use_default_name: * phyX. But, might should add some locking and check return * value, and use a different name if this one exists? */ - dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + rv = dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + if (rv < 0) { + kfree(rdev); + return NULL; + } } INIT_LIST_HEAD(&rdev->wiphy.wdev_list); -- cgit v1.2.3 From a0e3a18f4baf8e3754ac1e56f0ade924d0c0c721 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 15 Jan 2018 10:47:09 -0500 Subject: ring-buffer: Bring back context level recursive checks Commit 1a149d7d3f45 ("ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler") replaced the context level recursion checks with a simple counter. This would prevent the ring buffer code from recursively calling itself more than the max number of contexts that exist (Normal, softirq, irq, nmi). But this change caused a lockup in a specific case, which was during suspend and resume using a global clock. Adding a stack dump to see where this occurred, the issue was in the trace global clock itself: trace_buffer_lock_reserve+0x1c/0x50 __trace_graph_entry+0x2d/0x90 trace_graph_entry+0xe8/0x200 prepare_ftrace_return+0x69/0xc0 ftrace_graph_caller+0x78/0xa8 queued_spin_lock_slowpath+0x5/0x1d0 trace_clock_global+0xb0/0xc0 ring_buffer_lock_reserve+0xf9/0x390 The function graph tracer traced queued_spin_lock_slowpath that was called by trace_clock_global. This pointed out that the trace_clock_global() is not reentrant, as it takes a spin lock. It depended on the ring buffer recursive lock from letting that happen. By removing the context detection and adding just a max number of allowable recursions, it allowed the trace_clock_global() to be entered again and try to retake the spinlock it already held, causing a deadlock. Fixes: 1a149d7d3f45 ("ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler") Reported-by: David Weinehall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 62 +++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ab18995ff1e..0cddf60186da 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2534,29 +2534,59 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. There are four - * different contexts that we need to consider. + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. * - * Normal context. - * SoftIRQ context - * IRQ context - * NMI context + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. * - * If for some reason the ring buffer starts to recurse, we - * only allow that to happen at most 4 times (one for each - * context). If it happens 5 times, then we consider this a - * recusive loop and do not let it go further. + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. */ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - if (cpu_buffer->current_context >= 4) + unsigned int val = cpu_buffer->current_context; + unsigned long pc = preempt_count(); + int bit; + + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + bit = RB_CTX_NORMAL; + else + bit = pc & NMI_MASK ? RB_CTX_NMI : + pc & HARDIRQ_MASK ? RB_CTX_IRQ : + pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ; + + if (unlikely(val & (1 << bit))) return 1; - cpu_buffer->current_context++; - /* Interrupts must see this update */ - barrier(); + val |= (1 << bit); + cpu_buffer->current_context = val; return 0; } @@ -2564,9 +2594,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - /* Don't let the dec leak out */ - barrier(); - cpu_buffer->current_context--; + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } /** -- cgit v1.2.3 From d542296a4d0d9f41d0186edcac2baba1b674d02f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 8 Jan 2018 08:23:18 -0800 Subject: 9p: add missing module license for xen transport The 9P of Xen module is missing required license and module information. See https://bugzilla.kernel.org/show_bug.cgi?id=198109 Reported-by: Alan Bartlett Fixes: 868eb122739a ("xen/9pfs: introduce Xen 9pfs transport driver") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/9p/trans_xen.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 325c56043007..086a4abdfa7c 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -543,3 +543,7 @@ static void p9_trans_xen_exit(void) return xenbus_unregister_driver(&xen_9pfs_front_driver); } module_exit(p9_trans_xen_exit); + +MODULE_AUTHOR("Stefano Stabellini "); +MODULE_DESCRIPTION("Xen Transport for 9P"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 66940f35d5a81d5969bb5543171c70a434fc5110 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 10 Jan 2018 16:03:05 +0200 Subject: ptr_ring: document usage around __ptr_ring_peek This explains why is the net usage of __ptr_ring_peek actually ok without locks. Signed-off-by: Michael S. Tsirkin Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6866df4f31b5..d72b2e7dd500 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -174,6 +174,15 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) * if they dereference the pointer - see e.g. PTR_RING_PEEK_CALL. * If ring is never resized, and if the pointer is merely * tested, there's no need to take the lock - see e.g. __ptr_ring_empty. + * However, if called outside the lock, and if some other CPU + * consumes ring entries at the same time, the value returned + * is not guaranteed to be correct. + * In this case - to avoid incorrectly detecting the ring + * as empty - the CPU consuming the ring entries is responsible + * for either consuming all ring entries until the ring is empty, + * or synchronizing with some other CPU and causing it to + * execute __ptr_ring_peek and/or consume the ring enteries + * after the synchronization point. */ static inline void *__ptr_ring_peek(struct ptr_ring *r) { @@ -182,10 +191,7 @@ static inline void *__ptr_ring_peek(struct ptr_ring *r) return NULL; } -/* Note: callers invoking this in a loop must use a compiler barrier, - * for example cpu_relax(). Callers must take consumer_lock - * if the ring is ever resized - see e.g. ptr_ring_empty. - */ +/* See __ptr_ring_peek above for locking rules. */ static inline bool __ptr_ring_empty(struct ptr_ring *r) { return !__ptr_ring_peek(r); -- cgit v1.2.3 From 0171c41835591e9aa2e384b703ef9a6ae367c610 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 10 Jan 2018 16:24:45 +0100 Subject: ppp: unlock all_ppp_mutex before registering device ppp_dev_uninit(), which is the .ndo_uninit() handler of PPP devices, needs to lock pn->all_ppp_mutex. Therefore we mustn't call register_netdevice() with pn->all_ppp_mutex already locked, or we'd deadlock in case register_netdevice() fails and calls .ndo_uninit(). Fortunately, we can unlock pn->all_ppp_mutex before calling register_netdevice(). This lock protects pn->units_idr, which isn't used in the device registration process. However, keeping pn->all_ppp_mutex locked during device registration did ensure that no device in transient state would be published in pn->units_idr. In practice, unlocking it before calling register_netdevice() doesn't change this property: ppp_unit_register() is called with 'ppp_mutex' locked and all searches done in pn->units_idr hold this lock too. Fixes: 8cb775bc0a34 ("ppp: fix device unregistration upon netns deletion") Reported-and-tested-by: syzbot+367889b9c9e279219175@syzkaller.appspotmail.com Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index d8e5747ff4e3..264d4af0bf69 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1006,17 +1006,18 @@ static int ppp_unit_register(struct ppp *ppp, int unit, bool ifname_is_set) if (!ifname_is_set) snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ppp->file.index); + mutex_unlock(&pn->all_ppp_mutex); + ret = register_netdevice(ppp->dev); if (ret < 0) goto err_unit; atomic_inc(&ppp_unit_count); - mutex_unlock(&pn->all_ppp_mutex); - return 0; err_unit: + mutex_lock(&pn->all_ppp_mutex); unit_put(&pn->units_idr, ppp->file.index); err: mutex_unlock(&pn->all_ppp_mutex); -- cgit v1.2.3 From 6200b430220f3b9207861b16f57916950f4ecd8e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jan 2018 17:30:22 +0100 Subject: net: cs89x0: add MODULE_LICENSE This driver lacks a MODULE_LICENSE tag, leading to a Kbuild warning: WARNING: modpost: missing MODULE_LICENSE() in drivers/net/ethernet/cirrus/cs89x0.o This adds license, author, and description according to the comment block at the start of the file. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/cirrus/cs89x0.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c index 410a0a95130b..b3e7fafee3df 100644 --- a/drivers/net/ethernet/cirrus/cs89x0.c +++ b/drivers/net/ethernet/cirrus/cs89x0.c @@ -1913,3 +1913,7 @@ static struct platform_driver cs89x0_driver = { module_platform_driver_probe(cs89x0_driver, cs89x0_platform_probe); #endif /* CONFIG_CS89x0_PLATFORM */ + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Crystal Semiconductor (Now Cirrus Logic) CS89[02]0 network driver"); +MODULE_AUTHOR("Russell Nelson "); -- cgit v1.2.3 From 749439bfac6e1a2932c582e2699f91d329658196 Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Wed, 10 Jan 2018 12:45:10 -0500 Subject: ipv6: fix udpv6 sendmsg crash caused by too small MTU The logic in __ip6_append_data() assumes that the MTU is at least large enough for the headers. A device's MTU may be adjusted after being added while sendmsg() is processing data, resulting in __ip6_append_data() seeing any MTU. For an mtu smaller than the size of the fragmentation header, the math results in a negative 'maxfraglen', which causes problems when refragmenting any previous skb in the skb_write_queue, leaving it possibly malformed. Instead sendmsg returns EINVAL when the mtu is calculated to be less than IPV6_MIN_MTU. Found by syzkaller: kernel BUG at ./include/linux/skbuff.h:2064! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 14216 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d0b68580 task.stack: ffff8801ac6b8000 RIP: 0010:__skb_pull include/linux/skbuff.h:2064 [inline] RIP: 0010:__ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: 0018:ffff8801ac6bf570 EFLAGS: 00010216 RAX: 0000000000010000 RBX: 0000000000000028 RCX: ffffc90003cce000 RDX: 00000000000001b8 RSI: ffffffff839df06f RDI: ffff8801d9478ca0 RBP: ffff8801ac6bf780 R08: ffff8801cc3f1dbc R09: 0000000000000000 R10: ffff8801ac6bf7a0 R11: 43cb4b7b1948a9e7 R12: ffff8801cc3f1dc8 R13: ffff8801cc3f1d40 R14: 0000000000001036 R15: dffffc0000000000 FS: 00007f43d740c700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7834984000 CR3: 00000001d79b9000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6_finish_skb include/net/ipv6.h:911 [inline] udp_v6_push_pending_frames+0x255/0x390 net/ipv6/udp.c:1093 udpv6_sendmsg+0x280d/0x31a0 net/ipv6/udp.c:1363 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x352/0x5a0 net/socket.c:1750 SyS_sendto+0x40/0x50 net/socket.c:1718 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512e9 RSP: 002b:00007f43d740bc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000007180a8 RCX: 00000000004512e9 RDX: 000000000000002e RSI: 0000000020d08000 RDI: 0000000000000005 RBP: 0000000000000086 R08: 00000000209c1000 R09: 000000000000001c R10: 0000000000040800 R11: 0000000000000216 R12: 00000000004b9c69 R13: 00000000ffffffff R14: 0000000000000005 R15: 00000000202c2000 Code: 9e 01 fe e9 c5 e8 ff ff e8 7f 9e 01 fe e9 4a ea ff ff 48 89 f7 e8 52 9e 01 fe e9 aa eb ff ff e8 a8 b6 cf fd 0f 0b e8 a1 b6 cf fd <0f> 0b 49 8d 45 78 4d 8d 45 7c 48 89 85 78 fe ff ff 49 8d 85 ba RIP: __skb_pull include/linux/skbuff.h:2064 [inline] RSP: ffff8801ac6bf570 RIP: __ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: ffff8801ac6bf570 Reported-by: syzbot Signed-off-by: Mike Maloney Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 688ba5f7516b..8fe58a2d305c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1206,14 +1206,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } + if (mtu < IPV6_MIN_MTU) + return -EINVAL; cork->base.fragsize = mtu; if (dst_allfrag(rt->dst.path)) cork->base.flags |= IPCORK_ALLFRAG; -- cgit v1.2.3 From 59b36613e85fb16ebf9feaf914570879cd5c2a21 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 10 Jan 2018 12:50:25 -0800 Subject: tipc: fix a memory leak in tipc_nl_node_get_link() When tipc_node_find_by_name() fails, the nlmsg is not freed. While on it, switch to a goto label to properly free it. Fixes: be9c086715c ("tipc: narrow down exposure of struct tipc_node") Reported-by: Dmitry Vyukov Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/node.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index 507017fe0f1b..9036d8756e73 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1880,36 +1880,38 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg); - if (err) { - nlmsg_free(msg.skb); - return err; - } + if (err) + goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); - if (!node) - return -EINVAL; + if (!node) { + err = -EINVAL; + goto err_free; + } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); - nlmsg_free(msg.skb); - return -EINVAL; + err = -EINVAL; + goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); - if (err) { - nlmsg_free(msg.skb); - return err; - } + if (err) + goto err_free; } return genlmsg_reply(msg.skb, info); + +err_free: + nlmsg_free(msg.skb); + return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) -- cgit v1.2.3 From cbbdf8433a5f117b1a2119ea30fc651b61ef7570 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 10 Jan 2018 13:00:39 -0800 Subject: netlink: extack needs to be reset each time through loop syzbot triggered the WARN_ON in netlink_ack testing the bad_attr value. The problem is that netlink_rcv_skb loops over the skb repeatedly invoking the callback and without resetting the extack leaving potentially stale data. Initializing each time through avoids the WARN_ON. Fixes: 2d4bc93368f5a ("netlink: extended ACK reporting") Reported-by: syzbot+315fa6766d0f7c359327@syzkaller.appspotmail.com Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 79cc1bf36e4a..47ef2d8683d6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2384,7 +2384,7 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { - struct netlink_ext_ack extack = {}; + struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; @@ -2405,6 +2405,7 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; + memset(&extack, 0, sizeof(extack)); err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; -- cgit v1.2.3 From 6503a30440962f1e1ccb8868816b4e18201218d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 11 Jan 2018 18:36:26 +0900 Subject: net: ipv4: Make "ip route get" match iif lo rules again. Commit 3765d35ed8b9 ("net: ipv4: Convert inet_rtm_getroute to rcu versions of route lookup") broke "ip route get" in the presence of rules that specify iif lo. Host-originated traffic always has iif lo, because ip_route_output_key_hash and ip6_route_output_flags set the flow iif to LOOPBACK_IFINDEX. Thus, putting "iif lo" in an ip rule is a convenient way to select only originated traffic and not forwarded traffic. inet_rtm_getroute used to match these rules correctly because even though it sets the flow iif to 0, it called ip_route_output_key which overwrites iif with LOOPBACK_IFINDEX. But now that it calls ip_route_output_key_hash_rcu, the ifindex will remain 0 and not match the iif lo in the rule. As a result, "ip route get" will return ENETUNREACH. Fixes: 3765d35ed8b9 ("net: ipv4: Convert inet_rtm_getroute to rcu versions of route lookup") Tested: https://android.googlesource.com/kernel/tests/+/master/net/test/multinetwork_test.py passes again Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 43b69af242e1..4e153b23bcec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2762,6 +2762,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { + fl4.flowi4_iif = LOOPBACK_IFINDEX; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) -- cgit v1.2.3 From 37f47bc90c7481e7959703ad1defc4fc9f5d85e3 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 11 Jan 2018 14:22:06 -0200 Subject: sctp: avoid compiler warning on implicit fallthru These fall-through are expected. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 1 + net/sctp/outqueue.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 3b18085e3b10..5d4c15bf66d2 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -826,6 +826,7 @@ static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) case AF_INET: if (!__ipv6_only_sock(sctp_opt2sk(sp))) return 1; + /* fallthru */ default: return 0; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 7d67feeeffc1..c4ec99b20150 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -918,9 +918,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) break; case SCTP_CID_ABORT: - if (sctp_test_T_bit(chunk)) { + if (sctp_test_T_bit(chunk)) packet->vtag = asoc->c.my_vtag; - } + /* fallthru */ /* The following chunks are "response" chunks, i.e. * they are generated in response to something we * received. If we are sending these, then we can -- cgit v1.2.3 From 68e76e034b6b1c1ce2eece1ab8ae4008e14be470 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jan 2018 11:07:27 -0800 Subject: tracing: Prevent PROFILE_ALL_BRANCHES when FORTIFY_SOURCE=y I regularly get 50 MB - 60 MB files during kernel randconfig builds. These large files mostly contain (many repeats of; e.g., 124,594): In file included from ../include/linux/string.h:6:0, from ../include/linux/uuid.h:20, from ../include/linux/mod_devicetable.h:13, from ../scripts/mod/devicetable-offsets.c:3: ../include/linux/compiler.h:64:4: warning: '______f' is static but declared in inline function 'strcpy' which is not static [enabled by default] ______f = { \ ^ ../include/linux/compiler.h:56:23: note: in expansion of macro '__trace_if' ^ ../include/linux/string.h:425:2: note: in expansion of macro 'if' if (p_size == (size_t)-1 && q_size == (size_t)-1) ^ This only happens when CONFIG_FORTIFY_SOURCE=y and CONFIG_PROFILE_ALL_BRANCHES=y, so prevent PROFILE_ALL_BRANCHES if FORTIFY_SOURCE=y. Link: http://lkml.kernel.org/r/9199446b-a141-c0c3-9678-a3f9107f2750@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 904c952ac383..f54dc62b599c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -355,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES - bool "Profile all if conditionals" + bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () -- cgit v1.2.3 From 95ef498d977bf44ac094778fd448b98af158a3e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2018 22:31:18 -0800 Subject: ipv6: ip6_make_skb() needs to clear cork.base.dst In my last patch, I missed fact that cork.base.dst was not initialized in ip6_make_skb() : If ip6_setup_cork() returns an error, we might attempt a dst_release() on some random pointer. Fixes: 862c03ee1deb ("ipv6: fix possible mem leaks in ipv6_make_skb()") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8fe58a2d305c..4f7d8de56611 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1735,6 +1735,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.flags = 0; cork.base.addr = 0; cork.base.opt = NULL; + cork.base.dst = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); if (err) { -- cgit v1.2.3 From 30be8f8dba1bd2aff73e8447d59228471233a3d4 Mon Sep 17 00:00:00 2001 From: "r.hering@avm.de" Date: Fri, 12 Jan 2018 15:42:06 +0100 Subject: net/tls: Fix inverted error codes to avoid endless loop sendfile() calls can hang endless with using Kernel TLS if a socket error occurs. Socket error codes must be inverted by Kernel TLS before returning because they are stored with positive sign. If returned non-inverted they are interpreted as number of bytes sent, causing endless looping of the splice mechanic behind sendfile(). Signed-off-by: Robert Hering Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- net/tls/tls_sw.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 936cfc5cab7d..9185e53a743c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -170,7 +170,7 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) static inline void tls_err_abort(struct sock *sk) { - sk->sk_err = -EBADMSG; + sk->sk_err = EBADMSG; sk->sk_error_report(sk); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 73d19210dd49..9773571b6a34 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -391,7 +391,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) while (msg_data_left(msg)) { if (sk->sk_err) { - ret = sk->sk_err; + ret = -sk->sk_err; goto send_end; } @@ -544,7 +544,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, size_t copy, required_size; if (sk->sk_err) { - ret = sk->sk_err; + ret = -sk->sk_err; goto sendpage_end; } -- cgit v1.2.3 From 95a332088ecb113c2e8753fa3f1df9b0dda9beec Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 12 Jan 2018 12:29:22 -0800 Subject: Revert "openvswitch: Add erspan tunnel support." This reverts commit ceaa001a170e43608854d5290a48064f57b565ed. The OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS attr should be designed as a nested attribute to support all ERSPAN v1 and v2's fields. The current attr is a be32 supporting only one field. Thus, this patch reverts it and later patch will redo it using nested attr. Signed-off-by: William Tu Cc: Jiri Benc Cc: Pravin Shelar Acked-by: Jiri Benc Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 - net/openvswitch/flow_netlink.c | 51 +--------------------------------------- 2 files changed, 1 insertion(+), 51 deletions(-) diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 4265d7f9e1f2..dcfab5e3b55c 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -363,7 +363,6 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, - OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* be32 ERSPAN index. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 624ea74353dd..f143908b651d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -49,7 +49,6 @@ #include #include #include -#include #include "flow_netlink.h" @@ -334,8 +333,7 @@ size_t ovs_tun_key_attr_size(void) * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_DST */ - + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ } static size_t ovs_nsh_key_attr_size(void) @@ -402,7 +400,6 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, - [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; static const struct ovs_len_tbl @@ -634,33 +631,6 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } -static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool log) -{ - unsigned long opt_key_offset; - struct erspan_metadata opts; - - BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - - memset(&opts, 0, sizeof(opts)); - opts.index = nla_get_be32(attr); - - /* Index has only 20-bit */ - if (ntohl(opts.index) & ~INDEX_MASK) { - OVS_NLERR(log, "ERSPAN index number %x too large.", - ntohl(opts.index)); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask); - opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); - SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), - is_mask); - - return 0; -} - static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -768,19 +738,6 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; - case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: - if (opts_type) { - OVS_NLERR(log, "Multiple metadata blocks provided"); - return -EINVAL; - } - - err = erspan_tun_opt_from_nlattr(a, match, is_mask, log); - if (err) - return err; - - tun_flags |= TUNNEL_ERSPAN_OPT; - opts_type = type; - break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -905,10 +862,6 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_ERSPAN_OPT && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, - ((struct erspan_metadata *)tun_opts)->index)) - return -EMSGSIZE; } return 0; @@ -2533,8 +2486,6 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; - case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: - break; } }; -- cgit v1.2.3 From 17d0fb0caa68f2bfd8aaa8125ff15abebfbfa1d7 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 13 Jan 2018 20:22:01 +0300 Subject: sh_eth: fix dumping ARSTR ARSTR is always located at the start of the TSU register region, thus using add_reg() instead of add_tsu_reg() in __sh_eth_get_regs() to dump it causes EDMR or EDSR (depending on the register layout) to be dumped instead of ARSTR. Use the correct condition/macro there... Fixes: 6b4b4fead342 ("sh_eth: Implement ethtool register dump operations") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index b9e2846589f8..53924a4fc31c 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2089,8 +2089,8 @@ static size_t __sh_eth_get_regs(struct net_device *ndev, u32 *buf) add_reg(CSMR); if (cd->select_mii) add_reg(RMII_MII); - add_reg(ARSTR); if (cd->tsu) { + add_tsu_reg(ARSTR); add_tsu_reg(TSU_CTRST); add_tsu_reg(TSU_FWEN0); add_tsu_reg(TSU_FWEN1); -- cgit v1.2.3 From 096b9854c04df86f03b38a97d40b6506e5730919 Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:50 -0800 Subject: net: Allow neigh contructor functions ability to modify the primary_key Use n->primary_key instead of pkey to account for the possibility that a neigh constructor function may have modified the primary_key value. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d1f5fe986edd..7f831711b6e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -532,7 +532,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); @@ -544,7 +544,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { - if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; -- cgit v1.2.3 From cd9ff4de0107c65d69d02253bb25d6db93c3dbc1 Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:51 -0800 Subject: ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY Map all lookup neigh keys to INADDR_ANY for loopback/point-to-point devices to avoid making an entry for every remote ip the device needs to talk to. This used the be the old behavior but became broken in a263b3093641f (ipv4: Make neigh lookups directly in output packet path) and later removed in 0bb4087cbec0 (ipv4: Fix neigh lookup keying over loopback/point-to-point devices) because it was broken. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller --- include/net/arp.h | 3 +++ net/ipv4/arp.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/net/arp.h b/include/net/arp.h index dc8cd47f883b..977aabfcdc03 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -20,6 +20,9 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + key = INADDR_ANY; + return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a8d7c5a9fb05..6c231b43974d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -223,11 +223,16 @@ static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32 *)neigh->primary_key; + __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; + u32 inaddr_any = INADDR_ANY; + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); + + addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { -- cgit v1.2.3 From 6311b7ce42e0c1d6d944bc099dc47e936c20cf11 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 12:42:25 +0100 Subject: netlink: extack: avoid parenthesized string constant warning NL_SET_ERR_MSG() and NL_SET_ERR_MSG_ATTR() lead to the following warning in newer versions of gcc: warning: array initialized from parenthesized string constant Just remove the parentheses, they're not needed in this context since anyway since there can be no operator precendence issues or similar. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/netlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 49b4257ce1ea..f3075d6c7e82 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -85,7 +85,7 @@ struct netlink_ext_ack { * to the lack of an output buffer.) */ #define NL_SET_ERR_MSG(extack, msg) do { \ - static const char __msg[] = (msg); \ + static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) \ @@ -101,7 +101,7 @@ struct netlink_ext_ack { } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) do { \ - static const char __msg[] = (msg); \ + static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ -- cgit v1.2.3 From ae59c3f0b6cfd472fed96e50548a799b8971d876 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 12 Jan 2018 07:58:39 +0200 Subject: RDMA/mlx5: Fix out-of-bound access while querying AH The rdma_ah_find_type() accesses the port array based on an index controlled by userspace. The existing bounds check is after the first use of the index, so userspace can generate an out of bounds access, as shown by the KASN report below. ================================================================== BUG: KASAN: slab-out-of-bounds in to_rdma_ah_attr+0xa8/0x3b0 Read of size 4 at addr ffff880019ae2268 by task ibv_rc_pingpong/409 CPU: 0 PID: 409 Comm: ibv_rc_pingpong Not tainted 4.15.0-rc2-00031-gb60a3faf5b83-dirty #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 Call Trace: dump_stack+0xe9/0x18f print_address_description+0xa2/0x350 kasan_report+0x3a5/0x400 to_rdma_ah_attr+0xa8/0x3b0 mlx5_ib_query_qp+0xd35/0x1330 ib_query_qp+0x8a/0xb0 ib_uverbs_query_qp+0x237/0x7f0 ib_uverbs_write+0x617/0xd80 __vfs_write+0xf7/0x500 vfs_write+0x149/0x310 SyS_write+0xca/0x190 entry_SYSCALL_64_fastpath+0x18/0x85 RIP: 0033:0x7fe9c7a275a0 RSP: 002b:00007ffee5498738 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007fe9c7ce4b00 RCX: 00007fe9c7a275a0 RDX: 0000000000000018 RSI: 00007ffee5498800 RDI: 0000000000000003 RBP: 000055d0c8d3f010 R08: 00007ffee5498800 R09: 0000000000000018 R10: 00000000000000ba R11: 0000000000000246 R12: 0000000000008000 R13: 0000000000004fb0 R14: 000055d0c8d3f050 R15: 00007ffee5498560 Allocated by task 1: __kmalloc+0x3f9/0x430 alloc_mad_private+0x25/0x50 ib_mad_post_receive_mads+0x204/0xa60 ib_mad_init_device+0xa59/0x1020 ib_register_device+0x83a/0xbc0 mlx5_ib_add+0x50e/0x5c0 mlx5_add_device+0x142/0x410 mlx5_register_interface+0x18f/0x210 mlx5_ib_init+0x56/0x63 do_one_initcall+0x15b/0x270 kernel_init_freeable+0x2d8/0x3d0 kernel_init+0x14/0x190 ret_from_fork+0x24/0x30 Freed by task 0: (stack is not available) The buggy address belongs to the object at ffff880019ae2000 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 104 bytes to the right of 512-byte region [ffff880019ae2000, ffff880019ae2200) The buggy address belongs to the page: page:000000005d674e18 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 flags: 0x4000000000008100(slab|head) raw: 4000000000008100 0000000000000000 0000000000000000 00000001000c000c raw: dead000000000100 dead000000000200 ffff88001a402000 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff880019ae2100: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff880019ae2180: 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc >ffff880019ae2200: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff880019ae2280: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff880019ae2300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== Disabling lock debugging due to kernel taint Cc: Fixes: 44c58487d51a ("IB/core: Define 'ib' and 'roce' rdma_ah_attr types") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 31ad28853efa..cffe5966aef9 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4362,12 +4362,11 @@ static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, memset(ah_attr, 0, sizeof(*ah_attr)); - ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port); - rdma_ah_set_port_num(ah_attr, path->port); - if (rdma_ah_get_port_num(ah_attr) == 0 || - rdma_ah_get_port_num(ah_attr) > MLX5_CAP_GEN(dev, num_ports)) + if (!path->port || path->port > MLX5_CAP_GEN(dev, num_ports)) return; + ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port); + rdma_ah_set_port_num(ah_attr, path->port); rdma_ah_set_sl(ah_attr, path->dci_cfi_prio_sl & 0xf); -- cgit v1.2.3 From 3d1661304f0b2b51a8a43785b764822611dbdd53 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 10 Jan 2018 19:39:52 -0600 Subject: ibmvnic: Fix pending MAC address changes Due to architecture limitations, the IBM VNIC client driver is unable to perform MAC address changes unless the device has "logged in" to its backing device. Currently, pending MAC changes are handled before login, resulting in an error and failure to change the MAC address. Moving that chunk to the end of the ibmvnic_login function, when we are sure that it was successful, fixes that. The MAC address can be changed when the device is up or down, so only check if the device is in a "PROBED" state before setting the MAC address. Fixes: c26eba03e407 ("ibmvnic: Update reset infrastructure to support tunable parameters") Signed-off-by: Thomas Falcon Reviewed-by: John Allen Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 1dc4aef37d3a..4b3df17c7a45 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -756,6 +756,12 @@ static int ibmvnic_login(struct net_device *netdev) } } while (adapter->renegotiate); + /* handle pending MAC address changes after successful login */ + if (adapter->mac_change_pending) { + __ibmvnic_set_mac(netdev, &adapter->desired.mac); + adapter->mac_change_pending = false; + } + return 0; } @@ -993,11 +999,6 @@ static int ibmvnic_open(struct net_device *netdev) mutex_lock(&adapter->reset_lock); - if (adapter->mac_change_pending) { - __ibmvnic_set_mac(netdev, &adapter->desired.mac); - adapter->mac_change_pending = false; - } - if (adapter->state != VNIC_CLOSED) { rc = ibmvnic_login(netdev); if (rc) { @@ -1527,7 +1528,7 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) struct ibmvnic_adapter *adapter = netdev_priv(netdev); struct sockaddr *addr = p; - if (adapter->state != VNIC_OPEN) { + if (adapter->state == VNIC_PROBED) { memcpy(&adapter->desired.mac, addr, sizeof(struct sockaddr)); adapter->mac_change_pending = true; return 0; -- cgit v1.2.3 From 625637bf4afa45204bd87e4218645182a919485a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:01:19 +0800 Subject: sctp: reinit stream if stream outcnt has been change by sinit in sendmsg After introducing sctp_stream structure, sctp uses stream->outcnt as the out stream nums instead of c.sinit_num_ostreams. However when users use sinit in cmsg, it only updates c.sinit_num_ostreams in sctp_sendmsg. At that moment, stream->outcnt is still using previous value. If it's value is not updated, the sinit_num_ostreams of sinit could not really work. This patch is to fix it by updating stream->outcnt and reiniting stream if stream outcnt has been change by sinit in sendmsg. Fixes: a83863174a61 ("sctp: prepare asoc stream for stream reconf") Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9b01e994f661..15ae018b386f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1883,8 +1883,14 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) */ if (sinit) { if (sinit->sinit_num_ostreams) { - asoc->c.sinit_num_ostreams = - sinit->sinit_num_ostreams; + __u16 outcnt = sinit->sinit_num_ostreams; + + asoc->c.sinit_num_ostreams = outcnt; + /* outcnt has been changed, so re-init stream */ + err = sctp_stream_init(&asoc->stream, outcnt, 0, + GFP_KERNEL); + if (err) + goto out_free; } if (sinit->sinit_max_instreams) { asoc->c.sinit_max_instreams = -- cgit v1.2.3 From a0ff660058b88d12625a783ce9e5c1371c87951f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:01:36 +0800 Subject: sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf After commit cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep"), it may change to lock another sk if the asoc has been peeled off in sctp_wait_for_sndbuf. However, the asoc's new sk could be already closed elsewhere, as it's in the sendmsg context of the old sk that can't avoid the new sk's closing. If the sk's last one refcnt is held by this asoc, later on after putting this asoc, the new sk will be freed, while under it's own lock. This patch is to revert that commit, but fix the old issue by returning error under the old sk's lock. Fixes: cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep") Reported-by: syzbot+ac6ea7baa4432811eb50@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 15ae018b386f..feb2ca69827a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -85,7 +85,7 @@ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk); + size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1977,7 +1977,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { /* sk can be changed by peel off when waiting for buf. */ - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) { if (err == -ESRCH) { /* asoc is already dead. */ @@ -8022,12 +8022,12 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk) + size_t msg_len) { struct sock *sk = asoc->base.sk; - int err = 0; long current_timeo = *timeo_p; DEFINE_WAIT(wait); + int err = 0; pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); @@ -8056,17 +8056,13 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); - if (sk != asoc->base.sk) { - release_sock(sk); - sk = asoc->base.sk; - lock_sock(sk); - } + if (sk != asoc->base.sk) + goto do_error; *timeo_p = current_timeo; } out: - *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ -- cgit v1.2.3 From c5006b8aa74599ce19104b31d322d2ea9ff887cc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:02:00 +0800 Subject: sctp: do not allow the v4 socket to bind a v4mapped v6 address The check in sctp_sockaddr_af is not robust enough to forbid binding a v4mapped v6 addr on a v4 socket. The worse thing is that v4 socket's bind_verify would not convert this v4mapped v6 addr to a v4 addr. syzbot even reported a crash as the v4 socket bound a v6 addr. This patch is to fix it by doing the common sa.sa_family check first, then AF_INET check for v4mapped v6 addrs. Fixes: 7dab83de50c7 ("sctp: Support ipv6only AF_INET6 sockets.") Reported-by: syzbot+7b7b518b1228d2743963@syzkaller.appspotmail.com Acked-by: Neil Horman Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index feb2ca69827a..039fcb618c34 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -335,16 +335,14 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, if (len < sizeof (struct sockaddr)) return NULL; + if (!opt->pf->af_supported(addr->sa.sa_family, opt)) + return NULL; + /* V4 mapped address are really of AF_INET family */ if (addr->sa.sa_family == AF_INET6 && - ipv6_addr_v4mapped(&addr->v6.sin6_addr)) { - if (!opt->pf->af_supported(AF_INET, opt)) - return NULL; - } else { - /* Does this PF support this AF? */ - if (!opt->pf->af_supported(addr->sa.sa_family, opt)) - return NULL; - } + ipv6_addr_v4mapped(&addr->v6.sin6_addr) && + !opt->pf->af_supported(AF_INET, opt)) + return NULL; /* If we get this far, af is valid. */ af = sctp_get_af_specific(addr->sa.sa_family); -- cgit v1.2.3 From a5b1379afbfabf91e3a689e82ac619a7157336b3 Mon Sep 17 00:00:00 2001 From: Yuiko Oshino Date: Mon, 15 Jan 2018 13:24:28 -0500 Subject: lan78xx: Fix failure in USB Full Speed Fix initialize the uninitialized tx_qlen to an appropriate value when USB Full Speed is used. Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Yuiko Oshino Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 94c7804903c4..ec56ff29aac4 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2396,6 +2396,7 @@ static int lan78xx_reset(struct lan78xx_net *dev) buf = DEFAULT_BURST_CAP_SIZE / FS_USB_PKT_SIZE; dev->rx_urb_size = DEFAULT_BURST_CAP_SIZE; dev->rx_qlen = 4; + dev->tx_qlen = 4; } ret = lan78xx_write_reg(dev, BURST_CAP, buf); -- cgit v1.2.3 From 0d9c9f0f40ca262b67fc06a702b85f3976f5e1a1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Jan 2018 11:47:53 -0800 Subject: nfp: use the correct index for link speed table sts variable is holding link speed as well as state. We should be using ls to index into ls_to_ethtool. Fixes: 265aeb511bd5 ("nfp: add support for .get_link_ksettings()") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index 2801ecd09eab..6c02b2d6ba06 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -333,7 +333,7 @@ nfp_net_get_link_ksettings(struct net_device *netdev, ls >= ARRAY_SIZE(ls_to_ethtool)) return 0; - cmd->base.speed = ls_to_ethtool[sts]; + cmd->base.speed = ls_to_ethtool[ls]; cmd->base.duplex = DUPLEX_FULL; return 0; -- cgit v1.2.3 From 70eeff66c4696cee4076d6388b6bede5bd7ff71c Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 15 Jan 2018 12:24:49 -0800 Subject: qed: Fix potential use-after-free in qed_spq_post() We need to check if p_ent->comp_mode is QED_SPQ_MODE_EBLOCK before calling qed_spq_add_entry(). The test is fine is the mode is EBLOCK, but if it isn't then qed_spq_add_entry() might kfree(p_ent). Signed-off-by: Roland Dreier Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_spq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index be48d9abd001..3588081b2e27 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -776,6 +776,7 @@ int qed_spq_post(struct qed_hwfn *p_hwfn, int rc = 0; struct qed_spq *p_spq = p_hwfn ? p_hwfn->p_spq : NULL; bool b_ret_ent = true; + bool eblock; if (!p_hwfn) return -EINVAL; @@ -794,6 +795,11 @@ int qed_spq_post(struct qed_hwfn *p_hwfn, if (rc) goto spq_post_fail; + /* Check if entry is in block mode before qed_spq_add_entry, + * which might kfree p_ent. + */ + eblock = (p_ent->comp_mode == QED_SPQ_MODE_EBLOCK); + /* Add the request to the pending queue */ rc = qed_spq_add_entry(p_hwfn, p_ent, p_ent->priority); if (rc) @@ -811,7 +817,7 @@ int qed_spq_post(struct qed_hwfn *p_hwfn, spin_unlock_bh(&p_spq->lock); - if (p_ent->comp_mode == QED_SPQ_MODE_EBLOCK) { + if (eblock) { /* For entries in QED BLOCK mode, the completion code cannot * perform the necessary cleanup - if it did, we couldn't * access p_ent here to see whether it's successful or not. -- cgit v1.2.3 From 81d947e2b8dd2394586c3eaffdd2357797d3bf59 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 15 Jan 2018 23:12:09 +0100 Subject: net, sched: fix panic when updating miniq {b,q}stats While working on fixing another bug, I ran into the following panic on arm64 by simply attaching clsact qdisc, adding a filter and running traffic on ingress to it: [...] [ 178.188591] Unable to handle kernel read from unreadable memory at virtual address 810fb501f000 [ 178.197314] Mem abort info: [ 178.200121] ESR = 0x96000004 [ 178.203168] Exception class = DABT (current EL), IL = 32 bits [ 178.209095] SET = 0, FnV = 0 [ 178.212157] EA = 0, S1PTW = 0 [ 178.215288] Data abort info: [ 178.218175] ISV = 0, ISS = 0x00000004 [ 178.222019] CM = 0, WnR = 0 [ 178.224997] user pgtable: 4k pages, 48-bit VAs, pgd = 0000000023cb3f33 [ 178.231531] [0000810fb501f000] *pgd=0000000000000000 [ 178.236508] Internal error: Oops: 96000004 [#1] SMP [...] [ 178.311855] CPU: 73 PID: 2497 Comm: ping Tainted: G W 4.15.0-rc7+ #5 [ 178.319413] Hardware name: FOXCONN R2-1221R-A4/C2U4N_MB, BIOS G31FB18A 03/31/2017 [ 178.326887] pstate: 60400005 (nZCv daif +PAN -UAO) [ 178.331685] pc : __netif_receive_skb_core+0x49c/0xac8 [ 178.336728] lr : __netif_receive_skb+0x28/0x78 [ 178.341161] sp : ffff00002344b750 [ 178.344465] x29: ffff00002344b750 x28: ffff810fbdfd0580 [ 178.349769] x27: 0000000000000000 x26: ffff000009378000 [...] [ 178.418715] x1 : 0000000000000054 x0 : 0000000000000000 [ 178.424020] Process ping (pid: 2497, stack limit = 0x000000009f0a3ff4) [ 178.430537] Call trace: [ 178.432976] __netif_receive_skb_core+0x49c/0xac8 [ 178.437670] __netif_receive_skb+0x28/0x78 [ 178.441757] process_backlog+0x9c/0x160 [ 178.445584] net_rx_action+0x2f8/0x3f0 [...] Reason is that sch_ingress and sch_clsact are doing mini_qdisc_pair_init() which sets up miniq pointers to cpu_{b,q}stats from the underlying qdisc. Problem is that this cannot work since they are actually set up right after the qdisc ->init() callback in qdisc_create(), so first packet going into sch_handle_ingress() tries to call mini_qdisc_bstats_cpu_update() and we therefore panic. In order to fix this, allocation of {b,q}stats needs to happen before we call into ->init(). In net-next, there's already such option through commit d59f5ffa59d8 ("net: sched: a dflt qdisc may be used with per cpu stats"). However, the bug needs to be fixed in net still for 4.15. Thus, include these bits to reduce any merge churn and reuse the static_flags field to set TCQ_F_CPUSTATS, and remove the allocation from qdisc_create() since there is no other user left. Prashant Bhole ran into the same issue but for net-next, thus adding him below as well as co-author. Same issue was also reported by Sandipan Das when using bcc. Fixes: 46209401f8f6 ("net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath") Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2018-January/001190.html Reported-by: Sandipan Das Co-authored-by: Prashant Bhole Co-authored-by: John Fastabend Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 ++ net/sched/sch_api.c | 15 +-------------- net/sched/sch_generic.c | 18 +++++++++++++++++- net/sched/sch_ingress.c | 19 ++++--------------- 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 83a3e47d5845..becf86aa4ac6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -179,6 +179,7 @@ struct Qdisc_ops { const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; + unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -444,6 +445,7 @@ void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n, unsigned int len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops); +void qdisc_free(struct Qdisc *qdisc); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, u32 parentid); void __qdisc_calculate_pkt_len(struct sk_buff *skb, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0f1eab99ff4e..52529b7f8d96 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1063,17 +1063,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { - if (qdisc_is_percpu_stats(sch)) { - sch->cpu_bstats = - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); - if (!sch->cpu_bstats) - goto err_out4; - - sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); - if (!sch->cpu_qstats) - goto err_out4; - } - if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB]); if (IS_ERR(stab)) { @@ -1115,7 +1104,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, ops->destroy(sch); err_out3: dev_put(dev); - kfree((char *) sch - sch->padded); + qdisc_free(sch); err_out2: module_put(ops->owner); err_out: @@ -1123,8 +1112,6 @@ err_out: return NULL; err_out4: - free_percpu(sch->cpu_bstats); - free_percpu(sch->cpu_qstats); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 661c7144b53a..cac003fddf3e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -633,6 +633,19 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, qdisc_skb_head_init(&sch->q); spin_lock_init(&sch->q.lock); + if (ops->static_flags & TCQ_F_CPUSTATS) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto errout1; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) { + free_percpu(sch->cpu_bstats); + goto errout1; + } + } + spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); @@ -642,6 +655,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; + sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; @@ -649,6 +663,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, refcount_set(&sch->refcnt, 1); return sch; +errout1: + kfree(p); errout: return ERR_PTR(err); } @@ -698,7 +714,7 @@ void qdisc_reset(struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_reset); -static void qdisc_free(struct Qdisc *qdisc) +void qdisc_free(struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index fc1286f499c1..003e1b063447 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -66,7 +66,6 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - int err; net_inc_ingress_queue(); @@ -76,13 +75,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) q->block_info.chain_head_change = clsact_chain_head_change; q->block_info.chain_head_change_priv = &q->miniqp; - err = tcf_block_get_ext(&q->block, sch, &q->block_info); - if (err) - return err; - - sch->flags |= TCQ_F_CPUSTATS; - - return 0; + return tcf_block_get_ext(&q->block, sch, &q->block_info); } static void ingress_destroy(struct Qdisc *sch) @@ -121,6 +114,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = sizeof(struct ingress_sched_data), + .static_flags = TCQ_F_CPUSTATS, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -192,13 +186,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) q->egress_block_info.chain_head_change = clsact_chain_head_change; q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; - err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); - if (err) - return err; - - sch->flags |= TCQ_F_CPUSTATS; - - return 0; + return tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); } static void clsact_destroy(struct Qdisc *sch) @@ -225,6 +213,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .cl_ops = &clsact_class_ops, .id = "clsact", .priv_size = sizeof(struct clsact_sched_data), + .static_flags = TCQ_F_CPUSTATS, .init = clsact_init, .destroy = clsact_destroy, .dump = ingress_dump, -- cgit v1.2.3