From 77f88796cee819b9c4562b0b6b44691b3b7755b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Mar 2017 16:54:24 -0400 Subject: cgroup, kthread: close race window where new kthreads can be migrated to non-root cgroups Creation of a kthread goes through a couple interlocked stages between the kthread itself and its creator. Once the new kthread starts running, it initializes itself and wakes up the creator. The creator then can further configure the kthread and then let it start doing its job by waking it up. In this configuration-by-creator stage, the creator is the only one that can wake it up but the kthread is visible to userland. When altering the kthread's attributes from userland is allowed, this is fine; however, for cases where CPU affinity is critical, kthread_bind() is used to first disable affinity changes from userland and then set the affinity. This also prevents the kthread from being migrated into non-root cgroups as that can affect the CPU affinity and many other things. Unfortunately, the cgroup side of protection is racy. While the PF_NO_SETAFFINITY flag prevents further migrations, userland can win the race before the creator sets the flag with kthread_bind() and put the kthread in a non-root cgroup, which can lead to all sorts of problems including incorrect CPU affinity and starvation. This bug got triggered by userland which periodically tries to migrate all processes in the root cpuset cgroup to a non-root one. Per-cpu workqueue workers got caught while being created and ended up with incorrect CPU affinity breaking concurrency management and sometimes stalling workqueue execution. This patch adds task->no_cgroup_migration which prevents the task from being migrated by userland. kthreadd starts with the flag set making every child kthread start in the root cgroup with migration disallowed. The flag is cleared after the kthread finishes initialization by which time PF_NO_SETAFFINITY is set if the kthread should stay in the root cgroup. 
It'd be better to wait for the initialization instead of failing but I couldn't think of a way of implementing that without adding either a new PF flag, or sleeping and retrying from waiting side. Even if userland depends on changing cgroup membership of a kthread, it either has to be synchronized with kthread_create() or periodically repeat, so it's unlikely that this would break anything. v2: Switch to a simpler implementation using a new task_struct bit field suggested by Oleg. Signed-off-by: Tejun Heo Suggested-by: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Reported-and-debugged-by: Chris Mason Cc: stable@vger.kernel.org # v4.3+ (we can't close the race on < v4.3) Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 +++++---- kernel/kthread.c | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0125589c7428..638ef7568495 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2425,11 +2425,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_NO_SETAFFINITY and become - * trapped in a cpuset, or RT worker may be born in a cgroup - * with no rt_runtime allocated. Just say no. + * kthreads may acquire PF_NO_SETAFFINITY during initialization. + * If userland migrates such a kthread to a non-root cgroup, it can + * become trapped in a cpuset, or RT kthread may be born in a + * cgroup with no rt_runtime allocated. Just say no. 
*/ - if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { + if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out_unlock_rcu; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 2f26adea0f84..26db528c1d88 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include #include #include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -225,6 +226,7 @@ static int kthread(void *_create) ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { + cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } @@ -538,6 +540,7 @@ int kthreadd(void *unused) set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; + cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 7bf8222b9bd0ba867e18b7f4537b61ef2e92eee8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 3 Apr 2017 15:25:53 -0400 Subject: irq/affinity: Fix CPU spread for unbalanced nodes The irq_create_affinity_masks routine is responsible for assigning a number of interrupt vectors to CPUs. The optimal assignment will spread requested vectors to all CPUs, with the fewest CPUs sharing a vector. The algorithm may fail to assign some vectors to any CPUs if a node's CPU count is lower than the average number of vectors per node. These vectors are unusable and create a suboptimal spread. Recalculate the number of vectors to assign at each node iteration by using the remaining number of vectors and nodes to be assigned, not exceeding the number of CPUs in that node. This will guarantee that every CPU is assigned at least one vector. 
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Cc: linux-nvme@lists.infradead.org Link: http://lkml.kernel.org/r/1491247553-7603-1-git-send-email-keith.busch@intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4544b115f5eb..dc529116f7e6 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; + int n, nodes, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; @@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) goto done; } - /* Spread the vectors per node */ - vecs_per_node = affv / nodes; - /* Account for rounding errors */ - extra_vecs = affv - (nodes * vecs_per_node); - for_each_node_mask(n, nodemsk) { - int ncpus, v, vecs_to_assign = vecs_per_node; + int ncpus, v, vecs_to_assign, vecs_per_node; + + /* Spread the vectors per node */ + vecs_per_node = (affv - curvec) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); + vecs_to_assign = min(vecs_per_node, ncpus); + + /* Account for rounding errors */ + extra_vecs = ncpus - vecs_to_assign; for (v = 0; curvec < last_affv && v < vecs_to_assign; curvec++, v++) { @@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Account for extra vectors to compensate rounding errors */ if (extra_vecs) { cpus_per_vec++; - if 
(!--extra_vecs) - vecs_per_node++; + --extra_vecs; } irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } if (curvec >= last_affv) break; + --nodes; } done: -- cgit v1.2.3 From 264d509637d95f9404e52ced5003ad352e0f6a26 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 10 Apr 2017 11:16:59 -0400 Subject: audit: make sure we don't let the retry queue grow without bounds The retry queue is intended to provide a temporary buffer in the case of transient errors when communicating with auditd, it is not meant as a long life queue, that functionality is provided by the hold queue. This patch fixes a problem identified by Seth where the retry queue could grow uncontrollably if an auditd instance did not connect to the kernel to drain the queues. This commit fixes this by doing the following: * Make sure we always call auditd_reset() if we decide the connection with audit is really dead. There were some cases in kauditd_hold_skb() where we did not reset the connection, this patch relocates the reset calls to kauditd_thread() so all the error conditions are caught and the connection reset. As a side effect, this means we could move auditd_reset() and get rid of the forward definition at the top of kernel/audit.c. * We never checked the status of the auditd connection when processing the main audit queue which meant that the retry queue could grow unchecked. This patch adds a call to auditd_reset() after the main queue has been processed if auditd is not connected, the auditd_reset() call will make sure the retry and hold queues are correctly managed/flushed so that the retry queue remains reasonable. 
Cc: # 4.10.x-: 5b52330bbfe6 Reported-by: Seth Forshee Signed-off-by: Paul Moore --- kernel/audit.c | 67 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2f4964cfde0b..a871bf80fde1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -160,7 +160,6 @@ static LIST_HEAD(audit_freelist); /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; -static void kauditd_hold_skb(struct sk_buff *skb); /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ @@ -453,30 +452,6 @@ static void auditd_set(int pid, u32 portid, struct net *net) spin_unlock_irqrestore(&auditd_conn.lock, flags); } -/** - * auditd_reset - Disconnect the auditd connection - * - * Description: - * Break the auditd/kauditd connection and move all the queued records into the - * hold queue in case auditd reconnects. 
- */ -static void auditd_reset(void) -{ - struct sk_buff *skb; - - /* if it isn't already broken, break the connection */ - rcu_read_lock(); - if (auditd_conn.pid) - auditd_set(0, 0, NULL); - rcu_read_unlock(); - - /* flush all of the main and retry queues to the hold queue */ - while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); - while ((skb = skb_dequeue(&audit_queue))) - kauditd_hold_skb(skb); -} - /** * kauditd_print_skb - Print the audit record to the ring buffer * @skb: audit record @@ -505,9 +480,6 @@ static void kauditd_rehold_skb(struct sk_buff *skb) { /* put the record back in the queue at the same place */ skb_queue_head(&audit_hold_queue, skb); - - /* fail the auditd connection */ - auditd_reset(); } /** @@ -544,9 +516,6 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); kfree_skb(skb); - - /* fail the auditd connection */ - auditd_reset(); } /** @@ -566,6 +535,30 @@ static void kauditd_retry_skb(struct sk_buff *skb) skb_queue_tail(&audit_retry_queue, skb); } +/** + * auditd_reset - Disconnect the auditd connection + * + * Description: + * Break the auditd/kauditd connection and move all the queued records into the + * hold queue in case auditd reconnects. 
+ */ +static void auditd_reset(void) +{ + struct sk_buff *skb; + + /* if it isn't already broken, break the connection */ + rcu_read_lock(); + if (auditd_conn.pid) + auditd_set(0, 0, NULL); + rcu_read_unlock(); + + /* flush all of the main and retry queues to the hold queue */ + while ((skb = skb_dequeue(&audit_retry_queue))) + kauditd_hold_skb(skb); + while ((skb = skb_dequeue(&audit_queue))) + kauditd_hold_skb(skb); +} + /** * auditd_send_unicast_skb - Send a record via unicast to auditd * @skb: audit record @@ -758,6 +751,7 @@ static int kauditd_thread(void *dummy) NULL, kauditd_rehold_skb); if (rc < 0) { sk = NULL; + auditd_reset(); goto main_queue; } @@ -767,6 +761,7 @@ static int kauditd_thread(void *dummy) NULL, kauditd_hold_skb); if (rc < 0) { sk = NULL; + auditd_reset(); goto main_queue; } @@ -775,16 +770,18 @@ main_queue: * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the * multicast send and move the record to the retry queue */ - kauditd_send_queue(sk, portid, &audit_queue, 1, - kauditd_send_multicast_skb, - kauditd_retry_skb); + rc = kauditd_send_queue(sk, portid, &audit_queue, 1, + kauditd_send_multicast_skb, + kauditd_retry_skb); + if (sk == NULL || rc < 0) + auditd_reset(); + sk = NULL; /* drop our netns reference, no auditd sends past this line */ if (net) { put_net(net); net = NULL; } - sk = NULL; /* we have processed all the queues so wake everyone */ wake_up(&audit_backlog_wait); -- cgit v1.2.3 From bfb0b80db5f9dca5ac0a5fd0edb765ee555e5a8e Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 7 Apr 2017 16:51:55 +0800 Subject: cgroup: avoid attaching a cgroup root to two different superblocks Run this: touch file0 for ((; ;)) { mount -t cpuset xxx file0 } And this concurrently: touch file1 for ((; ;)) { mount -t cpuset xxx file1 } We'll trigger a warning like this: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 4675 at lib/percpu-refcount.c:317 
percpu_ref_kill_and_confirm+0x92/0xb0 percpu_ref_kill_and_confirm called more than once on css_release! CPU: 1 PID: 4675 Comm: mount Not tainted 4.11.0-rc5+ #5 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 Call Trace: dump_stack+0x63/0x84 __warn+0xd1/0xf0 warn_slowpath_fmt+0x5f/0x80 percpu_ref_kill_and_confirm+0x92/0xb0 cgroup_kill_sb+0x95/0xb0 deactivate_locked_super+0x43/0x70 deactivate_super+0x46/0x60 ... ---[ end trace a79f61c2a2633700 ]--- Here's a race: Thread A Thread B cgroup1_mount() # alloc a new cgroup root cgroup_setup_root() cgroup1_mount() # no sb yet, returns NULL kernfs_pin_sb() # but succeeds in getting the refcnt, # so re-use cgroup root percpu_ref_tryget_live() # alloc sb with cgroup root cgroup_do_mount() cgroup_kill_sb() # alloc another sb with same root cgroup_do_mount() cgroup_kill_sb() We end up using the same cgroup root for two different superblocks, so percpu_ref_kill() will be called twice on the same root when the two superblocks are destroyed. We should fix to make sure the superblock pinning is really successful. Cc: stable@vger.kernel.org # 3.16+ Reported-by: Dmitry Vyukov Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 1dc22f6b49f5..12e19f0636ea 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1146,7 +1146,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * path is super cold. Let's just sleep a bit and retry. 
*/ pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || + if (IS_ERR_OR_NULL(pinned_sb) || !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); if (!IS_ERR_OR_NULL(pinned_sb)) -- cgit v1.2.3 From 96a94cc5158859943b7e4e72ae69e572815f5413 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 11 Apr 2017 12:10:58 +0200 Subject: bpf: reference may_access_skb() from __bpf_prog_run() It took me quite some time to figure out how this was linked, so in order to save the next person the effort of finding it add a comment in __bpf_prog_run() that indicates what exactly determines that a program can access the ctx == skb. Signed-off-by: Johannes Berg Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f45827e205d3..b4f1cb0c5ac7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1162,12 +1162,12 @@ out: LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ off = IMM; load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are - * only appearing in the programs where ctx == - * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, - * internal BPF verifier will check that BPF_R6 == - * ctx. + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only + * appearing in the programs where ctx == skb + * (see may_access_skb() in the verifier). All programs + * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, + * bpf_convert_filter() saves it in BPF_R6, internal BPF + * verifier will check that BPF_R6 == ctx. 
* * BPF_ABS and BPF_IND are wrappers of function calls, * so they scratch BPF_R1-BPF_R5 registers, preserve -- cgit v1.2.3 From 3412386b531244f24a27c79ee003506a52a00848 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 13 Apr 2017 13:28:12 -0400 Subject: irq/affinity: Fix extra vecs calculation This fixes a math error calculating the extra_vecs. The error assumed only 1 cpu per vector, but the value needs to account for the actual number of cpus per vector in order to get the correct remainder for extra CPU assignment. Fixes: 7bf8222b9bd0 ("irq/affinity: Fix CPU spread for unbalanced nodes") Reported-by: Xiaolong Ye Signed-off-by: Keith Busch Link: http://lkml.kernel.org/r/1492104492-19943-1-git-send-email-keith.busch@intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index dc529116f7e6..d052947fe785 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -108,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_to_assign = min(vecs_per_node, ncpus); /* Account for rounding errors */ - extra_vecs = ncpus - vecs_to_assign; + extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); for (v = 0; curvec < last_affv && v < vecs_to_assign; curvec++, v++) { -- cgit v1.2.3 From 82cc4fc2e70ec5baeff8f776f2773abc8b2cc0ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 14 Apr 2017 17:45:45 -0400 Subject: ftrace: Fix removing of second function probe When two function probes are added to set_ftrace_filter, and then one of them is removed, the update to the function locations is not performed, and the record keeping of the function states is corrupted, which causes an ftrace_bug() to occur. This is easily reproducible by adding two probes, removing one, and then adding it back again. 
# cd /sys/kernel/debug/tracing # echo schedule:traceoff > set_ftrace_filter # echo do_IRQ:traceoff > set_ftrace_filter # echo \!do_IRQ:traceoff > /debug/tracing/set_ftrace_filter # echo do_IRQ:traceoff > set_ftrace_filter Causes: ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1098 at kernel/trace/ftrace.c:2369 ftrace_get_addr_curr+0x143/0x220 Modules linked in: [...] CPU: 2 PID: 1098 Comm: bash Not tainted 4.10.0-test+ #405 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v02.05 05/07/2012 Call Trace: dump_stack+0x68/0x9f __warn+0x111/0x130 ? trace_irq_work_interrupt+0xa0/0xa0 warn_slowpath_null+0x1d/0x20 ftrace_get_addr_curr+0x143/0x220 ? __fentry__+0x10/0x10 ftrace_replace_code+0xe3/0x4f0 ? ftrace_int3_handler+0x90/0x90 ? printk+0x99/0xb5 ? 0xffffffff81000000 ftrace_modify_all_code+0x97/0x110 arch_ftrace_update_code+0x10/0x20 ftrace_run_update_code+0x1c/0x60 ftrace_run_modify_code.isra.48.constprop.62+0x8e/0xd0 register_ftrace_function_probe+0x4b6/0x590 ? ftrace_startup+0x310/0x310 ? debug_lockdep_rcu_enabled.part.4+0x1a/0x30 ? update_stack_state+0x88/0x110 ? ftrace_regex_write.isra.43.part.44+0x1d3/0x320 ? preempt_count_sub+0x18/0xd0 ? mutex_lock_nested+0x104/0x800 ? ftrace_regex_write.isra.43.part.44+0x1d3/0x320 ? __unwind_start+0x1c0/0x1c0 ? _mutex_lock_nest_lock+0x800/0x800 ftrace_trace_probe_callback.isra.3+0xc0/0x130 ? func_set_flag+0xe0/0xe0 ? __lock_acquire+0x642/0x1790 ? __might_fault+0x1e/0x20 ? trace_get_user+0x398/0x470 ? strcmp+0x35/0x60 ftrace_trace_onoff_callback+0x48/0x70 ftrace_regex_write.isra.43.part.44+0x251/0x320 ? match_records+0x420/0x420 ftrace_filter_write+0x2b/0x30 __vfs_write+0xd7/0x330 ? do_loop_readv_writev+0x120/0x120 ? locks_remove_posix+0x90/0x2f0 ? do_lock_file_wait+0x160/0x160 ? __lock_is_held+0x93/0x100 ? rcu_read_lock_sched_held+0x5c/0xb0 ? preempt_count_sub+0x18/0xd0 ? __sb_start_write+0x10a/0x230 ? vfs_write+0x222/0x240 vfs_write+0xef/0x240 SyS_write+0xab/0x130 ? SyS_read+0x130/0x130 ? 
trace_hardirqs_on_caller+0x182/0x280 ? trace_hardirqs_on_thunk+0x1a/0x1c entry_SYSCALL_64_fastpath+0x18/0xad RIP: 0033:0x7fe61c157c30 RSP: 002b:00007ffe87890258 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: ffffffff8114a410 RCX: 00007fe61c157c30 RDX: 0000000000000010 RSI: 000055814798f5e0 RDI: 0000000000000001 RBP: ffff8800c9027f98 R08: 00007fe61c422740 R09: 00007fe61ca53700 R10: 0000000000000073 R11: 0000000000000246 R12: 0000558147a36400 R13: 00007ffe8788f160 R14: 0000000000000024 R15: 00007ffe8788f15c ? trace_hardirqs_off_caller+0xc0/0x110 ---[ end trace 99fa09b3d9869c2c ]--- Bad trampoline accounting at: ffffffff81cc3b00 (do_IRQ+0x0/0x150) Cc: stable@vger.kernel.org Fixes: 59df055f1991 ("ftrace: trace different functions with a different tracer") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b9691ee8f6c1..27bb2e61276e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3755,23 +3755,24 @@ static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) ftrace_probe_registered = 1; } -static void __disable_ftrace_function_probe(void) +static bool __disable_ftrace_function_probe(void) { int i; if (!ftrace_probe_registered) - return; + return false; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; if (hhd->first) - return; + return false; } /* no more funcs left */ ftrace_shutdown(&trace_probe_ops, 0); ftrace_probe_registered = 0; + return true; } @@ -3901,6 +3902,7 @@ static void __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { + struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; @@ -3912,6 +3914,7 @@ __unregister_ftrace_function_probe(char *glob, struct 
ftrace_probe_ops *ops, struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int i, ret; + bool disabled; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) func_g.search = NULL; @@ -3930,6 +3933,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&trace_probe_ops.func_hash->regex_lock); + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) /* Hmm, should report this somehow */ @@ -3967,12 +3974,17 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } } mutex_lock(&ftrace_lock); - __disable_ftrace_function_probe(); + disabled = __disable_ftrace_function_probe(); /* * Remove after the disable is called. Otherwise, if the last * probe is removed, a null hash means *all enabled*. */ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + /* still need to update the function call sites */ + if (ftrace_enabled && !disabled) + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + &old_hash_ops); synchronize_sched(); if (!ret) free_ftrace_hash_rcu(old_hash); -- cgit v1.2.3 From 330c418638612d7658b6314e6a244fcb5f7efac5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 16 Apr 2017 23:17:37 +0900 Subject: Revert "cgroup: avoid attaching a cgroup root to two different superblocks" This reverts commit bfb0b80db5f9dca5ac0a5fd0edb765ee555e5a8e. Andrei reports CRIU test hangs with the patch applied. The bug fixed by the patch isn't too likely to trigger in actual uses. Revert the patch for now. 
Signed-off-by: Tejun Heo Reported-by: Andrei Vagin Link: http://lkml.kernel.org/r/20170414232737.GC20350@outlook.office365.com --- kernel/cgroup/cgroup-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 12e19f0636ea..1dc22f6b49f5 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1146,7 +1146,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * path is super cold. Let's just sleep a bit and retry. */ pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR_OR_NULL(pinned_sb) || + if (IS_ERR(pinned_sb) || !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); if (!IS_ERR_OR_NULL(pinned_sb)) -- cgit v1.2.3