From e96a7705e7d3fef96aec9b590c63b2f6f7d2ba22 Mon Sep 17 00:00:00 2001
From: Xunlei Pang
Date: Thu, 23 Mar 2017 15:56:08 +0100
Subject: sched/rtmutex/deadline: Fix a PI crash for deadline tasks

A crash happened while I was playing with deadline PI rtmutex.

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
    IP: [] rt_mutex_get_top_task+0x1f/0x30
    PGD 232a75067 PUD 230947067 PMD 0
    Oops: 0000 [#1] SMP
    CPU: 1 PID: 10994 Comm: a.out Not tainted
    Call Trace:
    [] enqueue_task+0x2c/0x80
    [] activate_task+0x23/0x30
    [] pull_dl_task+0x1d5/0x260
    [] pre_schedule_dl+0x16/0x20
    [] __schedule+0xd3/0x900
    [] schedule+0x29/0x70
    [] __rt_mutex_slowlock+0x4b/0xc0
    [] rt_mutex_slowlock+0xd1/0x190
    [] rt_mutex_timed_lock+0x53/0x60
    [] futex_lock_pi.isra.18+0x28c/0x390
    [] do_futex+0x190/0x5b0
    [] SyS_futex+0x80/0x180

This happens because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi() are
only protected by pi_lock when operating on the pi waiters, while
rt_mutex_get_top_task() accesses them with the rq lock held but without
holding pi_lock.

To fix this, introduce a new "pi_top_task" pointer cached in task_struct,
and add a new rt_mutex_update_top_task() to update its value. It is called
from rt_mutex_setprio(), which holds both the owner's pi_lock and the rq
lock, so "pi_top_task" can then be safely accessed by enqueue_task_dl()
under the rq lock.

Originally-From: Peter Zijlstra
Signed-off-by: Xunlei Pang
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Steven Rostedt
Reviewed-by: Thomas Gleixner
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.157682758@infradead.org
Signed-off-by: Thomas Gleixner
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ab9f6ac099a7..e1f44ec701c8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3713,6 +3713,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		goto out_unlock;
 	}
 
+	rt_mutex_update_top_task(p);
+
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
--
cgit v1.2.3
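Note that the listing above is limited to kernel/sched/core.c, so the rtmutex-side
helper that fills the new cache is not shown here. As a minimal sketch of what such
a helper looks like -- assuming, per the commit message, that it simply snapshots
the current top waiter while the caller holds both the owner's pi_lock and the rq
lock; illustrative, not the verbatim hunk from kernel/locking/rtmutex.c:

/*
 * Illustrative sketch of the helper named in the commit message: cache
 * the current top PI waiter in the new task_struct::pi_top_task field.
 * The caller is expected to hold both p->pi_lock and the rq lock, so
 * the cached value can later be read under either lock (e.g. by
 * enqueue_task_dl() under the rq lock).
 */
void rt_mutex_update_top_task(struct task_struct *p)
{
	p->pi_top_task = rt_mutex_get_top_task(p);
}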
From acd58620e415aee4a43a808d7d2fd87259ee0001 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 23 Mar 2017 15:56:11 +0100
Subject: sched/rtmutex: Refactor rt_mutex_setprio()

With the introduction of SCHED_DEADLINE the whole notion that priority
is a single number is gone, therefore the @prio argument to
rt_mutex_setprio() doesn't make sense anymore. So rework the code to
pass a pi_task instead.

Note this also fixes a problem with pi_top_task caching; previously we
would not set the pointer (i.e. not call rt_mutex_update_top_task()) if
the priority didn't change, which could leave a stale pointer behind.

As for the XXX, I think it's fine to use pi_task->prio, because if it
differs from waiter->prio, a PI chain update is imminent.

Signed-off-by: Peter Zijlstra (Intel)
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.303827095@infradead.org
Signed-off-by: Thomas Gleixner
---
 kernel/sched/core.c | 66 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 13 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1f44ec701c8..37b152a782ae 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function);
 
 #ifdef CONFIG_RT_MUTEXES
 
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+	if (pi_task)
+		prio = min(prio, pi_task->prio);
+
+	return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+	struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+	return __rt_effective_prio(pi_task, prio);
+}
+
 /*
  * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
@@ -3682,18 +3697,42 @@ EXPORT_SYMBOL(default_wake_function);
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-	int oldprio, queued, running, queue_flag =
+	int prio, oldprio, queued, running, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	const struct sched_class *prev_class;
 	struct rq_flags rf;
 	struct rq *rq;
 
-	BUG_ON(prio > MAX_PRIO);
+	/* XXX used to be waiter->prio, not waiter->task->prio */
+	prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+	/*
+	 * If nothing changed; bail early.
+	 */
+	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+		return;
 
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
+	/*
+	 * Set under pi_lock && rq->lock, such that the value can be used under
+	 * either lock.
+	 *
+	 * Note that there is loads of trickery to make this pointer cache work
+	 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+	 * ensure a task is de-boosted (pi_task is set to NULL) before the
+	 * task is allowed to run again (and can exit). This ensures the pointer
+	 * points to a blocked task -- which guarantees the task is present.
+	 */
+	p->pi_top_task = pi_task;
+
+	/*
+	 * For FIFO/RR we only need to set prio, if that matches we're done.
+	 */
+	if (prio == p->prio && !dl_prio(prio))
+		goto out_unlock;
 
 	/*
 	 * Idle task boosting is a nono in general. There is one
@@ -3713,9 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		goto out_unlock;
 	}
 
-	rt_mutex_update_top_task(p);
-
-	trace_sched_pi_setprio(p, prio);
+	trace_sched_pi_setprio(p, prio); /* broken */
 	oldprio = p->prio;
 
 	if (oldprio == prio)
@@ -3739,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	 * running task
 	 */
 	if (dl_prio(prio)) {
-		struct task_struct *pi_task = rt_mutex_get_top_task(p);
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
@@ -3777,6 +3813,11 @@ out_unlock:
 	balance_callback(rq);
 	preempt_enable();
 }
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+	return prio;
+}
 #endif
 
 void set_user_nice(struct task_struct *p, long nice)
@@ -4023,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 	 * Keep a potential priority boosting if called from
 	 * sched_setscheduler().
 	 */
+	p->prio = normal_prio(p);
 	if (keep_boost)
-		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
-	else
-		p->prio = normal_prio(p);
+		p->prio = rt_effective_prio(p, p->prio);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -4313,7 +4353,7 @@ change:
 			 * the runqueue. This will be done when the task deboost
 			 * itself.
 			 */
-			new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+			new_effective_prio = rt_effective_prio(p, newprio);
 			if (new_effective_prio == oldprio)
 				queue_flags &= ~DEQUEUE_MOVE;
 		}
--
cgit v1.2.3
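The corresponding rtmutex-side changes sit outside kernel/sched/core.c and are not
shown in this view; with the new interface, the rtmutex code hands the scheduler
the donor task directly. A rough sketch of such a caller, assuming the existing
pi-waiter helpers from kernel/locking/rtmutex_common.h -- illustrative only, not a
quote of the missing hunk:

/*
 * Illustrative rtmutex-side caller of the reworked interface: pass the
 * top PI waiter's task as donor, or NULL when no waiters remain
 * (de-boost). rt_mutex_setprio() then derives the effective priority
 * from the donor itself rather than from a pre-computed prio number.
 */
static void rt_mutex_adjust_prio(struct task_struct *p)
{
	struct task_struct *pi_task = NULL;

	lockdep_assert_held(&p->pi_lock);

	if (task_has_pi_waiters(p))
		pi_task = task_top_pi_waiter(p)->task;

	rt_mutex_setprio(p, pi_task);
}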
From b91473ff6e979c0028f02f90e40c844959c736d8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 23 Mar 2017 15:56:12 +0100
Subject: sched,tracing: Update trace_sched_pi_setprio()

Pass the PI donor task, instead of a numerical priority.

Numerical priorities are not sufficient to describe state ever since
SCHED_DEADLINE.

Annotate all sched tracepoints that are currently broken; fixing them
will bork userspace. *hate*.

Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Steven Rostedt
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.353599881@infradead.org
Signed-off-by: Thomas Gleixner
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 37b152a782ae..0c443bbdaaca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3752,7 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		goto out_unlock;
 	}
 
-	trace_sched_pi_setprio(p, prio); /* broken */
+	trace_sched_pi_setprio(p, pi_task);
 	oldprio = p->prio;
 
 	if (oldprio == prio)
--
cgit v1.2.3
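The tracepoint prototype itself is changed in include/trace/events/sched.h, which
falls outside the kernel/sched/core.c-limited view above. A sketch of what the
updated event plausibly looks like, assuming the userspace-visible oldprio/newprio
fields are kept and newprio is now derived from the donor task in the same way as
__rt_effective_prio() in the previous patch -- illustrative, not the verbatim hunk:

/*
 * Sketch of the tracepoint side: take the donor task instead of a
 * numerical priority, but keep the event layout unchanged so existing
 * userspace parsers continue to work.
 */
TRACE_EVENT(sched_pi_setprio,

	TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),

	TP_ARGS(tsk, pi_task),

	TP_STRUCT__entry(
		__array( char,	comm,	TASK_COMM_LEN	)
		__field( pid_t,	pid			)
		__field( int,	oldprio			)
		__field( int,	newprio			)
	),

	TP_fast_assign(
		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
		__entry->pid		= tsk->pid;
		__entry->oldprio	= tsk->prio;
		__entry->newprio	= pi_task ?
				min(tsk->normal_prio, pi_task->prio) :
				tsk->normal_prio;
		/* XXX SCHED_DEADLINE bits missing */
	),

	TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
		  __entry->comm, __entry->pid,
		  __entry->oldprio, __entry->newprio)
);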