summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/arraymap.c3
-rw-r--r--kernel/bpf/core.c100
-rw-r--r--kernel/bpf/sockmap.c117
-rw-r--r--kernel/bpf/syscall.c23
-rw-r--r--kernel/compat.c1
-rw-r--r--kernel/events/ring_buffer.c7
-rw-r--r--kernel/events/uprobes.c7
-rw-r--r--kernel/kthread.c50
-rw-r--r--kernel/locking/rwsem-xadd.c19
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/locking/rwsem.h30
-rw-r--r--kernel/module.c5
-rw-r--r--kernel/sched/autogroup.c7
-rw-r--r--kernel/sched/core.c56
-rw-r--r--kernel/sched/cpufreq_schedutil.c16
-rw-r--r--kernel/sched/deadline.c4
-rw-r--r--kernel/sched/fair.c59
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h5
-rw-r--r--kernel/signal.c17
-rw-r--r--kernel/stop_machine.c19
-rw-r--r--kernel/time/clocksource.c63
-rw-r--r--kernel/time/tick-broadcast.c8
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/trace_events_filter.c3
-rw-r--r--kernel/trace/trace_events_hist.c12
-rw-r--r--kernel/trace/trace_stack.c2
-rw-r--r--kernel/trace/trace_uprobe.c35
-rw-r--r--kernel/tracepoint.c4
29 files changed, 398 insertions, 282 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 14750e7c5ee4..027107f4be53 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -476,7 +476,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
}
/* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -495,6 +495,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
+ .map_release_uref = bpf_fd_array_map_clear,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec39efb3..6ef6746a7871 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -218,47 +218,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
+ u32 curr, const bool probe_pass)
{
+ const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s64 imm = insn->imm;
+
+ if (curr < pos && curr + imm + 1 > pos)
+ imm += delta;
+ else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+ imm -= delta;
+ if (imm < imm_min || imm > imm_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->imm = imm;
+ return 0;
+}
+
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
+ u32 curr, const bool probe_pass)
+{
+ const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 off = insn->off;
+
+ if (curr < pos && curr + off + 1 > pos)
+ off += delta;
+ else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+ off -= delta;
+ if (off < off_min || off > off_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->off = off;
+ return 0;
+}
+
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
+ const bool probe_pass)
+{
+ u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
struct bpf_insn *insn = prog->insnsi;
- u32 i, insn_cnt = prog->len;
- bool pseudo_call;
- u8 code;
- int off;
+ int ret = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code;
+
+ /* In the probing pass we still operate on the original,
+ * unpatched image in order to check overflows before we
+ * do any other adjustments. Therefore skip the patchlet.
+ */
+ if (probe_pass && i == pos) {
+ i += delta + 1;
+ insn++;
+ }
code = insn->code;
- if (BPF_CLASS(code) != BPF_JMP)
- continue;
- if (BPF_OP(code) == BPF_EXIT)
+ if (BPF_CLASS(code) != BPF_JMP ||
+ BPF_OP(code) == BPF_EXIT)
continue;
+ /* Adjust offset of jmps if we cross patch boundaries. */
if (BPF_OP(code) == BPF_CALL) {
- if (insn->src_reg == BPF_PSEUDO_CALL)
- pseudo_call = true;
- else
+ if (insn->src_reg != BPF_PSEUDO_CALL)
continue;
+ ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
+ probe_pass);
} else {
- pseudo_call = false;
+ ret = bpf_adj_delta_to_off(insn, pos, delta, i,
+ probe_pass);
}
- off = pseudo_call ? insn->imm : insn->off;
-
- /* Adjust offset of jmps if we cross boundaries. */
- if (i < pos && i + off + 1 > pos)
- off += delta;
- else if (i > pos + delta && i + off + 1 <= pos + delta)
- off -= delta;
-
- if (pseudo_call)
- insn->imm = off;
- else
- insn->off = off;
+ if (ret)
+ break;
}
+
+ return ret;
}
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len)
{
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ const u32 cnt_max = S16_MAX;
struct bpf_prog *prog_adj;
/* Since our patchlet doesn't expand the image, we're done. */
@@ -269,6 +306,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
insn_adj_cnt = prog->len + insn_delta;
+ /* Reject anything that would potentially let the insn->off
+ * target overflow when we have excessive program expansions.
+ * We need to probe here before we do any reallocation where
+ * we afterwards may not fail anymore.
+ */
+ if (insn_adj_cnt > cnt_max &&
+ bpf_adj_branches(prog, off, insn_delta, true))
+ return NULL;
+
/* Several new instructions need to be inserted. Make room
* for them. Likely, there's no need for a new allocation as
* last page could have large enough tailroom.
@@ -294,7 +340,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
sizeof(*patch) * insn_rest);
memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
- bpf_adj_branches(prog_adj, off, insn_delta);
+ /* We are guaranteed to not fail at this point, otherwise
+ * the ship has sailed to reverse to the original state. An
+ * overflow cannot happen at this point.
+ */
+ BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
return prog_adj;
}
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a3b21385e947..95a84b2f10ce 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -43,6 +43,7 @@
#include <net/tcp.h>
#include <linux/ptr_ring.h>
#include <net/inet_common.h>
+#include <linux/sched/signal.h>
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -325,6 +326,9 @@ retry:
if (ret > 0) {
if (apply)
apply_bytes -= ret;
+
+ sg->offset += ret;
+ sg->length -= ret;
size -= ret;
offset += ret;
if (uncharge)
@@ -332,8 +336,6 @@ retry:
goto retry;
}
- sg->length = size;
- sg->offset = offset;
return ret;
}
@@ -391,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
} while (i != md->sg_end);
}
-static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+static void free_bytes_sg(struct sock *sk, int bytes,
+ struct sk_msg_buff *md, bool charge)
{
struct scatterlist *sg = md->sg_data;
int i = md->sg_start, free;
@@ -401,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
if (bytes < free) {
sg[i].length -= bytes;
sg[i].offset += bytes;
- sk_mem_uncharge(sk, bytes);
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
break;
}
- sk_mem_uncharge(sk, sg[i].length);
+ if (charge)
+ sk_mem_uncharge(sk, sg[i].length);
put_page(sg_page(&sg[i]));
bytes -= sg[i].length;
sg[i].length = 0;
@@ -416,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
if (i == MAX_SKB_FRAGS)
i = 0;
}
+ md->sg_start = i;
}
static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
@@ -523,8 +529,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
i = md->sg_start;
do {
- r->sg_data[i] = md->sg_data[i];
-
size = (apply && apply_bytes < md->sg_data[i].length) ?
apply_bytes : md->sg_data[i].length;
@@ -535,6 +539,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
}
sk_mem_charge(sk, size);
+ r->sg_data[i] = md->sg_data[i];
r->sg_data[i].length = size;
md->sg_data[i].length -= size;
md->sg_data[i].offset += size;
@@ -575,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
struct sk_msg_buff *md,
int flags)
{
+ bool ingress = !!(md->flags & BPF_F_INGRESS);
struct smap_psock *psock;
struct scatterlist *sg;
- int i, err, free = 0;
- bool ingress = !!(md->flags & BPF_F_INGRESS);
+ int err = 0;
sg = md->sg_data;
@@ -606,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
out_rcu:
rcu_read_unlock();
out:
- i = md->sg_start;
- while (sg[i].length) {
- free += sg[i].length;
- put_page(sg_page(&sg[i]));
- sg[i].length = 0;
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- }
- return free;
+ free_bytes_sg(NULL, send, md, false);
+ return err;
}
static inline void bpf_md_init(struct smap_psock *psock)
@@ -700,19 +697,26 @@ more_data:
err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
lock_sock(sk);
+ if (unlikely(err < 0)) {
+ free_start_sg(sk, m);
+ psock->sg_size = 0;
+ if (!cork)
+ *copied -= send;
+ } else {
+ psock->sg_size -= send;
+ }
+
if (cork) {
free_start_sg(sk, m);
+ psock->sg_size = 0;
kfree(m);
m = NULL;
+ err = 0;
}
- if (unlikely(err))
- *copied -= err;
- else
- psock->sg_size -= send;
break;
case __SK_DROP:
default:
- free_bytes_sg(sk, send, m);
+ free_bytes_sg(sk, send, m, true);
apply_bytes_dec(psock, send);
*copied -= send;
psock->sg_size -= send;
@@ -732,6 +736,26 @@ out_err:
return err;
}
+static int bpf_wait_data(struct sock *sk,
+ struct smap_psock *psk, int flags,
+ long timeo, int *err)
+{
+ int rc;
+
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ rc = sk_wait_event(sk, &timeo,
+ !list_empty(&psk->ingress) ||
+ !skb_queue_empty(&sk->sk_receive_queue),
+ &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ return rc;
+}
+
static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
@@ -755,6 +779,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
lock_sock(sk);
+bytes_ready:
while (copied != len) {
struct scatterlist *sg;
struct sk_msg_buff *md;
@@ -809,6 +834,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
}
+ if (!copied) {
+ long timeo;
+ int data;
+ int err = 0;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = bpf_wait_data(sk, psock, flags, timeo, &err);
+
+ if (data) {
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ return copied;
+ }
+ goto bytes_ready;
+ }
+
+ if (err)
+ copied = err;
+ }
+
release_sock(sk);
smap_release_sock(psock, sk);
return copied;
@@ -1656,11 +1703,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
* we increment the refcnt. If this is the case abort with an
* error.
*/
- verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+ verdict = bpf_prog_inc_not_zero(verdict);
if (IS_ERR(verdict))
return PTR_ERR(verdict);
- parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+ parse = bpf_prog_inc_not_zero(parse);
if (IS_ERR(parse)) {
bpf_prog_put(verdict);
return PTR_ERR(parse);
@@ -1668,12 +1715,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
}
if (tx_msg) {
- tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+ tx_msg = bpf_prog_inc_not_zero(tx_msg);
if (IS_ERR(tx_msg)) {
- if (verdict)
- bpf_prog_put(verdict);
- if (parse)
+ if (parse && verdict) {
bpf_prog_put(parse);
+ bpf_prog_put(verdict);
+ }
return PTR_ERR(tx_msg);
}
}
@@ -1758,10 +1805,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
out_free:
smap_release_sock(psock, sock);
out_progs:
- if (verdict)
- bpf_prog_put(verdict);
- if (parse)
+ if (parse && verdict) {
bpf_prog_put(parse);
+ bpf_prog_put(verdict);
+ }
if (tx_msg)
bpf_prog_put(tx_msg);
write_unlock_bh(&sock->sk_callback_lock);
@@ -1831,7 +1878,7 @@ static int sock_map_update_elem(struct bpf_map *map,
return err;
}
-static void sock_map_release(struct bpf_map *map, struct file *map_file)
+static void sock_map_release(struct bpf_map *map)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
struct bpf_prog *orig;
@@ -1855,7 +1902,7 @@ const struct bpf_map_ops sock_map_ops = {
.map_get_next_key = sock_map_get_next_key,
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
- .map_release = sock_map_release,
+ .map_release_uref = sock_map_release,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ca46df19c9a..016ef9025827 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -26,6 +26,7 @@
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
+#include <linux/nospec.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -102,12 +103,14 @@ const struct bpf_map_ops bpf_map_offload_ops = {
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
const struct bpf_map_ops *ops;
+ u32 type = attr->map_type;
struct bpf_map *map;
int err;
- if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
+ if (type >= ARRAY_SIZE(bpf_map_types))
return ERR_PTR(-EINVAL);
- ops = bpf_map_types[attr->map_type];
+ type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
+ ops = bpf_map_types[type];
if (!ops)
return ERR_PTR(-EINVAL);
@@ -122,7 +125,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
if (IS_ERR(map))
return map;
map->ops = ops;
- map->map_type = attr->map_type;
+ map->map_type = type;
return map;
}
@@ -257,8 +260,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic_dec_and_test(&map->usercnt)) {
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
- bpf_fd_array_map_clear(map);
+ if (map->ops->map_release_uref)
+ map->ops->map_release_uref(map);
}
}
@@ -871,11 +874,17 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
- if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
+ const struct bpf_prog_ops *ops;
+
+ if (type >= ARRAY_SIZE(bpf_prog_types))
+ return -EINVAL;
+ type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
+ ops = bpf_prog_types[type];
+ if (!ops)
return -EINVAL;
if (!bpf_prog_is_dev_bound(prog->aux))
- prog->aux->ops = bpf_prog_types[type];
+ prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
prog->type = type;
diff --git a/kernel/compat.c b/kernel/compat.c
index 6d21894806b4..92d8c98c0f57 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -34,6 +34,7 @@ int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp)
{
struct compat_timex tx32;
+ memset(txc, 0, sizeof(struct timex));
if (copy_from_user(&tx32, utp, sizeof(struct compat_timex)))
return -EFAULT;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6c6b3c48db71..1d8ca9ea9979 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/circ_buf.h>
#include <linux/poll.h>
+#include <linux/nospec.h>
#include "internal.h"
@@ -867,8 +868,10 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
return NULL;
/* AUX space */
- if (pgoff >= rb->aux_pgoff)
- return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+ if (pgoff >= rb->aux_pgoff) {
+ int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
+ return virt_to_page(rb->aux_pages[aux_pgoff]);
+ }
}
return __perf_mmap_to_page(rb, pgoff);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ce6848e46e94..1725b902983f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -491,7 +491,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
if (!uprobe)
return NULL;
- uprobe->inode = igrab(inode);
+ uprobe->inode = inode;
uprobe->offset = offset;
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
@@ -502,7 +502,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
if (cur_uprobe) {
kfree(uprobe);
uprobe = cur_uprobe;
- iput(inode);
}
return uprobe;
@@ -701,7 +700,6 @@ static void delete_uprobe(struct uprobe *uprobe)
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
- iput(uprobe->inode);
put_uprobe(uprobe);
}
@@ -873,7 +871,8 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
- * unregisters.
+ * unregisters. Caller of uprobe_register() is required to keep @inode
+ * (and the containing mount) referenced.
*
* Return errno if it cannot successully install probes
* else return 0 (success)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index cd50e99202b0..2017a39ab490 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -55,7 +55,6 @@ enum KTHREAD_BITS {
KTHREAD_IS_PER_CPU = 0,
KTHREAD_SHOULD_STOP,
KTHREAD_SHOULD_PARK,
- KTHREAD_IS_PARKED,
};
static inline void set_kthread_struct(void *kthread)
@@ -177,14 +176,12 @@ void *kthread_probe_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
- __set_current_state(TASK_PARKED);
- while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
- if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
- complete(&self->parked);
+ for (;;) {
+ set_current_state(TASK_PARKED);
+ if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
+ break;
schedule();
- __set_current_state(TASK_PARKED);
}
- clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
}
@@ -194,6 +191,11 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
+void kthread_park_complete(struct task_struct *k)
+{
+ complete(&to_kthread(k)->parked);
+}
+
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -450,22 +452,15 @@ void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = to_kthread(k);
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
- * We clear the IS_PARKED bit here as we don't wait
- * until the task has left the park code. So if we'd
- * park before that happens we'd see the IS_PARKED bit
- * which might be about to be cleared.
+ * Newly created kthread was parked when the CPU was offline.
+ * The binding was lost and we need to set it again.
*/
- if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- /*
- * Newly created kthread was parked when the CPU was offline.
- * The binding was lost and we need to set it again.
- */
- if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
- __kthread_bind(k, kthread->cpu, TASK_PARKED);
- wake_up_state(k, TASK_PARKED);
- }
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -488,12 +483,13 @@ int kthread_park(struct task_struct *k)
if (WARN_ON(k->flags & PF_EXITING))
return -ENOSYS;
- if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- if (k != current) {
- wake_up_process(k);
- wait_for_completion(&kthread->parked);
- }
+ if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
+ return -EBUSY;
+
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ wait_for_completion(&kthread->parked);
}
return 0;
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e795908f3607..a90336779375 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -352,16 +352,15 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
struct task_struct *owner;
bool ret = true;
+ BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
+
if (need_resched())
return false;
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner)) {
- /*
- * Don't spin if the rwsem is readers owned.
- */
- ret = !rwsem_owner_is_reader(owner);
+ if (!owner || !is_rwsem_owner_spinnable(owner)) {
+ ret = !owner; /* !owner is spinnable */
goto done;
}
@@ -382,11 +381,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner))
- goto out;
+ if (!is_rwsem_owner_spinnable(owner))
+ return false;
rcu_read_lock();
- while (sem->owner == owner) {
+ while (owner && (READ_ONCE(sem->owner) == owner)) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking sem->owner still matches owner, if that fails,
@@ -408,12 +407,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
cpu_relax();
}
rcu_read_unlock();
-out:
+
/*
* If there is a new owner or the owner is not set, we continue
* spinning.
*/
- return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
+ return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 30465a2f2b6c..bc1e507be9ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -221,5 +221,3 @@ void up_read_non_owner(struct rw_semaphore *sem)
EXPORT_SYMBOL(up_read_non_owner);
#endif
-
-
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a17cba8d94bb..b9d0e72aa80f 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,20 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* The owner field of the rw_semaphore structure will be set to
- * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
* the owner field when it unlocks. A reader, on the other hand, will
* not touch the owner field when it unlocks.
*
- * In essence, the owner field now has the following 3 states:
+ * In essence, the owner field now has the following 4 states:
* 1) 0
* - lock is free or the owner hasn't set the field yet
* 2) RWSEM_READER_OWNED
* - lock is currently or previously owned by readers (lock is free
* or not set by owner yet)
- * 3) Other non-zero value
- * - a writer owns the lock
+ * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well
+ * - lock is owned by an anonymous writer, so spinning on the lock
+ * owner should be disabled.
+ * 4) Other non-zero value
+ * - a writer owns the lock and other writers can spin on the lock owner.
*/
-#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0)
+#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED)
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
@@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
}
-static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+/*
+ * Return true if the a rwsem waiter can spin on the rwsem's owner
+ * and steal the lock, i.e. the lock is not anonymously owned.
+ * N.B. !owner is considered spinnable.
+ */
+static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
{
- return owner && owner != RWSEM_READER_OWNED;
+ return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
}
-static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+/*
+ * Return true if rwsem is owned by an anonymous writer or readers.
+ */
+static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
{
- return owner == RWSEM_READER_OWNED;
+ return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
}
#else
static inline void rwsem_set_owner(struct rw_semaphore *sem)
diff --git a/kernel/module.c b/kernel/module.c
index ce8066b88178..c9bea7f2b43e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3517,6 +3517,11 @@ static noinline int do_init_module(struct module *mod)
* walking this with preempt disabled. In all the failure paths, we
* call synchronize_sched(), but we don't want to slow down the success
* path, so use actual RCU here.
+ * Note that module_alloc() on most architectures creates W+X page
+ * mappings which won't be cleaned up until do_free_init() runs. Any
+ * code such as mark_rodata_ro() which depends on those mappings to
+ * be cleaned up needs to sync with the queued work - ie
+ * rcu_barrier_sched()
*/
call_rcu_sched(&freeinit->rcu, do_free_init);
mutex_unlock(&module_mutex);
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 6be6c575b6cd..2d4ff5353ded 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -2,6 +2,7 @@
/*
* Auto-group scheduling implementation:
*/
+#include <linux/nospec.h>
#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -209,7 +210,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
unsigned long shares;
- int err;
+ int err, idx;
if (nice < MIN_NICE || nice > MAX_NICE)
return -EINVAL;
@@ -227,7 +228,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
next = HZ / 10 + jiffies;
ag = autogroup_task_get(p);
- shares = scale_load(sched_prio_to_weight[nice + 20]);
+
+ idx = array_index_nospec(nice + 20, 40);
+ shares = scale_load(sched_prio_to_weight[idx]);
down_write(&ag->lock);
err = sched_group_set_shares(ag->tg, shares);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e10aaeebfcc..092f7c4de903 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,6 +7,9 @@
*/
#include "sched.h"
+#include <linux/kthread.h>
+#include <linux/nospec.h>
+
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -2718,20 +2721,28 @@ static struct rq *finish_task_switch(struct task_struct *prev)
membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
}
- if (unlikely(prev_state == TASK_DEAD)) {
- if (prev->sched_class->task_dead)
- prev->sched_class->task_dead(prev);
+ if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
+ switch (prev_state) {
+ case TASK_DEAD:
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
+
+ /* Task is done with its stack. */
+ put_task_stack(prev);
- /* Task is done with its stack. */
- put_task_stack(prev);
+ put_task_struct(prev);
+ break;
- put_task_struct(prev);
+ case TASK_PARKED:
+ kthread_park_complete(prev);
+ break;
+ }
}
tick_nohz_task_switch();
@@ -3498,23 +3509,8 @@ static void __sched notrace __schedule(bool preempt)
void __noreturn do_task_dead(void)
{
- /*
- * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
- * when the following two conditions become true.
- * - There is race condition of mmap_sem (It is acquired by
- * exit_mm()), and
- * - SMI occurs before setting TASK_RUNINNG.
- * (or hypervisor of virtual machine switches to other guest)
- * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
- *
- * To avoid it, we have to wait for releasing tsk->pi_lock which
- * is held by try_to_wake_up()
- */
- raw_spin_lock_irq(&current->pi_lock);
- raw_spin_unlock_irq(&current->pi_lock);
-
/* Causes final put_task_struct in finish_task_switch(): */
- __set_current_state(TASK_DEAD);
+ set_special_state(TASK_DEAD);
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
@@ -6928,11 +6924,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 nice)
{
unsigned long weight;
+ int idx;
if (nice < MIN_NICE || nice > MAX_NICE)
return -ERANGE;
- weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+ idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
+ idx = array_index_nospec(idx, 40);
+ weight = sched_prio_to_weight[idx];
+
return sched_group_set_shares(css_tg(css), scale_load(weight));
}
#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d2c6083304b4..e13df951aca7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -305,7 +305,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq) {
+ if (busy && next_f < sg_policy->next_freq &&
+ sg_policy->next_freq != UINT_MAX) {
next_f = sg_policy->next_freq;
/* Reset cached freq as next_freq has changed */
@@ -396,19 +397,6 @@ static void sugov_irq_work(struct irq_work *irq_work)
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- /*
- * For RT tasks, the schedutil governor shoots the frequency to maximum.
- * Special care must be taken to ensure that this kthread doesn't result
- * in the same behavior.
- *
- * This is (mostly) guaranteed by the work_in_progress flag. The flag is
- * updated only at the end of the sugov_work() function and before that
- * the schedutil governor rejects all other frequency scaling requests.
- *
- * There is a very rare case though, where the RT thread yields right
- * after the work_in_progress flag is cleared. The effects of that are
- * neglected for now.
- */
kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e7b3008b85bb..1356afd1eeb6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1117,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
* should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
* So, overflow is not an issue here.
*/
-u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
@@ -2731,8 +2731,6 @@ bool dl_cpu_busy(unsigned int cpu)
#endif
#ifdef CONFIG_SCHED_DEBUG
-extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
-
void print_dl_stats(struct seq_file *m, int cpu)
{
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54dc31e7ab9b..79f574dba096 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1854,7 +1854,6 @@ static int task_numa_migrate(struct task_struct *p)
static void numa_migrate_preferred(struct task_struct *p)
{
unsigned long interval = HZ;
- unsigned long numa_migrate_retry;
/* This task has no NUMA fault statistics yet */
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1862,18 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
/* Periodically retry migrating the task to the preferred node */
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
- numa_migrate_retry = jiffies + interval;
-
- /*
- * Check that the new retry threshold is after the current one. If
- * the retry is in the future, it implies that wake_affine has
- * temporarily asked NUMA balancing to backoff from placement.
- */
- if (numa_migrate_retry > p->numa_migrate_retry)
- return;
-
- /* Safe to try placing the task on the preferred node */
- p->numa_migrate_retry = numa_migrate_retry;
+ p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
@@ -5922,48 +5910,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}
-#ifdef CONFIG_NUMA_BALANCING
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
- unsigned long interval;
-
- if (!static_branch_likely(&sched_numa_balancing))
- return;
-
- /* If balancing has no preference then continue gathering data */
- if (p->numa_preferred_nid == -1)
- return;
-
- /*
- * If the wakeup is not affecting locality then it is neutral from
- * the perspective of NUMA balacing so continue gathering data.
- */
- if (cpu_to_node(prev_cpu) == cpu_to_node(target))
- return;
-
- /*
- * Temporarily prevent NUMA balancing trying to place waker/wakee after
- * wakee has been moved by wake_affine. This will potentially allow
- * related tasks to converge and update their data placement. The
- * 4 * numa_scan_period is to allow the two-pass filter to migrate
- * hot data to the wakers node.
- */
- interval = max(sysctl_numa_balancing_scan_delay,
- p->numa_scan_period << 2);
- p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-
- interval = max(sysctl_numa_balancing_scan_delay,
- current->numa_scan_period << 2);
- current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-}
-#else
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-}
-#endif
-
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
@@ -5979,7 +5925,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (target == nr_cpumask_bits)
return prev_cpu;
- update_wa_numa_placement(p, prev_cpu, target);
schedstat_inc(sd->ttwu_move_affine);
schedstat_inc(p->se.statistics.nr_wakeups_affine);
return target;
@@ -9847,6 +9792,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
+out:
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
@@ -9855,7 +9801,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
-out:
/* Move the next balance forward */
if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7aef6b4e885a..ef3c4e6f5345 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2701,8 +2701,6 @@ int sched_rr_handler(struct ctl_table *table, int write,
}
#ifdef CONFIG_SCHED_DEBUG
-extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
-
void print_rt_stats(struct seq_file *m, int cpu)
{
rt_rq_iter_t iter;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 15750c222ca2..1f0a4bc6a39d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2025,8 +2025,9 @@ extern bool sched_debug_enabled;
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
diff --git a/kernel/signal.c b/kernel/signal.c
index d4ccea599692..9c33163a6165 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1961,14 +1961,27 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
return;
}
+ set_special_state(TASK_TRACED);
+
/*
* We're committing to trapping. TRACED should be visible before
* TRAPPING is cleared; otherwise, the tracer might fail do_wait().
* Also, transition to TRACED and updates to ->jobctl should be
* atomic with respect to siglock and should be done after the arch
* hook as siglock is released and regrabbed across it.
+ *
+ * TRACER TRACEE
+ *
+ * ptrace_attach()
+ * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED)
+ * do_wait()
+ * set_current_state() smp_wmb();
+ * ptrace_do_wait()
+ * wait_task_stopped()
+ * task_stopped_code()
+ * [L] task_is_traced() [S] task_clear_jobctl_trapping();
*/
- set_current_state(TASK_TRACED);
+ smp_wmb();
current->last_siginfo = info;
current->exit_code = exit_code;
@@ -2176,7 +2189,7 @@ static bool do_signal_stop(int signr)
if (task_participate_group_stop(current))
notify = CLD_STOPPED;
- __set_current_state(TASK_STOPPED);
+ set_special_state(TASK_STOPPED);
spin_unlock_irq(&current->sighand->siglock);
/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b7591261652d..64c0291b579c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -21,6 +21,7 @@
#include <linux/smpboot.h>
#include <linux/atomic.h>
#include <linux/nmi.h>
+#include <linux/sched/wake_q.h>
/*
* Structure to determine completion condition and record errors. May
@@ -65,27 +66,31 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work)
+ struct cpu_stop_work *work,
+ struct wake_q_head *wakeq)
{
list_add_tail(&work->list, &stopper->works);
- wake_up_process(stopper->thread);
+ wake_q_add(wakeq, stopper->thread);
}
/* queue @work to @stopper. if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ DEFINE_WAKE_Q(wakeq);
unsigned long flags;
bool enabled;
spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
- __cpu_stop_queue_work(stopper, work);
+ __cpu_stop_queue_work(stopper, work, &wakeq);
else if (work->done)
cpu_stop_signal_done(work->done);
spin_unlock_irqrestore(&stopper->lock, flags);
+ wake_up_q(&wakeq);
+
return enabled;
}
@@ -229,6 +234,7 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
{
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+ DEFINE_WAKE_Q(wakeq);
int err;
retry:
spin_lock_irq(&stopper1->lock);
@@ -252,8 +258,8 @@ retry:
goto unlock;
err = 0;
- __cpu_stop_queue_work(stopper1, work1);
- __cpu_stop_queue_work(stopper2, work2);
+ __cpu_stop_queue_work(stopper1, work1, &wakeq);
+ __cpu_stop_queue_work(stopper2, work2, &wakeq);
unlock:
spin_unlock(&stopper2->lock);
spin_unlock_irq(&stopper1->lock);
@@ -263,6 +269,9 @@ unlock:
cpu_relax();
goto retry;
}
+
+ wake_up_q(&wakeq);
+
return err;
}
/**
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 0e974cface0b..84f37420fcf5 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -119,6 +119,16 @@ static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
+static void inline clocksource_watchdog_lock(unsigned long *flags)
+{
+ spin_lock_irqsave(&watchdog_lock, *flags);
+}
+
+static void inline clocksource_watchdog_unlock(unsigned long *flags)
+{
+ spin_unlock_irqrestore(&watchdog_lock, *flags);
+}
+
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
@@ -142,9 +152,19 @@ static void __clocksource_unstable(struct clocksource *cs)
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
+ /*
+ * If the clocksource is registered clocksource_watchdog_kthread() will
+ * re-rate and re-select.
+ */
+ if (list_empty(&cs->list)) {
+ cs->rating = 0;
+ return;
+ }
+
if (cs->mark_unstable)
cs->mark_unstable(cs);
+ /* kick clocksource_watchdog_kthread() */
if (finished_booting)
schedule_work(&watchdog_work);
}
@@ -153,10 +173,8 @@ static void __clocksource_unstable(struct clocksource *cs)
* clocksource_mark_unstable - mark clocksource unstable via watchdog
* @cs: clocksource to be marked unstable
*
- * This function is called instead of clocksource_change_rating from
- * cpu hotplug code to avoid a deadlock between the clocksource mutex
- * and the cpu hotplug mutex. It defers the update of the clocksource
- * to the watchdog thread.
+ * This function is called by the x86 TSC code to mark clocksources as unstable;
+ * it defers demotion and re-selection to a kthread.
*/
void clocksource_mark_unstable(struct clocksource *cs)
{
@@ -164,7 +182,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_lock_irqsave(&watchdog_lock, flags);
if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
- if (list_empty(&cs->wd_list))
+ if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
list_add(&cs->wd_list, &watchdog_list);
__clocksource_unstable(cs);
}
@@ -319,9 +337,8 @@ static void clocksource_resume_watchdog(void)
static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
- unsigned long flags;
+ INIT_LIST_HEAD(&cs->wd_list);
- spin_lock_irqsave(&watchdog_lock, flags);
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
/* cs is a clocksource to be watched. */
list_add(&cs->wd_list, &watchdog_list);
@@ -331,7 +348,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
- spin_unlock_irqrestore(&watchdog_lock, flags);
}
static void clocksource_select_watchdog(bool fallback)
@@ -373,9 +389,6 @@ static void clocksource_select_watchdog(bool fallback)
static void clocksource_dequeue_watchdog(struct clocksource *cs)
{
- unsigned long flags;
-
- spin_lock_irqsave(&watchdog_lock, flags);
if (cs != watchdog) {
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
/* cs is a watched clocksource. */
@@ -384,21 +397,19 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
clocksource_stop_watchdog();
}
}
- spin_unlock_irqrestore(&watchdog_lock, flags);
}
static int __clocksource_watchdog_kthread(void)
{
struct clocksource *cs, *tmp;
unsigned long flags;
- LIST_HEAD(unstable);
int select = 0;
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
list_del_init(&cs->wd_list);
- list_add(&cs->wd_list, &unstable);
+ __clocksource_change_rating(cs, 0);
select = 1;
}
if (cs->flags & CLOCK_SOURCE_RESELECT) {
@@ -410,11 +421,6 @@ static int __clocksource_watchdog_kthread(void)
clocksource_stop_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
- /* Needs to be done outside of watchdog lock */
- list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
- list_del_init(&cs->wd_list);
- __clocksource_change_rating(cs, 0);
- }
return select;
}
@@ -447,6 +453,9 @@ static inline int __clocksource_watchdog_kthread(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
void clocksource_mark_unstable(struct clocksource *cs) { }
+static void inline clocksource_watchdog_lock(unsigned long *flags) { }
+static void inline clocksource_watchdog_unlock(unsigned long *flags) { }
+
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
/**
@@ -779,14 +788,19 @@ EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
*/
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
+ unsigned long flags;
/* Initialize mult/shift and max_idle_ns */
__clocksource_update_freq_scale(cs, scale, freq);
/* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
+
+ clocksource_watchdog_lock(&flags);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
+ clocksource_watchdog_unlock(&flags);
+
clocksource_select();
clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
@@ -808,8 +822,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
*/
void clocksource_change_rating(struct clocksource *cs, int rating)
{
+ unsigned long flags;
+
mutex_lock(&clocksource_mutex);
+ clocksource_watchdog_lock(&flags);
__clocksource_change_rating(cs, rating);
+ clocksource_watchdog_unlock(&flags);
+
clocksource_select();
clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
@@ -821,6 +840,8 @@ EXPORT_SYMBOL(clocksource_change_rating);
*/
static int clocksource_unbind(struct clocksource *cs)
{
+ unsigned long flags;
+
if (clocksource_is_watchdog(cs)) {
/* Select and try to install a replacement watchdog. */
clocksource_select_watchdog(true);
@@ -834,8 +855,12 @@ static int clocksource_unbind(struct clocksource *cs)
if (curr_clocksource == cs)
return -EBUSY;
}
+
+ clocksource_watchdog_lock(&flags);
clocksource_dequeue_watchdog(cs);
list_del_init(&cs->list);
+ clocksource_watchdog_unlock(&flags);
+
return 0;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b398c2ea69b2..aa2094d5dd27 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
now = ktime_get();
/* Find all expired events */
for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
+ /*
+ * Required for !SMP because for_each_cpu() reports
+ * unconditionally CPU0 as set on UP kernels.
+ */
+ if (!IS_ENABLED(CONFIG_SMP) &&
+ cpumask_empty(tick_broadcast_oneshot_mask))
+ break;
+
td = &per_cpu(tick_cpu_device, cpu);
if (td->evtdev->next_event <= now) {
cpumask_set_cpu(cpu, tmpmask);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 16bbf062018f..8d83bcf9ef69 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5514,10 +5514,10 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- trace_create_file("set_graph_function", 0444, d_tracer,
+ trace_create_file("set_graph_function", 0644, d_tracer,
NULL,
&ftrace_graph_fops);
- trace_create_file("set_graph_notrace", 0444, d_tracer,
+ trace_create_file("set_graph_notrace", 0644, d_tracer,
NULL,
&ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1f951b3df60c..7d306b74230f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -762,6 +762,9 @@ static int regex_match_full(char *str, struct regex *r, int len)
static int regex_match_front(char *str, struct regex *r, int len)
{
+ if (len < r->len)
+ return 0;
+
if (strncmp(str, r->pattern, r->len) == 0)
return 1;
return 0;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0d7b3ffbecc2..b9061ed59bbd 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2466,6 +2466,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
else if (strcmp(modifier, "usecs") == 0)
*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
else {
+ hist_err("Invalid field modifier: ", modifier);
field = ERR_PTR(-EINVAL);
goto out;
}
@@ -2481,6 +2482,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
else {
field = trace_find_event_field(file->event_call, field_name);
if (!field || !field->size) {
+ hist_err("Couldn't find field: ", field_name);
field = ERR_PTR(-EINVAL);
goto out;
}
@@ -4913,6 +4915,16 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
seq_printf(m, "%s", field_name);
} else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
seq_puts(m, "common_timestamp");
+
+ if (hist_field->flags) {
+ if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
+ !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
+ const char *flags = get_hist_field_flags(hist_field);
+
+ if (flags)
+ seq_printf(m, ".%s", flags);
+ }
+ }
}
static int event_hist_trigger_print(struct seq_file *m,
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3c7bfc4bf5e9..4237eba4ef20 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -472,7 +472,7 @@ static __init int stack_trace_init(void)
NULL, &stack_trace_fops);
#ifdef CONFIG_DYNAMIC_FTRACE
- trace_create_file("stack_trace_filter", 0444, d_tracer,
+ trace_create_file("stack_trace_filter", 0644, d_tracer,
&trace_ops, &stack_trace_filter_fops);
#endif
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 34fd0e0ec51d..ac892878dbe6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -55,6 +55,7 @@ struct trace_uprobe {
struct list_head list;
struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
+ struct path path;
struct inode *inode;
char *filename;
unsigned long offset;
@@ -289,7 +290,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
for (i = 0; i < tu->tp.nr_args; i++)
traceprobe_free_probe_arg(&tu->tp.args[i]);
- iput(tu->inode);
+ path_put(&tu->path);
kfree(tu->tp.call.class->system);
kfree(tu->tp.call.name);
kfree(tu->filename);
@@ -363,7 +364,6 @@ end:
static int create_trace_uprobe(int argc, char **argv)
{
struct trace_uprobe *tu;
- struct inode *inode;
char *arg, *event, *group, *filename;
char buf[MAX_EVENT_NAME_LEN];
struct path path;
@@ -371,7 +371,6 @@ static int create_trace_uprobe(int argc, char **argv)
bool is_delete, is_return;
int i, ret;
- inode = NULL;
ret = 0;
is_delete = false;
is_return = false;
@@ -437,21 +436,16 @@ static int create_trace_uprobe(int argc, char **argv)
}
/* Find the last occurrence, in case the path contains ':' too. */
arg = strrchr(argv[1], ':');
- if (!arg) {
- ret = -EINVAL;
- goto fail_address_parse;
- }
+ if (!arg)
+ return -EINVAL;
*arg++ = '\0';
filename = argv[1];
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
- goto fail_address_parse;
-
- inode = igrab(d_real_inode(path.dentry));
- path_put(&path);
+ return ret;
- if (!inode || !S_ISREG(inode->i_mode)) {
+ if (!d_is_reg(path.dentry)) {
ret = -EINVAL;
goto fail_address_parse;
}
@@ -490,7 +484,7 @@ static int create_trace_uprobe(int argc, char **argv)
goto fail_address_parse;
}
tu->offset = offset;
- tu->inode = inode;
+ tu->path = path;
tu->filename = kstrdup(filename, GFP_KERNEL);
if (!tu->filename) {
@@ -558,7 +552,7 @@ error:
return ret;
fail_address_parse:
- iput(inode);
+ path_put(&path);
pr_info("Failed to parse address or file.\n");
@@ -922,6 +916,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
goto err_flags;
tu->consumer.filter = filter;
+ tu->inode = d_real_inode(tu->path.dentry);
ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
if (ret)
goto err_buffer;
@@ -967,6 +962,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ tu->inode = NULL;
tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
uprobe_buffer_disable();
@@ -1337,7 +1333,6 @@ struct trace_event_call *
create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
{
struct trace_uprobe *tu;
- struct inode *inode;
struct path path;
int ret;
@@ -1345,11 +1340,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
if (ret)
return ERR_PTR(ret);
- inode = igrab(d_inode(path.dentry));
- path_put(&path);
-
- if (!inode || !S_ISREG(inode->i_mode)) {
- iput(inode);
+ if (!d_is_reg(path.dentry)) {
+ path_put(&path);
return ERR_PTR(-EINVAL);
}
@@ -1364,11 +1356,12 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
if (IS_ERR(tu)) {
pr_info("Failed to allocate trace_uprobe.(%d)\n",
(int)PTR_ERR(tu));
+ path_put(&path);
return ERR_CAST(tu);
}
tu->offset = offs;
- tu->inode = inode;
+ tu->path = path;
tu->filename = kstrdup(name, GFP_KERNEL);
init_trace_event_call(tu, &tu->tp.call);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 671b13457387..1e37da2e0c25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
@@ -239,7 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_remove(&tp_funcs, func);
if (IS_ERR(old)) {
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}