diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit_watch.c | 4 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 3 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 4 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 70 | ||||
-rw-r--r-- | kernel/cpu.c | 2 | ||||
-rw-r--r-- | kernel/events/core.c | 38 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 6 | ||||
-rw-r--r-- | kernel/exit.c | 1 | ||||
-rw-r--r-- | kernel/fork.c | 4 | ||||
-rw-r--r-- | kernel/irq/manage.c | 5 | ||||
-rw-r--r-- | kernel/kcov.c | 9 | ||||
-rw-r--r-- | kernel/locking/lockdep_internals.h | 20 | ||||
-rw-r--r-- | kernel/locking/qspinlock_stat.h | 12 | ||||
-rw-r--r-- | kernel/power/suspend.c | 4 | ||||
-rw-r--r-- | kernel/power/suspend_test.c | 4 | ||||
-rw-r--r-- | kernel/printk/printk.c | 30 | ||||
-rw-r--r-- | kernel/ptrace.c | 16 | ||||
-rw-r--r-- | kernel/sched/auto_group.c | 36 | ||||
-rw-r--r-- | kernel/sched/core.c | 28 | ||||
-rw-r--r-- | kernel/sched/fair.c | 23 | ||||
-rw-r--r-- | kernel/sched/wait.c | 10 | ||||
-rw-r--r-- | kernel/softirq.c | 2 | ||||
-rw-r--r-- | kernel/taskstats.c | 6 | ||||
-rw-r--r-- | kernel/time/alarmtimer.c | 2 | ||||
-rw-r--r-- | kernel/time/timer.c | 74 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 24 |
26 files changed, 282 insertions, 155 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..99401efef7ac 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -548,8 +548,8 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) exe_file = get_task_exe_file(tsk); if (!exe_file) return 0; - ino = exe_file->f_inode->i_ino; - dev = exe_file->f_inode->i_sb->s_dev; + ino = file_inode(exe_file)->i_ino; + dev = file_inode(exe_file)->i_sb->s_dev; fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 570eeca7bdfa..ad1bc67aff1b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab_elem_free(htab, l); + if (l->state != HTAB_EXTRA_ELEM_USED) + htab_elem_free(htab, l); } } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..237f3d6a7ddc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr) err = bpf_map_charge_memlock(map); if (err) - goto free_map; + goto free_map_nouncharge; err = bpf_map_new_fd(map); if (err < 0) @@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr) return err; free_map: + bpf_map_uncharge_memlock(map); +free_map_nouncharge: map->ops->map_free(map); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..6a936159c6e0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -216,8 +216,8 @@ static void print_verifier_state(struct bpf_verifier_state *state) reg->map_ptr->key_size, reg->map_ptr->value_size); if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%llu", - (unsigned long long)reg->min_value); + verbose(",min_value=%lld", + (long long)reg->min_value); if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); @@ -758,7 +758,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if ((s64)reg->min_value < 0) { + if (reg->min_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -1468,7 +1468,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg) { if (reg->max_value > BPF_REGISTER_MAX_RANGE) reg->max_value = BPF_REGISTER_MAX_RANGE; - if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) + if (reg->min_value < BPF_REGISTER_MIN_RANGE || + reg->min_value > BPF_REGISTER_MAX_RANGE) reg->min_value = BPF_REGISTER_MIN_RANGE; } @@ -1476,7 +1477,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; - u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; + s64 min_val = BPF_REGISTER_MIN_RANGE; + u64 max_val = BPF_REGISTER_MAX_RANGE; bool min_set = false, max_set = false; u8 opcode = BPF_OP(insn->code); @@ -1512,22 +1514,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, return; } + /* If one of our values was at the end of our ranges then we can't just + * do our normal operations to the register, we need to set the values + * to the min/max since they are undefined. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + switch (opcode) { case BPF_ADD: - dst_reg->min_value += min_val; - dst_reg->max_value += max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value += min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value += max_val; break; case BPF_SUB: - dst_reg->min_value -= min_val; - dst_reg->max_value -= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value -= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value -= max_val; break; case BPF_MUL: - dst_reg->min_value *= min_val; - dst_reg->max_value *= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value *= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value *= max_val; break; case BPF_AND: - /* & is special since it could end up with 0 bits set. */ - dst_reg->min_value &= min_val; + /* Disallow AND'ing of negative numbers, ain't nobody got time + * for that. Otherwise the minimum is 0 and the max is the max + * value we could AND against. + */ + if (min_val < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = 0; dst_reg->max_value = max_val; break; case BPF_LSH: @@ -1537,24 +1560,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, */ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value <<= min_val; if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - else + else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value <<= max_val; break; case BPF_RSH: - dst_reg->min_value >>= min_val; - dst_reg->max_value >>= max_val; - break; - case BPF_MOD: - /* % is special since it is an unsigned modulus, so the floor - * will always be 0. + /* RSH by a negative number is undefined, and the BPF_RSH is an + * unsigned shift, so make the appropriate casts. */ - dst_reg->min_value = 0; - dst_reg->max_value = max_val - 1; + if (min_val < 0 || dst_reg->min_value < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = + (u64)(dst_reg->min_value) >> min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value >>= max_val; break; default: reset_reg_range_values(regs, insn->dst_reg); diff --git a/kernel/cpu.c b/kernel/cpu.c index 5df20d6d1520..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,7 +228,7 @@ static struct { .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), #ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "cpu_hotplug.lock" }, + .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), #endif }; diff --git a/kernel/events/core.c b/kernel/events/core.c index c6e47e97b33f..5134a1c17186 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); + + /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */ + if (perf_cgroup_from_task(current, ctx) != event->cgrp) { + /* + * We are removing the last cpu event in this context. + * If that event is not active in this cpu, cpuctx->cgrp + * should've been cleared by perf_cgroup_switch. + */ + WARN_ON_ONCE(!add && cpuctx->cgrp); + return; + } cpuctx->cgrp = add ? event->cgrp : NULL; } @@ -1960,6 +1971,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -6684,7 +6701,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { - if (filter->inode != file->f_inode) + if (filter->inode != file_inode(file)) return false; if (filter->offset > offset + size) @@ -7075,8 +7092,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); @@ -8012,6 +8029,7 @@ restart: * if <size> is not specified, the range is treated as a single address. */ enum { + IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, @@ -8035,6 +8053,7 @@ static const match_table_t if_tokens = { { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, + { IF_ACT_NONE, NULL }, }; /* @@ -8855,7 +8874,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { + int remove_device; + mutex_lock(&pmus_lock); + remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); mutex_unlock(&pmus_lock); @@ -8869,10 +8891,12 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (remove_device) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } free_pmu_context(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45ebbe3..3076f3089919 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -836,6 +836,7 @@ void __noreturn do_exit(long code) */ perf_event_exit_task(tsk); + sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..997ac1d584f7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,9 @@ static void account_kernel_stack(struct task_struct *tsk, int account) static void release_task_stack(struct task_struct *tsk) { + if (WARN_ON(tsk->state != TASK_DEAD)) + return; /* Better to leak the stack than to free prematurely */ + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); @@ -1862,6 +1865,7 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + p->state = TASK_DEAD; put_task_stack(p); free_task(p); fork_out: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0c5f1a5db654..6b669593e7eb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq) irq_put_desc_unlock(desc, flags); return 0; } +EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* @@ -1340,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; - unsigned int omsk = irq_settings_get_trigger_mask(desc); + unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + irq, omsk, nmsk); } *old_ptr = new; diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..30e6d05aa5a9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. + * The checks for whether we are in an interrupt are open-coded, because + * 1. We can't use in_interrupt() here, since it also returns true + * when we are inside local_bh_disable() section. + * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), + * since that leads to slower generated code (three separate tests, + * one for each of the flags). */ - if (!t || in_interrupt()) + if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET + | NMI_MASK))) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 51c4b24b6328..c2b88490d857 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -46,6 +46,14 @@ enum { (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) /* + * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * .data and .bss to fit in required 32MB limit for the kernel. With + * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * So, reduce the static allocations for lockdeps related structures so that + * everything fits in current required size limit. + */ +#ifdef CONFIG_PROVE_LOCKING_SMALL +/* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. * @@ -54,18 +62,24 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ +#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_STACK_TRACE_ENTRIES 262144UL +#else #define MAX_LOCKDEP_ENTRIES 32768UL #define MAX_LOCKDEP_CHAINS_BITS 16 -#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#endif + +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index eb0a599fcf58..e852be4851fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -108,11 +108,7 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - counter = (long)(file->f_inode->i_private); + counter = (long)file_inode(file)->i_private; if (counter >= qstat_num) return -EBADF; @@ -177,11 +173,7 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + if ((long)file_inode(file)->i_private != qstat_reset_cnts) return count; for_each_possible_cpu(cpu) { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1e7f5da648d9..6ccb08f57fcb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("PM: Syncing filesystems ... "); sys_sync(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void) /* RTCs have initialized by now too ... can we use one? */ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); return 0; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d5e397315473..f6bda15396df 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,17 +253,6 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); -#ifdef CONFIG_OF -static bool of_specified_console; - -void console_set_by_of(void) -{ - of_specified_console = true; -} -#else -# define of_specified_console false -#endif - /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -759,7 +748,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; buf[len] = '\0'; - if (copy_from_iter(buf, len, from) != len) { + if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } @@ -794,8 +783,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return ret; } -static void cont_flush(void); - static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -811,7 +798,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -874,7 +860,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return -ESPIPE; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); switch (whence) { case SEEK_SET: /* the first record */ @@ -913,7 +898,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -1300,7 +1284,6 @@ static int syslog_print(char __user *buf, int size) size_t skip; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1360,7 +1343,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return -ENOMEM; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (buf) { u64 next_seq; u64 seq; @@ -1522,7 +1504,6 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1769,6 +1750,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c cont_flush(); } + /* Skip empty continuation lines that couldn't be added - they just flush */ + if (!text_len && (lflags & LOG_CONT)) + return 0; + /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { if (cont_add(facility, level, lflags, text, text_len)) @@ -2653,7 +2638,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0 && !of_specified_console) { + if (preferred_console < 0) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -3035,7 +3020,6 @@ void kmsg_dump(enum kmsg_dump_reason reason) dumper->active = true; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; @@ -3126,7 +3110,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, bool ret; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -3169,7 +3152,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); + ret = access_process_vm(child, addr, &word, sizeof(word), + FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else @@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index a5d966cb8891..f1c8fd566246 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). + * + * However, there is no way sched_autogroup_exit_task() could tell us + * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. */ if (p->flags & PF_EXITING) return false; @@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +void sched_autogroup_exit_task(struct task_struct *p) +{ + /* + * We are going to call exit_notify() and autogroup_move_group() can't + * see this thread after that: we can no longer use signal->autogroup. + * See the PF_EXITING check in task_wants_autogroup(). + */ + sched_move_task(p); +} + static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + * + * If an exiting thread was already removed from thread list we rely on + * sched_autogroup_exit_task(). + */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..154fd689fe02 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5192,21 +5192,14 @@ void sched_show_task(struct task_struct *p) int ppid; unsigned long state = p->state; + if (!try_get_task_stack(p)) + return; if (state) state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else if (state == TASK_RUNNING) printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -5221,6 +5214,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); show_stack(p, NULL); + put_task_stack(p); } void show_state_filter(unsigned long state_filter) @@ -7515,11 +7509,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2d4ad72f8f3c..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway @@ -5471,13 +5478,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { - struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - u64 avg_idle = this_rq()->avg_idle; - u64 avg_cost = this_sd->avg_scan_cost; + struct sched_domain *this_sd; + u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; int cpu, wrap; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + + avg_cost = this_sd->avg_scan_cost; + /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. @@ -8827,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8842,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/kernel/softirq.c b/kernel/softirq.c index 1bf81ef91375..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); const char * const softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..cbb387a265db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { +/* + * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. + * Make sure they are always aligned. + */ +static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) static int alarm_timer_create(struct k_itimer *new_timer) { enum alarmtimer_type type; - struct alarm_base *base; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return -EPERM; type = clock2alarm(new_timer->it_clock); - base = &alarm_bases[type]; alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d47980a1bc4..c611c47de884 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags) #ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { #ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) @@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { + unsigned long jnow = READ_ONCE(jiffies); + /* * We only forward the base when it's idle and we have a delta between * base clock and jiffies. */ - if (!base->is_idle || (long) (jiffies - base->clk) < 2) + if (!base->is_idle || (long) (jnow - base->clk) < 2) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jiffies)) - base->clk = jiffies; + if (time_after(base->next_expiry, jnow)) + base->clk = jnow; else base->clk = base->next_expiry; } #else static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { return get_timer_this_cpu_base(tflags); } @@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { } #endif -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - struct timer_base *target = __get_target_base(base, tflags); - - forward_timer_base(target); - return target; -} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, { for (;;) { struct timer_base *base; - u32 tf = timer->flags; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); @@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long clk = 0, flags; int ret = 0; + BUG_ON(!timer->function); + /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the @@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* - * Take the current timer_jiffies of base, but without holding - * the lock! + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. */ - base = get_timer_base(timer->flags); - clk = base->clk; + base = lock_timer_base(timer, &flags); + clk = base->clk; idx = calc_wheel_index(expires, clk); /* @@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) */ if (idx == timer_get_idx(timer)) { timer->expires = expires; - return 1; + ret = 1; + goto out_unlock; } + } else { + base = lock_timer_base(timer, &flags); } timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, false); if (!ret && pending_only) @@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } + /* Try to forward a stale timer base clock */ + forward_timer_base(base); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance - * between calculating 'idx' and taking the lock, only enqueue_timer() - * and trigger_dyntick_cpu() is required. Otherwise we need to - * (re)calculate the wheel index via internal_add_timer(). + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise + * we need to (re)calculate the wheel index via + * internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); @@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* - * We have a fresh next event. Check whether we can forward the base: + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. */ - if (time_after(nextevt, jiffies)) - base->clk = jiffies; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } if (time_before_eq(nextevt, basej)) { expires = basem; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2050a7652a86..da87b3cba5b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1862,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, /* Update rec->flags */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + /* We need to update only differences of filter_hash */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -1884,6 +1888,10 @@ rollback: /* Roll back what we did above */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (rec == end) goto err_out; @@ -2397,6 +2405,10 @@ void __weak ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec); @@ -2763,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) struct dyn_ftrace *rec; do_for_each_ftrace_rec(pg, rec) { - if (FTRACE_WARN_ON_ONCE(rec->flags)) + if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED)) pr_warn(" %pS flags:%lx\n", (void *)rec->ip, rec->flags); } while_for_each_ftrace_rec(); @@ -3598,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) goto out_unlock; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { ret = enter_record(hash, rec, clear_filter); if (ret < 0) { @@ -3793,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (!ftrace_match_record(rec, &func_g, NULL, 0)) continue; @@ -4685,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, NULL, 0)) { /* if it is in the array */ exists = false; |