diff options
Diffstat (limited to 'kernel')
45 files changed, 691 insertions, 410 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 5068e2a4e75f..2251882daf53 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ ARCH_INLINE_SPIN_LOCK_IRQSAVE -config INLINE_SPIN_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) +config UNINLINE_SPIN_UNLOCK + bool config INLINE_SPIN_UNLOCK_BH def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 24e7cb0ba26a..3f9c97419f02 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" select PREEMPT_COUNT + select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f4ea4b6f3cf1..ed64ccac67c9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1883,7 +1883,7 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval; + int retval = 0; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; diff --git a/kernel/compat.c b/kernel/compat.c index f346cedfe24d..74ff8498809a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -31,11 +31,10 @@ #include <asm/uaccess.h> /* - * Note that the native side is already converted to a timespec, because - * that's what we want anyway. + * Get/set struct timeval with struct timespec on the native side */ -static int compat_get_timeval(struct timespec *o, - struct compat_timeval __user *i) +static int compat_get_timeval_convert(struct timespec *o, + struct compat_timeval __user *i) { long usec; @@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o, return 0; } -static int compat_put_timeval(struct compat_timeval __user *o, - struct timeval *i) +static int compat_put_timeval_convert(struct compat_timeval __user *o, + struct timeval *i) { return (put_user(i->tv_sec, &o->tv_sec) || put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; @@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval(tv, &ktv)) + if (compat_put_timeval_convert(tv, &ktv)) return -EFAULT; } if (tz) { @@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone ktz; if (tv) { - if (compat_get_timeval(&kts, tv)) + if (compat_get_timeval_convert(&kts, tv)) return -EFAULT; } if (tz) { @@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } +int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || + __get_user(tv->tv_sec, &ctv->tv_sec) || + __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(get_compat_timeval); + +int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || + __put_user(tv->tv_sec, &ctv->tv_sec) || + __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_compat_timeval); + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(get_compat_timespec); int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { @@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user } EXPORT_SYMBOL_GPL(put_compat_timespec); +int compat_get_timeval(struct timeval *tv, const void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + else + return get_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_get_timeval); + +int compat_put_timeval(const struct timeval *tv, void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + else + return put_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_put_timeval); + +int compat_get_timespec(struct timespec *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + else + return get_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec); + +int compat_put_timespec(const struct timespec *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + else + return put_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec); + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1010cc61931f..14f7070b4ba2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = { * are online. If none are online, walk up the cpuset hierarchy * until we find one that does have some online cpus. If we get * all the way to the top and still haven't found any online cpus, - * return cpu_online_map. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_map. + * return cpu_online_mask. Or if passed a NULL cs from an exit'ing + * task, return cpu_online_mask. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_map. + * of cpu_online_mask. * * Call with callback_mutex held. */ @@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, int retval; int is_load_balanced; - /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ + /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) return -EACCES; @@ -2149,7 +2149,7 @@ void __init cpuset_init_smp(void) * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_map, even if this means going outside the + * subset of cpu_online_mask, even if this means going outside the * tasks cpuset. **/ @@ -2162,10 +2162,9 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) mutex_unlock(&callback_mutex); } -int cpuset_cpus_allowed_fallback(struct task_struct *tsk) +void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpuset *cs; - int cpu; rcu_read_lock(); cs = task_cs(tsk); @@ -2186,22 +2185,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary * set any mask even if it is not right from task_cs() pov, * the pending set_cpus_allowed_ptr() will fix things. + * + * select_fallback_rq() will fix things ups and set cpu_possible_mask + * if required. */ - - cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); - if (cpu >= nr_cpu_ids) { - /* - * Either tsk->cpus_allowed is wrong (see above) or it - * is actually empty. The latter case is only possible - * if we are racing with remove_tasks_in_empty_cpuset(). - * Like above we can temporary set any mask and rely on - * set_cpus_allowed_ptr() as synchronization point. - */ - do_set_cpus_allowed(tsk, cpu_possible_mask); - cpu = cpumask_any(cpu_active_mask); - } - - return cpu; } void cpuset_init_current_mems_allowed(void) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 3f88a45e6f0a..1dc53bae56e1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -53,7 +53,6 @@ #include <asm/cacheflush.h> #include <asm/byteorder.h> #include <linux/atomic.h> -#include <asm/system.h> #include "debug_core.h" diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7179eac7b41c..07c9bbb94a0b 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -15,7 +15,6 @@ #include <linux/sched.h> #include <linux/kdb.h> #include <linux/nmi.h> -#include <asm/system.h> #include "kdb_private.h" diff --git a/kernel/dma.c b/kernel/dma.c index 68a2306522c8..6c6262f86c17 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -18,7 +18,6 @@ #include <linux/proc_fs.h> #include <linux/init.h> #include <asm/dma.h> -#include <asm/system.h> diff --git a/kernel/events/core.c b/kernel/events/core.c index 4b50357914fb..a6a9ec4cd8f5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } -void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); - perf_update_user_clock(userpg, now); + arch_perf_update_userpage(userpg, now); barrier(); ++userpg->lock; @@ -7116,6 +7116,13 @@ void __init perf_event_init(void) /* do not patch jump label more than once per second */ jump_label_rate_limit(&perf_sched_events, HZ); + + /* + * Build time assertion that we keep the data_head at the intended + * location. IOW, validation we got the __reserved[] size right. + */ + BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) + != 1024); } static int __init perf_event_sysfs_init(void) diff --git a/kernel/exit.c b/kernel/exit.c index 3db1909faed9..d8bd3b425fa7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -474,7 +474,7 @@ static void close_files(struct files_struct * files) i = j * __NFDBITS; if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j++]; + set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); diff --git a/kernel/futex.c b/kernel/futex.c index 72efa1e4359a..e2b0fb9a0b3b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -59,6 +59,7 @@ #include <linux/magic.h> #include <linux/pid.h> #include <linux/nsproxy.h> +#include <linux/ptrace.h> #include <asm/futex.h> @@ -2443,40 +2444,31 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, { struct robust_list_head __user *head; unsigned long ret; - const struct cred *cred = current_cred(), *pcred; + struct task_struct *p; if (!futex_cmpxchg_enabled) return -ENOSYS; + WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); + + rcu_read_lock(); + + ret = -ESRCH; if (!pid) - head = current->robust_list; + p = current; else { - struct task_struct *p; - - ret = -ESRCH; - rcu_read_lock(); p = find_task_by_vpid(pid); if (!p) goto err_unlock; - ret = -EPERM; - pcred = __task_cred(p); - /* If victim is in different user_ns, then uids are not - comparable, so we must have CAP_SYS_PTRACE */ - if (cred->user->user_ns != pcred->user->user_ns) { - if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; - goto ok; - } - /* If victim is in same user_ns, then uids are comparable */ - if (cred->euid != pcred->euid && - cred->euid != pcred->uid && - !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; -ok: - head = p->robust_list; - rcu_read_unlock(); } + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ)) + goto err_unlock; + + head = p->robust_list; + rcu_read_unlock(); + if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 5f9e689dc8f0..83e368b005fc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -10,6 +10,7 @@ #include <linux/compat.h> #include <linux/nsproxy.h> #include <linux/futex.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> @@ -136,40 +137,31 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, { struct compat_robust_list_head __user *head; unsigned long ret; - const struct cred *cred = current_cred(), *pcred; + struct task_struct *p; if (!futex_cmpxchg_enabled) return -ENOSYS; + WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); + + rcu_read_lock(); + + ret = -ESRCH; if (!pid) - head = current->compat_robust_list; + p = current; else { - struct task_struct *p; - - ret = -ESRCH; - rcu_read_lock(); p = find_task_by_vpid(pid); if (!p) goto err_unlock; - ret = -EPERM; - pcred = __task_cred(p); - /* If victim is in different user_ns, then uids are not - comparable, so we must have CAP_SYS_PTRACE */ - if (cred->user->user_ns != pcred->user->user_ns) { - if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; - goto ok; - } - /* If victim is in same user_ns, then uids are comparable */ - if (cred->euid != pcred->euid && - cred->euid != pcred->uid && - !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; -ok: - head = p->compat_robust_list; - rcu_read_unlock(); } + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ)) + goto err_unlock; + + head = p->compat_robust_list; + rcu_read_unlock(); + if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5a38bf4de641..cf1a4a68ce44 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -13,7 +13,7 @@ config GENERIC_HARDIRQS # Options selectable by the architecture code # Make sparse irq Kconfig switch below available -config HAVE_SPARSE_IRQ +config MAY_HAVE_SPARSE_IRQ bool # Enable the generic irq autoprobe mechanism @@ -56,13 +56,22 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +config IRQ_DOMAIN_DEBUG + bool "Expose hardware/virtual IRQ mapping via debugfs" + depends on IRQ_DOMAIN && DEBUG_FS + help + This option will show the mapping relationship between hardware irq + numbers and Linux irq numbers. The mapping is exposed via debugfs + in the file "virq_mapping". + + If you don't know what this means you don't need it. + # Support forced irq threading config IRQ_FORCED_THREADING bool config SPARSE_IRQ - bool "Support sparse irq numbering" - depends on HAVE_SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ ---help--- Sparse irq numbering is useful for distro kernels that want diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6ff84e6a954c..bdb180325551 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -54,14 +54,18 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) { /* - * Wake up the handler thread for this action. In case the - * thread crashed and was killed we just pretend that we - * handled the interrupt. The hardirq handler has disabled the - * device interrupt, so no irq storm is lurking. If the + * In case the thread crashed and was killed we just pretend that + * we handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. + */ + if (action->thread->flags & PF_EXITING) + return; + + /* + * Wake up the handler thread for this action. If the * RUNTHREAD bit is already set, nothing to do. */ - if ((action->thread->flags & PF_EXITING) || - test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) return; /* diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index af48e59bc2ff..3601f3fbf67c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -632,7 +632,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, return revmap[hwirq]; } -#ifdef CONFIG_VIRQ_DEBUG +#ifdef CONFIG_IRQ_DOMAIN_DEBUG static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; @@ -668,7 +668,7 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, "0x%16p ", data); - if (desc->irq_data.domain->of_node) + if (desc->irq_data.domain && desc->irq_data.domain->of_node) p = desc->irq_data.domain->of_node->full_name; else p = none; @@ -695,14 +695,14 @@ static const struct file_operations virq_debug_fops = { static int __init irq_debugfs_init(void) { - if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, + if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL, NULL, &virq_debug_fops) == NULL) return -ENOMEM; return 0; } __initcall(irq_debugfs_init); -#endif /* CONFIG_VIRQ_DEBUG */ +#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b0ccd1ac2d6a..89a3ea82569b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -282,7 +282,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; - int ret; + int ret, node = desc->irq_data.node; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -301,6 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) } cpumask_and(mask, cpu_online_mask, set); + if (node != NUMA_NO_NODE) { + const struct cpumask *nodemask = cpumask_of_node(node); + + /* make sure at least one of the cpus in nodemask is online */ + if (cpumask_intersects(mask, nodemask)) + cpumask_and(mask, mask, nodemask); + } ret = chip->irq_set_affinity(&desc->irq_data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: @@ -645,7 +652,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) * is marked MASKED. */ static void irq_finalize_oneshot(struct irq_desc *desc, - struct irqaction *action, bool force) + struct irqaction *action) { if (!(desc->istate & IRQS_ONESHOT)) return; @@ -679,7 +686,7 @@ again: * we would clear the threads_oneshot bit of this thread which * was just set. */ - if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) goto out_unlock; desc->threads_oneshot &= ~action->thread_mask; @@ -739,7 +746,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); - irq_finalize_oneshot(desc, action, false); + irq_finalize_oneshot(desc, action); local_bh_enable(); return ret; } @@ -755,7 +762,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, irqreturn_t ret; ret = action->thread_fn(action->irq, action->dev_id); - irq_finalize_oneshot(desc, action, false); + irq_finalize_oneshot(desc, action); return ret; } @@ -844,7 +851,7 @@ void exit_irq_thread(void) wake_threads_waitq(desc); /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action, true); + irq_finalize_oneshot(desc, action); } static void irq_setup_forced_threading(struct irqaction *new) diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 47420908fba0..c3c89751b327 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -43,12 +43,16 @@ void irq_move_masked_irq(struct irq_data *idata) * masking the irqs. */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) - if (!chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false)) { + < nr_cpu_ids)) { + int ret = chip->irq_set_affinity(&desc->irq_data, + desc->pending_mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: cpumask_copy(desc->irq_data.affinity, desc->pending_mask); + case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); } + } cpumask_clear(desc->pending_mask); } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c3c46c72046e..0c56d44b9fd5 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -5,6 +5,7 @@ * context. The enqueueing is NMI-safe. */ +#include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/irq_work.h> diff --git a/kernel/kexec.c b/kernel/kexec.c index a6a675cb9818..4e2e472f6aeb 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -37,7 +37,6 @@ #include <asm/page.h> #include <asm/uaccess.h> #include <asm/io.h> -#include <asm/system.h> #include <asm/sections.h> /* Per cpu memory for storing cpu states in case of system crash. */ @@ -1359,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline, if (*cur == '@') *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warning("crashkernel: unrecognized char\n"); + return -EINVAL; + } return 0; } @@ -1462,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmlist); diff --git a/kernel/module.c b/kernel/module.c index 2c932760fd33..78ac6ec1e425 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ /* Block module loading/unloading? */ int modules_disabled = 0; +core_param(nomodule, modules_disabled, bint, 0); /* Waiting for a module to finish initializing? */ static DECLARE_WAIT_QUEUE_HEAD(module_wq); @@ -903,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr, static struct module_attribute modinfo_refcnt = __ATTR(refcnt, 0444, show_refcnt, NULL); +void __module_get(struct module *module) +{ + if (module) { + preempt_disable(); + __this_cpu_inc(module->refptr->incs); + trace_module_get(module, _RET_IP_); + preempt_enable(); + } +} +EXPORT_SYMBOL(__module_get); + +bool try_module_get(struct module *module) +{ + bool ret = true; + + if (module) { + preempt_disable(); + + if (likely(module_is_live(module))) { + __this_cpu_inc(module->refptr->incs); + trace_module_get(module, _RET_IP_); + } else + ret = false; + + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(try_module_get); + void module_put(struct module *module) { if (module) { @@ -2380,8 +2411,7 @@ static int copy_and_check(struct load_info *info, return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ - /* vmalloc barfs on "unusual" numbers. Check here */ - if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) + if ((hdr = vmalloc(len)) == NULL) return -ENOMEM; if (copy_from_user(hdr, umod, len) != 0) { @@ -2922,7 +2952,8 @@ static struct module *load_module(void __user *umod, mutex_unlock(&module_mutex); /* Module is ready to execute: parsing args may do that. */ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); + err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, NULL); if (err < 0) goto unlink; diff --git a/kernel/padata.c b/kernel/padata.c index 6f10eb285ece..89fe3d1b9efb 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1,6 +1,8 @@ /* * padata.c - generic interface to process data streams in parallel * + * See Documentation/padata.txt for an api documentation. + * * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> * @@ -354,13 +356,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd, if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) return -ENOMEM; - cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { free_cpumask_var(pd->cpumask.cbcpu); return -ENOMEM; } - cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); return 0; } @@ -564,7 +566,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier); static bool padata_validate_cpumask(struct padata_instance *pinst, const struct cpumask *cpumask) { - if (!cpumask_intersects(cpumask, cpu_active_mask)) { + if (!cpumask_intersects(cpumask, cpu_online_mask)) { pinst->flags |= PADATA_INVALID; return false; } @@ -678,7 +680,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd; - if (cpumask_test_cpu(cpu, cpu_active_mask)) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) { pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, pinst->cpumask.cbcpu); if (!pd) @@ -746,6 +748,9 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return -ENOMEM; padata_replace(pinst, pd); + + cpumask_clear_cpu(cpu, pd->cpumask.cbcpu); + cpumask_clear_cpu(cpu, pd->cpumask.pcpu); } return 0; diff --git a/kernel/params.c b/kernel/params.c index 47f5bf12434a..f37d82631347 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -87,6 +87,8 @@ static int parse_one(char *param, char *val, const struct kernel_param *params, unsigned num_params, + s16 min_level, + s16 max_level, int (*handle_unknown)(char *param, char *val)) { unsigned int i; @@ -95,6 +97,9 @@ static int parse_one(char *param, /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { + if (params[i].level < min_level + || params[i].level > max_level) + return 0; /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool && params[i].ops->set != param_set_bint) @@ -174,6 +179,8 @@ int parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, + s16 min_level, + s16 max_level, int (*unknown)(char *param, char *val)) { char *param, *val; @@ -189,7 +196,8 @@ int parse_args(const char *name, args = next_arg(args, ¶m, &val); irq_was_disabled = irqs_disabled(); - ret = parse_one(param, val, params, num, unknown); + ret = parse_one(param, val, params, num, + min_level, max_level, unknown); if (irq_was_disabled && !irqs_disabled()) { printk(KERN_WARNING "parse_args(): option '%s' enabled " "irq's!\n", param); @@ -297,35 +305,18 @@ EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, const struct kernel_param *kp) { - bool v; - int ret; - /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ - ret = strtobool(val, &v); - if (ret) - return ret; - - if (kp->flags & KPARAM_ISBOOL) - *(bool *)kp->arg = v; - else - *(int *)kp->arg = v; - return 0; + return strtobool(val, kp->arg); } EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { - bool val; - if (kp->flags & KPARAM_ISBOOL) - val = *(bool *)kp->arg; - else - val = *(int *)kp->arg; - /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", val ? 'Y' : 'N'); + return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); @@ -343,7 +334,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp) struct kernel_param dummy; dummy.arg = &boolval; - dummy.flags = KPARAM_ISBOOL; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; @@ -372,7 +362,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp) /* Match bool exactly, by re-using it. */ boolkp = *kp; boolkp.arg = &v; - boolkp.flags |= KPARAM_ISBOOL; ret = param_set_bool(val, &boolkp); if (ret == 0) @@ -393,7 +382,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, const struct kernel_param *kp), - u16 flags, + s16 level, unsigned int *num) { int ret; @@ -403,7 +392,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; - kp.flags = flags; + kp.level = level; *num = 0; /* We expect a comma-separated list of values. */ @@ -444,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->ops->set, kp->flags, + arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 17b232869a04..57bc1fd35b3c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -15,6 +15,7 @@ #include <linux/acct.h> #include <linux/slab.h> #include <linux/proc_fs.h> +#include <linux/reboot.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -183,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + if (pid_ns->reboot) + current->signal->group_exit_code = pid_ns->reboot; + acct_exit_ns(pid_ns); return; } @@ -217,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = { static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ + if (pid_ns == &init_pid_ns) + return 0; + + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART2: + case LINUX_REBOOT_CMD_RESTART: + pid_ns->reboot = SIGHUP; + break; + + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_HALT: + pid_ns->reboot = SIGINT; + break; + default: + return -EINVAL; + } + + read_lock(&tasklist_lock); + force_sig(SIGKILL, pid_ns->child_reaper); + read_unlock(&tasklist_lock); + + do_exit(0); + + /* Not reached */ + return 0; +} + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b152f74f02de..6850f53e02d8 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -10,7 +10,6 @@ #include <linux/export.h> #include <linux/rwsem.h> -#include <asm/system.h> #include <linux/atomic.h> /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 503d6426126d..4603b9d8f30a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include <linux/init_task.h> #include <linux/binfmts.h> +#include <asm/switch_to.h> #include <asm/tlb.h> #include <asm/irq_regs.h> #include <asm/mutex.h> @@ -1264,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - int dest_cpu; const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + enum { cpuset, possible, fail } state = cpuset; + int dest_cpu; /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; + } + + for (;;) { + /* Any allowed, online CPU? */ + for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + goto out; + } - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - return dest_cpu; + switch (state) { + case cpuset: + /* No more Mr. Nice Guy. */ + cpuset_cpus_allowed_fallback(p); + state = possible; + break; - /* No more Mr. Nice Guy. */ - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); + case possible: + do_set_cpus_allowed(p, cpu_possible_mask); + state = fail; + break; + + case fail: + BUG(); + break; + } + } + +out: + if (state != cpuset) { + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk_sched("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } } return dest_cpu; @@ -1933,6 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); if (mm) @@ -3070,8 +3102,6 @@ EXPORT_SYMBOL(sub_preempt_count); */ static noinline void __schedule_bug(struct task_struct *prev) { - struct pt_regs *regs = get_irq_regs(); - if (oops_in_progress) return; @@ -3082,11 +3112,7 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - - if (regs) - show_regs(regs); - else - dump_stack(); + dump_stack(); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94340c7544a9..0d97ebdc58f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec); +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -1162,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -1546,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, resched_task(rq_of(cfs_rq)->curr); } -static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -2073,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b60dad720173..44af55e6d5d0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1428,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) next_idx: if (idx >= MAX_RT_PRIO) continue; - if (next && next->prio < idx) + if (next && next->prio <= idx) continue; list_for_each_entry(rt_se, array->queue + idx, run_list) { struct task_struct *p; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 42b1f304b044..fb3acba4d52e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -681,6 +681,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #ifndef finish_arch_switch # define finish_arch_switch(prev) do { } while (0) #endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) diff --git a/kernel/signal.c b/kernel/signal.c index d523da02dd14..17afcaf582d0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -36,6 +36,7 @@ #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> +#include <asm/cacheflush.h> #include "audit.h" /* audit_signal_info() */ /* diff --git a/kernel/smp.c b/kernel/smp.c index db197d60489b..2f8b10ecf759 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) return ret; } EXPORT_SYMBOL(on_each_cpu); + +/** + * on_each_cpu_mask(): Run a function on processors specified by + * cpumask, which may include the local processor. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait) +{ + int cpu = get_cpu(); + + smp_call_function_many(mask, func, info, wait); + if (cpumask_test_cpu(cpu, mask)) { + local_irq_disable(); + func(info); + local_irq_enable(); + } + put_cpu(); +} +EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * on_each_cpu_cond(): Call a function on each processor for which + * the supplied function cond_func returns true, optionally waiting + * for all the required CPUs to finish. This may include the local + * processor. + * @cond_func: A callback function that is passed a cpu id and + * the the info parameter. The function is called + * with preemption disabled. The function should + * return a blooean value indicating whether to IPI + * the specified CPU. + * @func: The function to run on all applicable CPUs. + * This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to both functions. + * @wait: If true, wait (atomically) until function has + * completed on other CPUs. + * @gfp_flags: GFP flags to use when allocating the cpumask + * used internally by the function. + * + * The function might sleep if the GFP flags indicates a non + * atomic allocation is allowed. + * + * Preemption is disabled to protect against CPUs going offline but not online. + * CPUs going online during the call will not be seen or sent an IPI. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + cpumask_var_t cpus; + int cpu, ret; + + might_sleep_if(gfp_flags & __GFP_WAIT); + + if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) + cpumask_set_cpu(cpu, cpus); + on_each_cpu_mask(cpus, func, info, wait); + preempt_enable(); + free_cpumask_var(cpus); + } else { + /* + * No free cpumask, bother. No matter, we'll + * just have to IPI them one by one. + */ + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) { + ret = smp_call_function_single(cpu, func, + info, wait); + WARN_ON_ONCE(!ret); + } + preempt_enable(); + } +} +EXPORT_SYMBOL(on_each_cpu_cond); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 84c7d96918bf..5cdd8065a3ce 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_lock_bh); #endif -#ifndef CONFIG_INLINE_SPIN_UNLOCK +#ifdef CONFIG_UNINLINE_SPIN_UNLOCK void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); diff --git a/kernel/sys.c b/kernel/sys.c index 9eb7fcab8df6..e7006eb6c1e4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + if (ret) + return ret; + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d48ff4fd44c3..52b3a06a02f8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include <linux/swap.h> #include <linux/slab.h> #include <linux/sysctl.h> +#include <linux/bitmap.h> #include <linux/signal.h> #include <linux/printk.h> #include <linux/proc_fs.h> @@ -68,6 +69,9 @@ #include <asm/stacktrace.h> #include <asm/io.h> #endif +#ifdef CONFIG_SPARC +#include <asm/setup.h> +#endif #ifdef CONFIG_BSD_PROCESS_ACCT #include <linux/acct.h> #endif @@ -142,7 +146,6 @@ static const int cap_last_cap = CAP_LAST_CAP; #include <linux/inotify.h> #endif #ifdef CONFIG_SPARC -#include <asm/system.h> #endif #ifdef CONFIG_SPARC64 @@ -2393,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } } - while (val_a <= val_b) - set_bit(val_a++, tmp_bitmap); - + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; proc_skip_char(&kbuf, &left, '\n'); } @@ -2438,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else - memcpy(bitmap, tmp_bitmap, - BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } kfree(tmp_bitmap); *lenp -= left; diff --git a/kernel/time.c b/kernel/time.c index 73e416db0a1e..ba744cf80696 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) return error; if (tz) { - /* SMP safe, global irq locking makes it work. */ sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { @@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) } } if (tv) - { - /* SMP safe, again the code in arch/foo/time.c should - * globally block out interrupts when it runs. - */ return do_settimeofday(tv); - } return 0; } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a46f5d64504..8a538c55fc7b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -96,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev, return 0; } +static inline void alarmtimer_rtc_timer_init(void) +{ + rtc_timer_init(&rtctimer, NULL, NULL); +} + static struct class_interface alarmtimer_rtc_interface = { .add_dev = &alarmtimer_rtc_add_device, }; @@ -117,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void) #define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } +static inline void alarmtimer_rtc_timer_init(void) { } #endif /** @@ -783,6 +789,8 @@ static int __init alarmtimer_init(void) .nsleep = alarm_timer_nsleep, }; + alarmtimer_rtc_timer_init(); + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a45ca167ab24..c9583382141a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) { u64 ret; /* - * We won't try to correct for more then 11% adjustments (110,000 ppm), + * We won't try to correct for more than 11% adjustments (110,000 ppm), */ ret = (u64)cs->mult * 11; do_div(ret,100); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 6e039b144daf..f03fd83b170b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -34,8 +34,6 @@ unsigned long tick_nsec; static u64 tick_length; static u64 tick_length_base; -static struct hrtimer leap_timer; - #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -381,70 +379,63 @@ u64 ntp_tick_length(void) /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * Also handles leap second processing, and returns leap offset */ -static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) +int second_overflow(unsigned long secs) { - enum hrtimer_restart res = HRTIMER_NORESTART; - unsigned long flags; + s64 delta; int leap = 0; + unsigned long flags; spin_lock_irqsave(&ntp_lock, flags); + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. + */ switch (time_state) { case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; break; case TIME_INS: - leap = -1; - time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); - hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); - res = HRTIMER_RESTART; + if (secs % 86400 == 0) { + leap = -1; + time_state = TIME_OOP; + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); + } break; case TIME_DEL: - leap = 1; - time_tai--; - time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); + if ((secs + 1) % 86400 == 0) { + leap = 1; + time_tai--; + time_state = TIME_WAIT; + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); + } break; case TIME_OOP: time_tai++; time_state = TIME_WAIT; - /* fall through */ + break; + case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; break; } - spin_unlock_irqrestore(&ntp_lock, flags); - /* - * We have to call this outside of the ntp_lock to keep - * the proper locking hierarchy - */ - if (leap) - timekeeping_leap_insert(leap); - - return res; -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - */ -void second_overflow(void) -{ - s64 delta; - unsigned long flags; - - spin_lock_irqsave(&ntp_lock, flags); /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -481,15 +472,17 @@ void second_overflow(void) tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; + + + out: spin_unlock_irqrestore(&ntp_lock, flags); + + return leap; } #ifdef CONFIG_GENERIC_CMOS_UPDATE -/* Disable the cmos update - used by virtualization and embedded */ -int no_sync_cmos_clock __read_mostly; - static void sync_cmos_clock(struct work_struct *work); static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); @@ -536,35 +529,13 @@ static void sync_cmos_clock(struct work_struct *work) static void notify_cmos_timer(void) { - if (!no_sync_cmos_clock) - schedule_delayed_work(&sync_cmos_work, 0); + schedule_delayed_work(&sync_cmos_work, 0); } #else static inline void notify_cmos_timer(void) { } #endif -/* - * Start the leap seconds timer: - */ -static inline void ntp_start_leap_timer(struct timespec *ts) -{ - long now = ts->tv_sec; - - if (time_status & STA_INS) { - time_state = TIME_INS; - now += 86400 - now % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - - return; - } - - if (time_status & STA_DEL) { - time_state = TIME_DEL; - now += 86400 - (now + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } -} /* * Propagate a new txc->status value into the NTP state: @@ -589,22 +560,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; - switch (time_state) { - case TIME_OK: - ntp_start_leap_timer(ts); - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - ntp_start_leap_timer(ts); - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } } /* * Called with the xtime lock held, so we can access and modify @@ -686,9 +641,6 @@ int do_adjtimex(struct timex *txc) (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; - - if (txc->modes & ADJ_STATUS && time_state != TIME_OK) - hrtimer_cancel(&leap_timer); } if (txc->modes & ADJ_SETOFFSET) { @@ -1010,6 +962,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); - hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - leap_timer.function = ntp_leap_second; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 15be32e19c6e..d66b21308f7c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -184,18 +184,6 @@ static void timekeeping_update(bool clearntp) } -void timekeeping_leap_insert(int leapsecond) -{ - unsigned long flags; - - write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeper.xtime.tv_sec += leapsecond; - timekeeper.wall_to_monotonic.tv_sec -= leapsecond; - timekeeping_update(false); - write_sequnlock_irqrestore(&timekeeper.lock, flags); - -} - /** * timekeeping_forward_now - update clock to the current time * @@ -448,9 +436,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset); static int change_clocksource(void *data) { struct clocksource *new, *old; + unsigned long flags; new = (struct clocksource *) data; + write_seqlock_irqsave(&timekeeper.lock, flags); + timekeeping_forward_now(); if (!new->enable || new->enable(new) == 0) { old = timekeeper.clock; @@ -458,6 +449,10 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } + timekeeping_update(true); + + write_sequnlock_irqrestore(&timekeeper.lock, flags); + return 0; } @@ -827,7 +822,7 @@ static void timekeeping_adjust(s64 offset) int adj; /* - * The point of this is to check if the error is greater then half + * The point of this is to check if the error is greater than half * an interval. * * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. @@ -835,7 +830,7 @@ static void timekeeping_adjust(s64 offset) * Note we subtract one in the shift, so that error is really error*2. * This "saves" dividing(shifting) interval twice, but keeps the * (error > interval) comparison as still measuring if error is - * larger then half an interval. + * larger than half an interval. * * Note: It does not "save" on aggravation when reading the code. */ @@ -843,7 +838,7 @@ static void timekeeping_adjust(s64 offset) if (error > interval) { /* * We now divide error by 4(via shift), which checks if - * the error is greater then twice the interval. + * the error is greater than twice the interval. * If it is greater, we need a bigadjust, if its smaller, * we can adjust by 1. */ @@ -874,13 +869,15 @@ static void timekeeping_adjust(s64 offset) } else /* No adjustment needed */ return; - WARN_ONCE(timekeeper.clock->maxadj && - (timekeeper.mult + adj > timekeeper.clock->mult + - timekeeper.clock->maxadj), - "Adjusting %s more then 11%% (%ld vs %ld)\n", + if (unlikely(timekeeper.clock->maxadj && + (timekeeper.mult + adj > + timekeeper.clock->mult + timekeeper.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", timekeeper.clock->name, (long)timekeeper.mult + adj, (long)timekeeper.clock->mult + timekeeper.clock->maxadj); + } /* * So the following can be confusing. * @@ -952,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; - /* If the offset is smaller then a shifted interval, do nothing */ + /* If the offset is smaller than a shifted interval, do nothing */ if (offset < timekeeper.cycle_interval<<shift) return offset; @@ -962,9 +959,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; while (timekeeper.xtime_nsec >= nsecps) { + int leap; timekeeper.xtime_nsec -= nsecps; timekeeper.xtime.tv_sec++; - second_overflow(); + leap = second_overflow(timekeeper.xtime.tv_sec); + timekeeper.xtime.tv_sec += leap; } /* Accumulate raw time */ @@ -1018,13 +1017,13 @@ static void update_wall_time(void) * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals - * that is smaller then the offset. We then accumulate that + * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple. */ shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); shift = max(0, shift); - /* Bound shift to one less then what overflows tick_length */ + /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { @@ -1072,12 +1071,14 @@ static void update_wall_time(void) /* * Finally, make sure that after the rounding - * xtime.tv_nsec isn't larger then NSEC_PER_SEC + * xtime.tv_nsec isn't larger than NSEC_PER_SEC */ if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { + int leap; timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; timekeeper.xtime.tv_sec++; - second_overflow(); + leap = second_overflow(timekeeper.xtime.tv_sec); + timekeeper.xtime.tv_sec += leap; } timekeeping_update(false); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cd3134510f3d..a1d2849f2473 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,7 +141,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE + select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 867bd1dd2dd0..0fa92f677c92 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -249,7 +249,8 @@ static void update_ftrace_function(void) #else __ftrace_trace_function = func; #endif - ftrace_trace_function = ftrace_test_stop_func; + ftrace_trace_function = + (func == ftrace_stub) ? func : ftrace_test_stop_func; #endif } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f5b7b5c1195b..cf8d11e91efd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -154,33 +154,10 @@ enum { static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -/** - * tracing_on - enable all tracing buffers - * - * This function enables all tracing buffers that may have been - * disabled with tracing_off. - */ -void tracing_on(void) -{ - set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_on); +/* Used for individual buffers (after the counter) */ +#define RB_BUFFER_OFF (1 << 20) -/** - * tracing_off - turn off all tracing buffers - * - * This function stops all tracing buffers from recording data. - * It does not disable any overhead the tracers themselves may - * be causing. This function simply causes all recording to - * the ring buffers to fail. - */ -void tracing_off(void) -{ - clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_off); +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) /** * tracing_off_permanent - permanently disable ring buffers @@ -193,15 +170,6 @@ void tracing_off_permanent(void) set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } -/** - * tracing_is_on - show state of ring buffers enabled - */ -int tracing_is_on(void) -{ - return ring_buffer_flags == RB_BUFFERS_ON; -} -EXPORT_SYMBOL_GPL(tracing_is_on); - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -2619,6 +2587,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_record_enable); /** + * ring_buffer_record_off - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * This is different than ring_buffer_record_disable() as + * it works like an on/off switch, where as the disable() verison + * must be paired with a enable(). + */ +void ring_buffer_record_off(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd | RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_off); + +/** + * ring_buffer_record_on - restart writes into the buffer + * @buffer: The ring buffer to start writes to. + * + * This enables all writes to the buffer that was disabled by + * ring_buffer_record_off(). + * + * This is different than ring_buffer_record_enable() as + * it works like an on/off switch, where as the enable() verison + * must be paired with a disable(). + */ +void ring_buffer_record_on(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd & ~RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_on); + +/** + * ring_buffer_record_is_on - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * + * Returns true if the ring buffer is in a state that it accepts writes. + */ +int ring_buffer_record_is_on(struct ring_buffer *buffer) +{ + return !atomic_read(&buffer->record_disabled); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop @@ -4039,68 +4064,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_TRACING -static ssize_t -rb_simple_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - char buf[64]; - int r; - - if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) - r = sprintf(buf, "permanently disabled\n"); - else - r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -rb_simple_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - if (val) - set_bit(RB_BUFFERS_ON_BIT, p); - else - clear_bit(RB_BUFFERS_ON_BIT, p); - - (*ppos)++; - - return cnt; -} - -static const struct file_operations rb_simple_fops = { - .open = tracing_open_generic, - .read = rb_simple_read, - .write = rb_simple_write, - .llseek = default_llseek, -}; - - -static __init int rb_init_debugfs(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - - return 0; -} - -fs_initcall(rb_init_debugfs); -#endif - #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 10d5503f0d04..ed7b5d1e12f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -36,6 +36,7 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> +#include <linux/nmi.h> #include <linux/fs.h> #include "trace.h" @@ -352,6 +353,59 @@ static void wakeup_work_handler(struct work_struct *work) static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); /** + * tracing_on - enable tracing buffers + * + * This function enables tracing buffers that may have been + * disabled with tracing_off. + */ +void tracing_on(void) +{ + if (global_trace.buffer) + ring_buffer_record_on(global_trace.buffer); + /* + * This flag is only looked at when buffers haven't been + * allocated yet. We don't really care about the race + * between setting this flag and actually turning + * on the buffer. + */ + global_trace.buffer_disabled = 0; +} +EXPORT_SYMBOL_GPL(tracing_on); + +/** + * tracing_off - turn off tracing buffers + * + * This function stops the tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ + if (global_trace.buffer) + ring_buffer_record_on(global_trace.buffer); + /* + * This flag is only looked at when buffers haven't been + * allocated yet. We don't really care about the race + * between setting this flag and actually turning + * on the buffer. + */ + global_trace.buffer_disabled = 1; +} +EXPORT_SYMBOL_GPL(tracing_off); + +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + if (global_trace.buffer) + return ring_buffer_record_is_on(global_trace.buffer); + return !global_trace.buffer_disabled; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + +/** * trace_wake_up - wake up tasks waiting for trace input * * Schedules a delayed work to wake up any task that is blocked on the @@ -1644,6 +1698,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; + int next_size = 0; int cpu; /* @@ -1675,9 +1730,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, next_cpu = cpu; next_ts = ts; next_lost = lost_events; + next_size = iter->ent_size; } } + iter->ent_size = next_size; + if (ent_cpu) *ent_cpu = next_cpu; @@ -4567,6 +4625,55 @@ static __init void create_trace_options_dir(void) create_trace_option_core_file(trace_options[i], i); } +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ring_buffer *buffer = filp->private_data; + char buf[64]; + int r; + + if (buffer) + r = ring_buffer_record_is_on(buffer); + else + r = 0; + + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ring_buffer *buffer = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (buffer) { + if (val) + ring_buffer_record_on(buffer); + else + ring_buffer_record_off(buffer); + } + + (*ppos)++; + + return cnt; +} + +static const struct file_operations rb_simple_fops = { + .open = tracing_open_generic, + .read = rb_simple_read, + .write = rb_simple_write, + .llseek = default_llseek, +}; + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -4626,6 +4733,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("trace_clock", 0644, d_tracer, NULL, &trace_clock_fops); + trace_create_file("tracing_on", 0644, d_tracer, + global_trace.buffer, &rb_simple_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -4798,6 +4908,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); } + touch_nmi_watchdog(); trace_printk_seq(&iter.seq); } @@ -4863,6 +4974,8 @@ __init static int tracer_alloc_buffers(void) goto out_free_cpumask; } global_trace.entries = ring_buffer_size(global_trace.buffer); + if (global_trace.buffer_disabled) + tracing_off(); #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54faec790bc1..95059f091a24 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -154,6 +154,7 @@ struct trace_array { struct ring_buffer *buffer; unsigned long entries; int cpu; + int buffer_disabled; cycle_t time_start; struct task_struct *waiter; struct trace_array_cpu *data[NR_CPUS]; @@ -835,13 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[]; filter) #include "trace_entries.h" -#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_FUNCTION_TRACER int perf_ftrace_event_register(struct ftrace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL #endif /* CONFIG_FUNCTION_TRACER */ -#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index d91eb0541b3a..4108e1250ca2 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -166,6 +166,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, #define FTRACE_STACK_ENTRIES 8 +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, @@ -175,8 +181,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry, __dynamic_array(unsigned long, caller ) ), - F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], __entry->caller[6], __entry->caller[7]), @@ -193,8 +200,9 @@ FTRACE_ENTRY(user_stack, userstack_entry, __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), - F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], __entry->caller[6], __entry->caller[7]), diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7b46c9bd22ae..3dd15e8bc856 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -162,7 +162,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define __dynamic_array(type, item) #undef F_printk -#define F_printk(fmt, args...) #fmt ", " __stringify(args) +#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) #undef FTRACE_ENTRY_REG #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ |