diff options
author | David S. Miller <davem@davemloft.net> | 2011-01-24 13:17:06 -0800 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-01-24 13:17:06 -0800 |
commit | e92427b289d252cfbd4cb5282d92f4ce1a5bb1fb (patch) | |
tree | 6d30e5e7b7f8e9aaa51d43b7128ac56860fa03bb /kernel | |
parent | c506653d35249bb4738bb139c24362e1ae724bc1 (diff) | |
parent | ec30f343d61391ab23705e50a525da1d55395780 (diff) | |
download | linux-stable-e92427b289d252cfbd4cb5282d92f4ce1a5bb1fb.tar.gz linux-stable-e92427b289d252cfbd4cb5282d92f4ce1a5bb1fb.tar.bz2 linux-stable-e92427b289d252cfbd4cb5282d92f4ce1a5bb1fb.zip |
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')
52 files changed, 1135 insertions, 674 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0b5ff083fa22..353d3fe8ba33 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif @@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o @@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. # Info from config_data can be extracted from /proc/config* targets += config_data.gz -$(obj)/config_data.gz: .config FORCE +$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ diff --git a/kernel/audit.c b/kernel/audit.c index 77770a034d59..e4956244ae50 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb) if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd dissapeared\n"); + audit_log_lost("auditd disappeared\n"); audit_pid = 0; /* we might get lucky and get this in the next auditd */ audit_hold_skb(skb); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 51cddc11cd85..b24d7027b83c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -763,9 +763,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); * -> cgroup_mkdir. */ -static struct dentry *cgroup_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd); static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -862,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static int cgroup_delete(const struct dentry *d) +{ + return 1; +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -912,7 +916,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) parent = dentry->d_parent; spin_lock(&parent->d_lock); - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(&dentry->d_u.d_child); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); @@ -1451,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data) static int cgroup_get_rootdir(struct super_block *sb) { + static const struct dentry_operations cgroup_dops = { + .d_iput = cgroup_diput, + .d_delete = cgroup_delete, + }; + struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); struct dentry *dentry; @@ -1468,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb) return -ENOMEM; } sb->s_root = dentry; + /* for everything else we want ->d_op set */ + sb->s_d_op = &cgroup_dops; return 0; } @@ -2197,6 +2208,14 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + /* * Check if a file is a control file */ @@ -2207,26 +2226,6 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - -static struct dentry *cgroup_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - static const struct dentry_operations cgroup_dentry_operations = { - .d_delete = cgroup_delete_dentry, - .d_iput = cgroup_diput, - }; - - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_set_d_op(dentry, &cgroup_dentry_operations); - d_add(dentry, NULL); - return NULL; -} - static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index a6e729766821..bd3e8e29caa3 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void) } } -/* Intialize kdb_printf, breakpoint tables and kdb state */ +/* Initialize kdb_printf, breakpoint tables and kdb state */ void __init kdb_init(int lvl) { static int kdb_init_lvl = KDB_NOT_INITIALIZED; diff --git a/kernel/exit.c b/kernel/exit.c index 89c74861a3da..f9a45ebcc7b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code) exit_fs(tsk); check_stack_usage(); exit_thread(); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + * + * because of cgroup mode, must be called before cgroup_exit() + */ + perf_event_exit_task(tsk); + cgroup_exit(tsk, 1); if (group_dead) @@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code) * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f20b6b0..25e429152ddc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> #include <linux/oom.h> +#include <linux/khugepaged.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) retval = ksm_fork(mm, oldmm); if (retval) goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; @@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); @@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags, } /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - - /* * When called from kernel_thread, don't do user tracing stuff. */ if (likely(user_mode(regs))) @@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); diff --git a/kernel/freezer.c b/kernel/freezer.c index bd1d42b17cb2..66ecd2ead215 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only) } if (should_send_signal(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); + fake_signal_wake_up(p); + /* + * fake_signal_wake_up() goes through p's scheduler + * lock and guarantees that TASK_STOPPED/TRACED -> + * TASK_RUNNING transition can't race with task state + * testing in try_to_freeze_tasks(). + */ } else if (sig_only) { return false; } else { diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92e6917..b766d28accd6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *page_head; int err; /* @@ -265,11 +265,46 @@ again: if (err < 0) return err; - page = compound_head(page); - lock_page(page); - if (!page->mapping) { - unlock_page(page); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page_head = page; + if (unlikely(PageTail(page))) { put_page(page); + /* serialize against __split_huge_page_splitting() */ + local_irq_disable(); + if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + page_head = compound_head(page); + /* + * page_head is valid pointer but we must pin + * it before taking the PG_lock and/or + * PG_compound_lock. The moment we re-enable + * irqs __split_huge_page_splitting() can + * return and the head page can be freed from + * under us. We can't take the PG_lock and/or + * PG_compound_lock on a page that could be + * freed from under us. + */ + if (page != page_head) { + get_page(page_head); + put_page(page); + } + local_irq_enable(); + } else { + local_irq_enable(); + goto again; + } + } +#else + page_head = compound_head(page); + if (page != page_head) { + get_page(page_head); + put_page(page); + } +#endif + + lock_page(page_head); + if (!page_head->mapping) { + unlock_page(page_head); + put_page(page_head); goto again; } @@ -280,20 +315,20 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (PageAnon(page_head)) { key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page->mapping->host; - key->shared.pgoff = page->index; + key->shared.inode = page_head->mapping->host; + key->shared.pgoff = page_head->index; } get_futex_key_refs(key); - unlock_page(page); - put_page(page); + unlock_page(page_head); + put_page(page_head); return 0; } @@ -791,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* - * This happens when we have stolen the lock and the original - * pending owner did not enqueue itself back on the rt_mutex. - * Thats not a tragedy. We know that way, that a lock waiter - * is on the fly. We make the futex_q waiter the pending owner. + * It is possible that the next waiter (the one that brought + * this owner to the kernel) timed out and is no longer + * waiting on the lock. */ if (!new_owner) new_owner = this->task; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 45da2b6920ab..0c8d7c048615 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1745,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, } /* - * A NULL parameter means "inifinte" + * A NULL parameter means "infinite" */ if (!expires) { schedule(); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766bf5d2e..8e42fec7686d 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -9,9 +9,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb63306..3540a7190122 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } - -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ - -#ifdef CONFIG_ENABLE_WARN_DEPRECATED -# warning __do_IRQ is deprecated. Please convert to proper flow handlers -#endif - -/** - * __do_IRQ - original all in one highlevel IRQ handler - * @irq: the interrupt number - * - * __do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - * - * This is the original x86 implementation which is used for every - * interrupt type. - */ -unsigned int __do_IRQ(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned int status; - - kstat_incr_irqs_this_cpu(irq, desc); - - if (CHECK_IRQ_PER_CPU(desc->status)) { - irqreturn_t action_ret; - - /* - * No locking required for CPU-local interrupts: - */ - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - if (likely(!(desc->status & IRQ_DISABLED))) { - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - } - desc->irq_data.chip->end(irq); - return 1; - } - - raw_spin_lock(&desc->lock); - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - /* - * REPLAY is when Linux resends an IRQ that was dropped earlier - * WAITING is used by probe to mark irqs that are being tested - */ - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; /* we _want_ to handle it */ - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. - */ - action = NULL; - if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { - action = desc->action; - status &= ~IRQ_PENDING; /* we commit to handling */ - status |= IRQ_INPROGRESS; /* we are handling it */ - } - desc->status = status; - - /* - * If there is no IRQ handler or it was disabled, exit early. - * Since we set PENDING, if another processor is handling - * a different instance of this same irq, the other processor - * will take care of it. - */ - if (unlikely(!action)) - goto out; - - /* - * Edge triggered interrupts need to remember - * pending events. - * This applies to any hw interrupts that allow a second - * instance of the same irq to arrive while we are in do_IRQ - * or in the handler. But the code here only handles the _second_ - * instance of the irq, not the third or fourth. So it is mostly - * useful for irq hardware that does not mask cleanly in an - * SMP environment. - */ - for (;;) { - irqreturn_t action_ret; - - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - if (likely(!(desc->status & IRQ_PENDING))) - break; - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - -out: - /* - * The ->end() handler has to deal with interrupts which got - * disabled while the handler was running. - */ - desc->irq_data.chip->end(irq); - raw_spin_unlock(&desc->lock); - - return 1; -} -#endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03797f5..282f20230e67 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... */ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/kernel/kexec.c b/kernel/kexec.c index b55045bc7563..ec19b92c7ebd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure - * the destination addreses are page aligned. Too many + * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 17110a4a4fc2..ee74b35e528d 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v) seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < MAXLR; i++) { - if (latency_record[i].backtrace[0]) { + struct latency_record *lr = &latency_record[i]; + + if (lr->backtrace[0]) { int q; - seq_printf(m, "%i %lu %lu ", - latency_record[i].count, - latency_record[i].time, - latency_record[i].max); + seq_printf(m, "%i %lu %lu", + lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_SYMBOL_LEN]; - char *c; - if (!latency_record[i].backtrace[q]) + unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (latency_record[i].backtrace[q] == ULONG_MAX) + if (bt == ULONG_MAX) break; - sprint_symbol(sym, latency_record[i].backtrace[q]); - c = strchr(sym, '+'); - if (c) - *c = 0; - seq_printf(m, "%s ", sym); + seq_printf(m, " %ps", (void *)bt); } seq_printf(m, "\n"); } diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d9..0d2058da80f5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) } /* - * Debugging helper: via this flag we know that we are in - * 'early bootup code', and will warn about any invalid irqs-on event: - */ -static int early_boot_irqs_enabled; - -void early_boot_irqs_off(void) -{ - early_boot_irqs_enabled = 0; -} - -void early_boot_irqs_on(void) -{ - early_boot_irqs_enabled = 1; -} - -/* * Hardirqs will be enabled: */ void trace_hardirqs_on_caller(unsigned long ip) @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; if (unlikely(curr->hardirqs_enabled)) { diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a88ebb..991bb87a1704 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; +EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/kernel/params.c b/kernel/params.c index 08107d181758..0da1411222b9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, - unsigned int name_skip) +static struct module_kobject * __init locate_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; @@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name, kobj = kset_find_obj(module_kset, name); if (kobj) { - /* We already have one. Remove params so we can add more. */ mk = to_module_kobject(kobj); - /* We need to remove it before adding parameters. */ - sysfs_remove_group(&mk->kobj, &mk->mp->grp); } else { mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); @@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name, "%s", name); if (err) { kobject_put(&mk->kobj); - printk(KERN_ERR "Module '%s' failed add to sysfs, " - "error number %d\n", name, err); - printk(KERN_ERR "The system will be unstable now.\n"); - return; + printk(KERN_ERR + "Module '%s' failed add to sysfs, error number %d\n", + name, err); + printk(KERN_ERR + "The system will be unstable now.\n"); + return NULL; } - /* So that exit path is even. */ + + /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); } + return mk; +} + +static void __init kernel_add_sysfs_param(const char *name, + struct kernel_param *kparam, + unsigned int name_skip) +{ + struct module_kobject *mk; + int err; + + mk = locate_module_kobject(name); + if (!mk) + return; + + /* We need to remove old parameters before adding more. */ + if (mk->mp) + sysfs_remove_group(&mk->kobj, &mk->mp->grp); + /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); @@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void) } } +ssize_t __modver_version_show(struct module_attribute *mattr, + struct module *mod, char *buf) +{ + struct module_version_attribute *vattr = + container_of(mattr, struct module_version_attribute, mattr); + + return sprintf(buf, "%s\n", vattr->version); +} + +extern struct module_version_attribute __start___modver[], __stop___modver[]; + +static void __init version_sysfs_builtin(void) +{ + const struct module_version_attribute *vattr; + struct module_kobject *mk; + int err; + + for (vattr = __start___modver; vattr < __stop___modver; vattr++) { + mk = locate_module_kobject(vattr->module_name); + if (mk) { + err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); + kobject_uevent(&mk->kobj, KOBJ_ADD); + kobject_put(&mk->kobj); + } + } +} /* module-related sysfs stuff */ @@ -875,6 +917,7 @@ static int __init param_sysfs_init(void) } module_sysfs_initialized = 1; + version_sysfs_builtin(); param_sysfs_builtin(); return 0; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 11847bf1e8cc..126a302c481c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,6 +38,12 @@ #include <asm/irq_regs.h> +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + atomic_t perf_task_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; static atomic64_t perf_event_id; +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void) return "pmu"; } +static inline u64 perf_clock(void) +{ + return local_clock(); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) put_ctx(ctx); } -static inline u64 perf_clock(void) -{ - return local_clock(); -} - /* * Update the record of the current time in a context. */ @@ -256,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx) ctx->timestamp = now; } +static u64 perf_event_time(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + return ctx ? ctx->time : 0; +} + /* * Update the total_time_enabled and total_time_running fields for a event. */ @@ -269,7 +287,7 @@ static void update_event_times(struct perf_event *event) return; if (ctx->is_active) - run_end = ctx->time; + run_end = perf_event_time(event); else run_end = event->tstamp_stopped; @@ -278,7 +296,7 @@ static void update_event_times(struct perf_event *event) if (event->state == PERF_EVENT_STATE_INACTIVE) run_end = event->tstamp_stopped; else - run_end = ctx->time; + run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; } @@ -534,6 +552,7 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); u64 delta; /* * An event which could not be activated because of @@ -545,7 +564,7 @@ event_sched_out(struct perf_event *event, && !event_filter_match(event)) { delta = ctx->time - event->tstamp_stopped; event->tstamp_running += delta; - event->tstamp_stopped = ctx->time; + event->tstamp_stopped = tstamp; } if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -556,7 +575,7 @@ event_sched_out(struct perf_event *event, event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = ctx->time; + event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; @@ -768,6 +787,8 @@ event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); + if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -784,9 +805,9 @@ event_sched_in(struct perf_event *event, return -EAGAIN; } - event->tstamp_running += ctx->time - event->tstamp_stopped; + event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = ctx->time - ctx->timestamp; + event->shadow_ctx_time = tstamp - ctx->timestamp; if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -898,11 +919,13 @@ static int group_can_go_on(struct perf_event *event, static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); + list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = ctx->time; - event->tstamp_running = ctx->time; - event->tstamp_stopped = ctx->time; + event->tstamp_enabled = tstamp; + event->tstamp_running = tstamp; + event->tstamp_stopped = tstamp; } /* @@ -937,7 +960,7 @@ static void __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) goto unlock; /* @@ -1042,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *sub; + u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = ctx->time - event->total_time_enabled; + event->tstamp_enabled = tstamp - event->total_time_enabled; list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; - } + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = tstamp - sub->total_time_enabled; } } @@ -1082,7 +1104,7 @@ static void __perf_event_enable(void *info) goto unlock; __perf_event_mark_enabled(event, ctx); - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) goto unlock; /* @@ -1193,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) @@ -1435,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, list_for_each_entry(event, &ctx->pinned_groups, group_entry) { if (event->state <= PERF_EVENT_STATE_OFF) continue; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; if (group_can_go_on(event, cpuctx, 1)) @@ -1467,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, * Listen to the 'cpu' scheduling filter constraint * of events: */ - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; if (group_can_go_on(event, cpuctx, can_add_hw)) { @@ -1694,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (event->state != PERF_EVENT_STATE_ACTIVE) continue; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; hwc = &event->hw; @@ -2185,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid) if (!task) return ERR_PTR(-ESRCH); - /* - * Can't attach events to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - /* Reuse ptrace permission checks for now. */ err = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ)) @@ -2212,14 +2221,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) unsigned long flags; int ctxn, err; - if (!task && cpu != -1) { + if (!task) { /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - if (cpu < 0 || cpu >= nr_cpumask_bits) - return ERR_PTR(-EINVAL); - /* * We could be clever and allow to attach a event to an * offline CPU and activate it when the CPU comes up, but @@ -2255,14 +2261,27 @@ retry: get_ctx(ctx); - if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ + err = 0; + mutex_lock(&task->perf_event_mutex); + /* + * If it has already passed perf_event_exit_task(). + * we must see PF_EXITING, it takes this mutex too. + */ + if (task->flags & PF_EXITING) + err = -ESRCH; + else if (task->perf_event_ctxp[ctxn]) + err = -EAGAIN; + else + rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + mutex_unlock(&task->perf_event_mutex); + + if (unlikely(err)) { put_task_struct(task); kfree(ctx); - goto retry; + + if (err == -EAGAIN) + goto retry; + goto errout; } } @@ -3893,7 +3912,7 @@ static int perf_event_task_match(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if (event->attr.comm || event->attr.mmap || @@ -4030,7 +4049,7 @@ static int perf_event_comm_match(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if (event->attr.comm) @@ -4178,7 +4197,7 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if ((!executable && event->attr.mmap_data) || @@ -4648,7 +4667,7 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void inline perf_swevent_put_recursion_context(int rctx) +inline void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); @@ -5361,6 +5380,8 @@ free_dev: goto out; } +static struct lock_class_key cpuctx_mutex; + int perf_pmu_register(struct pmu *pmu, char *name, int type) { int cpu, ret; @@ -5409,6 +5430,7 @@ skip_type: cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; @@ -5525,6 +5547,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct hw_perf_event *hwc; long err; + if ((unsigned)cpu >= nr_cpu_ids) { + if (!task || cpu != -1) + return ERR_PTR(-EINVAL); + } + event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); @@ -5573,7 +5600,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) overflow_handler = parent_event->overflow_handler; - + event->overflow_handler = overflow_handler; if (attr->disabled) @@ -6109,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); task_ctx_sched_out(child_ctx, EVENT_ALL); /* @@ -6422,11 +6449,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) unsigned long flags; int ret = 0; - child->perf_event_ctxp[ctxn] = NULL; - - mutex_init(&child->perf_event_mutex); - INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_event_ctxp[ctxn])) return 0; @@ -6478,7 +6500,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); child_ctx = child->perf_event_ctxp[ctxn]; @@ -6486,12 +6507,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn) /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of events and the generation - * count can't have changed since we took the mutex. + * + * Note that if the parent is a clone, the holding of + * parent_ctx->lock avoids it from being uncloned. */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; @@ -6502,6 +6522,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) get_ctx(child_ctx->parent_ctx); } + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); @@ -6516,6 +6537,10 @@ int perf_event_init_task(struct task_struct *child) { int ctxn, ret; + memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); if (ret) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a5aff3ebad38..265729966ece 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n -config SUSPEND_NVS - bool - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE - select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -140,7 +136,6 @@ config HIBERNATION depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS - select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185d..c350e18b53e3 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,4 @@ - -ifeq ($(CONFIG_PM_DEBUG),y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o @@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 048d0b514831..1832bd264219 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -51,18 +51,18 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -static struct platform_hibernation_ops *hibernation_ops; +static const struct platform_hibernation_ops *hibernation_ops; /** * hibernation_set_ops - set the global hibernate operations * @ops: the hibernation operations to use in subsequent hibernation transitions */ -void hibernation_set_ops(struct platform_hibernation_ops *ops) +void hibernation_set_ops(const struct platform_hibernation_ops *ops) { if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore - && ops->restore_cleanup)) { + && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } @@ -278,7 +278,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) + if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) goto Power_up; in_suspend = 1; @@ -516,7 +516,7 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; } @@ -647,6 +647,7 @@ int hibernate(void) swsusp_free(); if (!error) power_down(); + in_suspend = 0; pm_restore_gfp_mask(); } else { pr_debug("PM: Image restored successfully.\n"); diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 1836db60bbb6..000000000000 --- a/kernel/power/nvs.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/suspend.h> - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * suspend and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * suspend_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int suspend_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * suspend_nvs_free - free data pages allocated for saving NVS regions - */ -void suspend_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * suspend_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int suspend_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - suspend_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * suspend_nvs_save - save NVS memory regions - */ -void suspend_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * suspend_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void suspend_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/kernel/power/process.c b/kernel/power/process.c index e50b4c1b2a0f..d6d2a10320e0 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only) * perturb a task in TASK_STOPPED or TASK_TRACED. * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. + * + * Because freeze_task() goes through p's + * scheduler lock after setting TIF_FREEZE, it's + * guaranteed that either we see TASK_RUNNING or + * try_to_stop() after schedule() in ptrace/signal + * stop sees TIF_FREEZE. */ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) @@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only) if (!todo || time_after(jiffies, end_time)) break; - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { wakeup = true; break; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 031d5e3a6197..de6f86bfa303 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_MEM] = "mem", }; -static struct platform_suspend_ops *suspend_ops; +static const struct platform_suspend_ops *suspend_ops; /** * suspend_set_ops - Set the global suspend method table. * @ops: Pointer to ops structure. */ -void suspend_set_ops(struct platform_suspend_ops *ops) +void suspend_set_ops(const struct platform_suspend_ops *ops) { mutex_lock(&pm_mutex); suspend_ops = ops; @@ -164,7 +164,7 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { + if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c7e4832b9be..7c97c3a0eee3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -224,7 +224,7 @@ static int swsusp_swap_check(void) return res; root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); if (res) return res; @@ -888,7 +888,7 @@ out_finish: /** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should - * be written into this memeory location + * be written into this memory location */ int swsusp_read(unsigned int *flags_p) @@ -930,7 +930,8 @@ int swsusp_check(void) { int error; - hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + FMODE_READ, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); diff --git a/kernel/printk.c b/kernel/printk.c index 4642a5c439eb..53d9a9ec88e6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -39,6 +39,7 @@ #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/rculist.h> #include <asm/uaccess.h> @@ -273,12 +274,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * at open time. */ if (type == SYSLOG_ACTION_OPEN || !from_file) { - if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (dmesg_restrict && !capable(CAP_SYSLOG)) + goto warn; /* switch to return -EPERM after 2.6.39 */ if ((type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; + !capable(CAP_SYSLOG)) + goto warn; /* switch to return -EPERM after 2.6.39 */ } error = security_syslog(type); @@ -422,6 +423,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } out: return error; +warn: + /* remove after 2.6.39 */ + if (capable(CAP_SYS_ADMIN)) + WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " + "but no CAP_SYSLOG (deprecated and denied).\n"); + return -EPERM; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) @@ -1496,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper) /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; - list_add_tail(&dumper->list, &dump_list); + list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); @@ -1520,29 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; - list_del(&dumper->list); + list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); + synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char * const kmsg_reasons[] = { - [KMSG_DUMP_OOPS] = "oops", - [KMSG_DUMP_PANIC] = "panic", - [KMSG_DUMP_KEXEC] = "kexec", -}; - -static const char *kmsg_to_str(enum kmsg_dump_reason reason) -{ - if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) - return "unknown"; - - return kmsg_reasons[reason]; -} - /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping @@ -1581,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) l2 = chars; } - if (!spin_trylock_irqsave(&dump_list_lock, flags)) { - printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", - kmsg_to_str(reason)); - return; - } - list_for_each_entry(dumper, &dump_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) dumper->dump(dumper, reason, s1, l1, s2, l2); - spin_unlock_irqrestore(&dump_list_lock, flags); + rcu_read_unlock(); } #endif diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 034493724749..0c343b9a46d5 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -189,7 +189,8 @@ static int rcu_kthread(void *arg) unsigned long flags; for (;;) { - wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); + wait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); morework = rcu_boost(); local_irq_save(flags); work = have_rcu_kthread_work; diff --git a/kernel/sched.c b/kernel/sched.c index a0eb0941fa84..18d38e4ec7ba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -553,9 +553,6 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; - - /* BKL stats */ - unsigned int bkl_count; #endif }; @@ -609,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; + if (p->flags & PF_EXITING) + return &root_task_group; + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -2505,7 +2505,7 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not alredy there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. this_rq() stays locked over invocation. */ @@ -3887,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_count); + schedstat_inc(this_rq(), rq_sched_info.bkl_count); schedstat_inc(prev, sched_info.bkl_count); } #endif @@ -4871,7 +4871,8 @@ recheck: * assigned. */ if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) { + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return -EPERM; @@ -8882,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } } +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + sched_move_task(task); +} + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) @@ -8954,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .destroy = cpu_cgroup_destroy, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, .early_init = 1, diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 32a723b8f84c..9fb656283157 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref) { struct autogroup *ag = container_of(kref, struct autogroup, kref); +#ifdef CONFIG_RT_GROUP_SCHED + /* We've redirected RT tasks to the root task group... */ + ag->tg->rt_se = NULL; + ag->tg->rt_rq = NULL; +#endif sched_destroy_group(ag->tg); } @@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) return ag; } +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg); +#endif + static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void) init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Autogroup RT tasks are redirected to the root task group + * so we don't have to move tasks around upon policy change, + * or flail around trying to allocate bandwidth on the fly. + * A bandwidth exception in __sched_setscheduler() allows + * the policy change to proceed. Thereafter, task_group() + * returns &root_task_group, so zero bandwidth is required. + */ + free_rt_sched_group(tg); + tg->rt_se = root_task_group.rt_se; + tg->rt_rq = root_task_group.rt_rq; +#endif tg->autogroup = ag; return ag; @@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return tg != &root_task_group && tg->autogroup; +} + static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { @@ -231,6 +258,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (!enabled || !tg->autogroup) + return 0; + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } #endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 5358e241cb20..7b859ffe5dad 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg); static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1dfae3d014b5..eb6cb8edd075 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -16,6 +16,8 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> +static DEFINE_SPINLOCK(sched_debug_lock); + /* * This allows printing both to /proc/sched_debug and * to the console @@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } #endif +#ifdef CONFIG_CGROUP_SCHED +static char group_path[PATH_MAX]; + +static char *task_group_path(struct task_group *tg) +{ + if (autogroup_path(tg, group_path, PATH_MAX)) + return group_path; + + /* + * May be NULL if the underlying cgroup isn't fully-created yet + */ + if (!tg->css.cgroup) { + group_path[0] = '\0'; + return group_path; + } + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif +#ifdef CONFIG_CGROUP_SCHED + SEQ_printf(m, " %s", task_group_path(task_group(p))); +#endif SEQ_printf(m, "\n"); } @@ -144,7 +169,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { +#ifdef CONFIG_RT_GROUP_SCHED + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) @@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; #ifdef CONFIG_X86 { @@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu) P(ttwu_count); P(ttwu_local); - P(bkl_count); + SEQ_printf(m, " .%-30s: %d\n", "bkl_count", + rq->rq_sched_info.bkl_count); #undef P +#undef P64 #endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + rcu_read_lock(); print_rq(m, rq, cpu); + rcu_read_unlock(); + spin_unlock_irqrestore(&sched_debug_lock, flags); } static const char *sched_tunable_scaling_names[] = { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c62ebae65cf0..354769979c02 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_FAIR_GROUP_SCHED +# ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) { @@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + long load_weight, load, shares; + + load = cfs_rq->load.weight + weight_delta; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + return shares; +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} +# else /* CONFIG_SMP */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + return tg->shares; +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +# endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -782,7 +828,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { struct task_group *tg; struct sched_entity *se; - long load_weight, load, shares; + long shares; if (!cfs_rq) return; @@ -791,32 +837,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) se = tg->se[cpu_of(rq_of(cfs_rq))]; if (!se) return; - - load = cfs_rq->load.weight + weight_delta; - - load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; - load_weight += load; - - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; +#ifndef CONFIG_SMP + if (likely(se->load.weight == tg->shares)) + return; +#endif + shares = calc_cfs_shares(cfs_rq, tg, weight_delta); reweight_entity(cfs_rq_of(se), se, shares); } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); - } -} #else /* CONFIG_FAIR_GROUP_SCHED */ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { @@ -1062,6 +1090,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se = __pick_next_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; + if (delta < 0) + return; + if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } @@ -1362,27 +1393,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long S, rw, s, a, b; + long lw, w; - S = se->my_q->tg->shares; - s = se->load.weight; - rw = se->my_q->load.weight; + tg = se->my_q->tg; + w = se->my_q->load.weight; - a = S*(rw + wl); - b = S*rw + s*wg; + /* use this cpu's instantaneous contribution */ + lw = atomic_read(&tg->load_weight); + lw -= se->my_q->load_contribution; + lw += w + wg; - wl = s*(a-b); + wl += w; - if (likely(b)) - wl /= b; + if (lw > 0 && wl < lw) + wl = (wl * tg->shares) / lw; + else + wl = tg->shares; - /* - * Assume the group is already running and will - * thus already be accounted for in the weight. - * - * That is, moving shares between CPUs, does not - * alter the group weight. - */ + /* zero point is MIN_SHARES */ + if (wl < MIN_SHARES) + wl = MIN_SHARES; + wl -= se->load.weight; wg = 0; } diff --git a/kernel/smp.c b/kernel/smp.c index 12ed8b013e2d..9910744f0856 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@ #include <linux/smp.h> #include <linux/cpu.h> +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS static struct { struct list_head queue; raw_spinlock_t lock; @@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void) */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + void (*func) (void *info); - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) + /* + * Since we walk the list without any locks, we might + * see an entry that was completed, removed from the + * list and is in the process of being reused. + * + * We must check that the cpu is in the cpumask before + * checking the refs, and both must be set before + * executing the callback on this cpu. + */ + + if (!cpumask_test_cpu(cpu, data->cpumask)) + continue; + + smp_rmb(); + + if (atomic_read(&data->refs) == 0) continue; + func = data->csd.func; /* for later warn */ data->csd.func(data->csd.info); + /* + * If the cpu mask is not still set then it enabled interrupts, + * we took another smp interrupt, and executed the function + * twice on this cpu. In theory that copy decremented refs. + */ + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { + WARN(1, "%pS enabled interrupts and double executed\n", + func); + continue; + } + refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); - if (!refs) { - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - } if (refs) continue; + WARN_ON(!cpumask_empty(data->cpumask)); + + raw_spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + raw_spin_unlock(&call_function.lock); + csd_unlock(&data->csd); } @@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask, * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + && !oops_in_progress && !early_boot_irqs_disabled); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); + + /* + * To ensure the interrupt handler gets an complete view + * we order the cpumask and refs writes and order the read + * of them in the interrupt handler. In addition we may + * only clear our own cpu bit from the mask. + */ + smp_wmb(); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_lock_irqsave(&call_function.lock, flags); @@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void) { raw_spin_unlock_irq(&call_function.lock); } +#endif /* USE_GENERIC_SMP_HELPERS */ + +/* + * Call a function on all processors. May be used during early boot while + * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead + * of local_irq_disable/enable(). + */ +int on_each_cpu(void (*func) (void *info), void *info, int wait) +{ + unsigned long flags; + int ret = 0; + + preempt_disable(); + ret = smp_call_function(func, info, wait); + local_irq_save(flags); + func(info); + local_irq_restore(flags); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index 0823778f87fc..68eb5efec388 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -885,25 +885,6 @@ static __init int spawn_ksoftirqd(void) } early_initcall(spawn_ksoftirqd); -#ifdef CONFIG_SMP -/* - * Call a function on all processors - */ -int on_each_cpu(void (*func) (void *info), void *info, int wait) -{ - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, wait); - local_irq_disable(); - func(info); - local_irq_enable(); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL(on_each_cpu); -#endif - /* * [ These __weak aliases are kept in a separate compilation unit, so that * GCC does not inline them incorrectly. ] diff --git a/kernel/srcu.c b/kernel/srcu.c index 98d8c1e80edb..73ce23feaea9 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after 10 microseconds, + * we repeatedly block for 1-millisecond time periods. This approach + * has done well in testing, so there is no need for a config parameter. + */ +#define SYNCHRONIZE_SRCU_READER_DELAY 10 + +/* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) @@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) * will have finished executing. We initially give readers * an arbitrarily chosen 10 microseconds to get out of their * SRCU read-side critical sections, then loop waiting 1/HZ - * seconds per iteration. + * seconds per iteration. The 10-microsecond value has done + * very well in testing. */ if (srcu_readers_active_idx(sp, idx)) - udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); + udelay(SYNCHRONIZE_SRCU_READER_DELAY); while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); diff --git a/kernel/sys.c b/kernel/sys.c index 2745dcdb6c6c..31b71a276b40 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -43,6 +43,8 @@ #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/kmsg_dump.h> + #include <asm/uaccess.h> #include <asm/io.h> #include <asm/unistd.h> @@ -285,6 +287,7 @@ out_unlock: */ void emergency_restart(void) { + kmsg_dump(KMSG_DUMP_EMERG); machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); @@ -312,6 +315,7 @@ void kernel_restart(char *cmd) printk(KERN_EMERG "Restarting system.\n"); else printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -333,6 +337,7 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); sysdev_shutdown(); printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); machine_halt(); } @@ -351,6 +356,7 @@ void kernel_power_off(void) disable_nonboot_cpus(); sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae5cbb1e3ced..bc86bb32e126 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/signal.h> +#include <linux/printk.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> @@ -245,10 +246,6 @@ static struct ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -710,6 +707,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, #endif { .procname = "ngroups_max", @@ -962,10 +968,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -1326,11 +1328,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif - -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -1486,10 +1483,6 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -2899,7 +2892,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } } -#else /* CONFIG_PROC_FS */ +#else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2951,7 +2944,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, } -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_PROC_SYSCTL */ /* * No sense putting this after each symbol definition, twice, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 4b2545a136ff..b875bedf7c9a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1192,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file, buf[result] = '\0'; - /* Convert the decnet addresss to binary */ + /* Convert the decnet address to binary */ result = -EIO; nodep = strchr(buf, '.') + 1; if (!nodep) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 69691eb4b715..3971c6b9d58d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -348,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } -#ifdef CONFIG_IA64 +#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define TASKSTATS_NEEDS_PADDING 1 #endif diff --git a/kernel/time.c b/kernel/time.c index ba9b338d1835..32174359576f 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time); * Avoid unnecessary multiplications/divisions in the * two most common HZ cases: */ -unsigned int inline jiffies_to_msecs(const unsigned long j) +inline unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; @@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); -unsigned int inline jiffies_to_usecs(const unsigned long j) +inline unsigned int jiffies_to_usecs(const unsigned long j) { #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index df140cd3ea47..6519cf62d9cd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * @shift: pointer to shift variable * @from: frequency to convert from * @to: frequency to convert to - * @minsec: guaranteed runtime conversion range in seconds + * @maxsec: guaranteed runtime conversion range in seconds * * The function evaluates the shift/mult pair for the scaled math * operations of clocksources and clockevents. @@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock * event @to is the counter frequency and @from is NSEC_PER_SEC. * - * The @minsec conversion range argument controls the time frame in + * The @maxsec conversion range argument controls the time frame in * seconds which must be covered by the runtime conversion with the * calculated mult and shift factors. This guarantees that no 64bit * overflow happens when the input value of the conversion is @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * factors. */ void -clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) { u64 tmp; u32 sft, sftacc= 32; @@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) * Calculate the shift factor which is limiting the conversion * range: */ - tmp = ((u64)minsec * from) >> 32; + tmp = ((u64)maxsec * from) >> 32; while (tmp) { tmp >>=1; sftacc--; @@ -679,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* Intialize mult/shift and max_idle_ns */ + /* Initialize mult/shift and max_idle_ns */ __clocksource_updatefreq_scale(cs, scale, freq); /* Add clocksource to the clcoksource list */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d2321891538f..5c00242fa921 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -14,6 +14,7 @@ #include <linux/timex.h> #include <linux/time.h> #include <linux/mm.h> +#include <linux/module.h> /* * NTP timekeeping variables: @@ -74,6 +75,162 @@ static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) + && !(time_status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((time_status & STA_PPSFREQ) + && (time_status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if (!(time_status & STA_NANO)) + txc->jitter /= NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + /* * NTP methods: */ @@ -185,6 +342,9 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + + /* Clear PPS state variables */ + pps_clear(); } /* @@ -250,16 +410,16 @@ void second_overflow(void) time_status |= STA_UNSYNC; } - /* - * Compute the phase adjustment for the next second. The offset is - * reduced by a fixed factor times the time constant. - */ + /* Compute the phase adjustment for the next second */ tick_length = tick_length_base; - delta = shift_right(time_offset, SHIFT_PLL + time_constant); + delta = ntp_offset_chunk(time_offset); time_offset -= delta; tick_length += delta; + /* Check PPS signal */ + pps_dec_valid(); + if (!time_adjust) return; @@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + /* restart PPS frequency calibration */ + pps_reset_freq_interval(); } /* @@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); + /* update pps_freq */ + pps_set_freq(time_freq); } if (txc->modes & ADJ_MAXERROR) @@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc) } result = time_state; /* mostly `TIME_OK' */ - if (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* check for errors */ + if (is_error_status(time_status)) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * @@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc) txc->tick = tick_usec; txc->tai = time_tai; - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; + /* fill PPS status fields */ + pps_fill_timex(txc); write_sequnlock_irq(&xtime_lock); @@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc) return result; } +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + __kernel_time_t sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ + if (--pps_intcnt <= -PPS_INTCOUNT) { + pps_intcnt = -PPS_INTCOUNT; + if (pps_shift > PPS_INTMIN) { + pps_shift--; + pps_intcnt = 0; + } + } +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. + */ +static inline void pps_inc_freq_interval(void) +{ + if (++pps_intcnt >= PPS_INTCOUNT) { + pps_intcnt = PPS_INTCOUNT; + if (pps_shift < PPS_INTMAX) { + pps_shift++; + pps_intcnt = 0; + } + } +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ + long delta, delta_mod; + s64 ftemp; + + /* check if the frequency interval was too long */ + if (freq_norm.sec > (2 << pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + pps_dec_freq_interval(); + pr_err("hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); + return 0; + } + + /* here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * the interval is increased; otherwise it is decreased. + */ + ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, + freq_norm.sec); + delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); + pps_freq = ftemp; + if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { + pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); + time_status |= STA_PPSWANDER; + pps_stbcnt++; + pps_dec_freq_interval(); + } else { /* good sample */ + pps_inc_freq_interval(); + } + + /* the stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring + */ + delta_mod = delta; + if (delta_mod < 0) + delta_mod = -delta_mod; + pps_stabil += (div_s64(((s64)delta_mod) << + (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + + /* if enabled, the system clock frequency is updated */ + if ((time_status & STA_PPSFREQ) != 0 && + (time_status & STA_FREQHOLD) == 0) { + time_freq = pps_freq; + ntp_update_frequency(); + } + + return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ + long correction = -error; + long jitter; + + /* add the sample to the median filter */ + pps_phase_filter_add(correction); + correction = pps_phase_filter_get(&jitter); + + /* Nominal jitter is due to PPS signal noise. If it exceeds the + * threshold, the sample is discarded; otherwise, if so enabled, + * the time offset is updated. + */ + if (jitter > (pps_jitter << PPS_POPCORN)) { + pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + /* correct the time using the phase offset */ + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* cancel running adjtime() */ + time_adjust = 0; + } + /* update jitter */ + pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + struct pps_normtime pts_norm, freq_norm; + unsigned long flags; + + pts_norm = pps_normalize_ts(*phase_ts); + + write_seqlock_irqsave(&xtime_lock, flags); + + /* clear the error bits, they will be set again if needed */ + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + + /* indicate signal presence */ + time_status |= STA_PPSSIGNAL; + pps_valid = PPS_VALID; + + /* when called for the first time, + * just start the frequency interval */ + if (unlikely(pps_fbase.tv_sec == 0)) { + pps_fbase = *raw_ts; + write_sequnlock_irqrestore(&xtime_lock, flags); + return; + } + + /* ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + + /* check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ + if ((freq_norm.sec == 0) || + (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + time_status |= STA_PPSJITTER; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + write_sequnlock_irqrestore(&xtime_lock, flags); + pr_err("hardpps: PPSJITTER: bad pulse\n"); + return; + } + + /* signal is ok */ + + /* check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << pps_shift)) { + pps_calcnt++; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + hardpps_update_freq(freq_norm); + } + + hardpps_update_phase(pts_norm.nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); +} +EXPORT_SYMBOL(hardpps); + +#endif /* CONFIG_NTP_PPS */ + static int __init ntp_tick_adj_setup(char *str) { ntp_tick_adj = simple_strtol(str, NULL, 0); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd1..c55ea2433471 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void) } local_irq_enable(); - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", - smp_processor_id()); + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); } /* @@ -795,8 +794,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); + } #endif } #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5bb86da82003..d27c7562902c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -49,7 +49,7 @@ struct timekeeper { u32 mult; }; -struct timekeeper timekeeper; +static struct timekeeper timekeeper; /** * timekeeper_setup_internals - Set up internals to use clocksource clock. @@ -164,7 +164,7 @@ static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ -struct timespec raw_time; +static struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); +#ifdef CONFIG_NTP_PPS + +/** + * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * @ts_raw: pointer to the timespec to be set to raw monotonic time + * @ts_real: pointer to the timespec to be set to the time of day + * + * This function reads both the time of day and raw monotonic time at the + * same time atomically and stores the resulting timestamps in timespec + * format. + */ +void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +{ + unsigned long seq; + s64 nsecs_raw, nsecs_real; + + WARN_ON_ONCE(timekeeping_suspended); + + do { + u32 arch_offset; + + seq = read_seqbegin(&xtime_lock); + + *ts_raw = raw_time; + *ts_real = xtime; + + nsecs_raw = timekeeping_get_ns_raw(); + nsecs_real = timekeeping_get_ns(); + + /* If arch requires, add in gettimeoffset() */ + arch_offset = arch_gettimeoffset(); + nsecs_raw += arch_offset; + nsecs_real += arch_offset; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts_raw, nsecs_raw); + timespec_add_ns(ts_real, nsecs_real); +} +EXPORT_SYMBOL(getnstime_raw_and_real); + +#endif /* CONFIG_NTP_PPS */ + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b26..761c510a06c5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_EVENT_TRACING) += power-traces.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7b8ec0281548..153562d0b93c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -758,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore, * @q: queue the io is for * @bio: the source bio * @what: the action + * @error: error, if any * * Description: * Records an action against a bio. Will log the bio offset + size. * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) + u32 what, int error) { struct blk_trace *bt = q->blk_trace; if (likely(!bt)) return; + if (!error && !bio_flagged(bio, BIO_UPTODATE)) + error = EIO; + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - !bio_flagged(bio, BIO_UPTODATE), 0, NULL); + error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio) + struct request_queue *q, struct bio *bio, + int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } static void blk_add_trace_bio_backmerge(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, @@ -812,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -827,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -887,7 +892,7 @@ static void blk_add_trace_split(void *ignore, } /** - * blk_add_trace_remap - Add a trace for a remap operation + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio @@ -899,9 +904,9 @@ static void blk_add_trace_split(void *ignore, * it spans a stripe (or similar). Add a trace for that action. * **/ -static void blk_add_trace_remap(void *ignore, - struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_bio_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -1016,7 +1021,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap, NULL); + ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); WARN_ON(ret); ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); @@ -1025,7 +1030,7 @@ static void blk_register_tracepoints(void) static void blk_unregister_tracepoints(void) { unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); - unregister_trace_block_remap(blk_add_trace_remap, NULL); + unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8cf959bad45..dc53ecb80589 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) __this_cpu_inc(user_stack_count); - - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) - return; + goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; @@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + out_drop_count: __this_cpu_dec(user_stack_count); - out: preempt_enable(); } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfecaf13e6..6cf223764be8 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -53,7 +53,7 @@ */ /* - * Function trace entry - function address and parent function addres: + * Function trace entry - function address and parent function address: */ FTRACE_ENTRY(function, ftrace_entry, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b880..92b6e1e12d98 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) * Stubs: */ -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} - void trace_softirqs_on(unsigned long ip) { } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f0cfb5..b706529b4fc7 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); -/* All syscall exit events have the same fields */ -static LIST_HEAD(syscall_exit_fields); - static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call) return &entry->enter_fields; } -static struct list_head * -syscall_get_exit_fields(struct ftrace_event_call *call) -{ - return &syscall_exit_fields; -} - struct trace_event_functions enter_syscall_print_funcs = { - .trace = print_syscall_enter, + .trace = print_syscall_enter, }; struct trace_event_functions exit_syscall_print_funcs = { - .trace = print_syscall_exit, + .trace = print_syscall_exit, }; struct ftrace_event_class event_class_syscall_enter = { - .system = "syscalls", - .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, - .get_fields = syscall_get_enter_fields, - .raw_init = init_syscall_trace, + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, }; struct ftrace_event_class event_class_syscall_exit = { - .system = "syscalls", - .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, - .get_fields = syscall_get_exit_fields, - .raw_init = init_syscall_trace, + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, }; extern unsigned long __start_syscalls_metadata[]; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 25915832291a..9da289c34f22 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -12,6 +12,8 @@ #include <linux/highuid.h> #include <linux/cred.h> +static struct kmem_cache *user_ns_cachep __read_mostly; + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -26,7 +28,7 @@ int create_user_ns(struct cred *new) struct user_struct *root_user; int n; - ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); + ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); if (!ns) return -ENOMEM; @@ -38,7 +40,7 @@ int create_user_ns(struct cred *new) /* Alloc new root user. */ root_user = alloc_uid(ns, 0); if (!root_user) { - kfree(ns); + kmem_cache_free(user_ns_cachep, ns); return -ENOMEM; } @@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work) struct user_namespace *ns = container_of(work, struct user_namespace, destroyer); free_uid(ns->creator); - kfree(ns); + kmem_cache_free(user_ns_cachep, ns); } void free_user_ns(struct kref *kref) @@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t /* No useful relationship so no mapping */ return overflowgid; } + +static __init int user_namespaces_init(void) +{ + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + return 0; +} +module_init(user_namespaces_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ee6ec82f88a..11869faa6819 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) worker->flags &= ~flags; - /* if transitioning out of NOT_RUNNING, increment nr_running */ + /* + * If transitioning out of NOT_RUNNING, increment nr_running. Note + * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask + * of multiple flags, not a single flag. + */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(get_gcwq_nr_running(gcwq->cpu)); @@ -1840,7 +1844,7 @@ __acquires(&gcwq->lock) spin_unlock_irq(&gcwq->lock); work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); f(work); @@ -2384,8 +2388,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); - lock_map_acquire(&cwq->wq->lockdep_map); + /* + * If @max_active is 1 or rescuer is in use, flushing another work + * item on the same workqueue may lead to deadlock. Make sure the + * flusher is not running on the same workqueue by verifying write + * access. + */ + if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) + lock_map_acquire(&cwq->wq->lockdep_map); + else + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); + return true; already_gone: spin_unlock_irq(&gcwq->lock); |