Diffstat (limited to 'kernel')
35 files changed, 1580 insertions, 486 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index cb05cd05d237..ff4dc02ce170 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o +obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/kernel/acct.c b/kernel/acct.c index 4168f631868e..b756f527497e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -165,7 +165,7 @@ out: } /* - * Close the old accouting file (if currently open) and then replace + * Close the old accounting file (if currently open) and then replace * it with file (if non-NULL). * * NOTE: acct_globals.lock MUST be held on entry and exit. @@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file) } } -/* - * sys_acct() is the only system call needed to implement process - * accounting. It takes the name of the file where accounting records - * should be written. If the filename is NULL, accounting will be - * shutdown. +/** + * sys_acct - enable/disable process accounting + * @name: file name for accounting records or NULL to shutdown accounting + * + * Returns 0 for success or negative errno values for failure. + * + * sys_acct() is the only system call needed to implement process + * accounting. It takes the name of the file where accounting records + * should be written. If the filename is NULL, accounting will be + * shutdown. */ asmlinkage long sys_acct(const char __user *name) { @@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name) return (PTR_ERR(tmp)); } /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(tmp, O_WRONLY|O_APPEND, 0); + file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); putname(tmp); if (IS_ERR(file)) { return (PTR_ERR(file)); @@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name) return (0); } -/* - * If the accouting is turned on for a file in the filesystem pointed - * to by sb, turn accouting off. +/** + * acct_auto_close - turn off a filesystem's accounting if it is on + * @sb: super block for the filesystem + * + * If the accounting is turned on for a file in the filesystem pointed + * to by sb, turn accounting off. 
*/ void acct_auto_close(struct super_block *sb) { @@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file) set_fs(fs); } -/* +/** * acct_process - now just a wrapper around do_acct_process + * @exitcode: task exit code + * + * handles process accounting for an exiting task */ void acct_process(long exitcode) { @@ -530,9 +541,9 @@ void acct_process(long exitcode) } -/* - * acct_update_integrals - * - update mm integral fields in task_struct +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting */ void acct_update_integrals(struct task_struct *tsk) { @@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk) } } -/* - * acct_clear_integrals - * - clear the mm integral fields in task_struct +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared */ void acct_clear_integrals(struct task_struct *tsk) { diff --git a/kernel/audit.c b/kernel/audit.c index 8376ec10cf24..83096b67510a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -513,7 +513,8 @@ static int __init audit_init(void) { printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, + THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); diff --git a/kernel/compat.c b/kernel/compat.c index ddfcaaa86623..102296e21ea8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) if (!time_after(expire, now)) return 0; - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire - now); + expire = schedule_timeout_interruptible(expire - now); if (expire == 0) return 0; @@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, return -EINVAL; expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + expire = schedule_timeout_interruptible(expire); if (expire == 0) return 0; @@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); spin_lock_irq(&current->sighand->siglock); sig = dequeue_signal(current, &s, &info); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8ab1b4e518b8..79866bc6b3a1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL; */ static DECLARE_MUTEX(cpuset_sem); +static struct task_struct *cpuset_sem_owner; +static int cpuset_sem_depth; + +/* + * The global cpuset semaphore cpuset_sem can be needed by the + * memory allocator to update a tasks mems_allowed (see the calls + * to cpuset_update_current_mems_allowed()) or to walk up the + * cpuset hierarchy to find a mem_exclusive cpuset see the calls + * to cpuset_excl_nodes_overlap()). + * + * But if the memory allocation is being done by cpuset.c code, it + * usually already holds cpuset_sem. Double tripping on a kernel + * semaphore deadlocks the current task, and any other task that + * subsequently tries to obtain the lock.
+ * + * Run all up's and down's on cpuset_sem through the following + * wrappers, which will detect this nested locking, and avoid + * deadlocking. + */ + +static inline void cpuset_down(struct semaphore *psem) +{ + if (cpuset_sem_owner != current) { + down(psem); + cpuset_sem_owner = current; + } + cpuset_sem_depth++; +} + +static inline void cpuset_up(struct semaphore *psem) +{ + if (--cpuset_sem_depth == 0) { + cpuset_sem_owner = NULL; + up(psem); + } +} /* * A couple of forward declarations required, due to cyclic reference loop: @@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Refresh current tasks mems_allowed and mems_generation from * current tasks cpuset. Call with cpuset_sem held. * - * Be sure to call refresh_mems() on any cpuset operation which - * (1) holds cpuset_sem, and (2) might possibly alloc memory. - * Call after obtaining cpuset_sem lock, before any possible - * allocation. Otherwise one risks trying to allocate memory - * while the task cpuset_mems_generation is not the same as - * the mems_generation in its cpuset, which would deadlock on - * cpuset_sem in cpuset_update_current_mems_allowed(). - * - * Since we hold cpuset_sem, once refresh_mems() is called, the - * test (current->cpuset_mems_generation != cs->mems_generation) - * in cpuset_update_current_mems_allowed() will remain false, - * until we drop cpuset_sem. Anyone else who would change our - * cpusets mems_generation needs to lock cpuset_sem first. + * This routine is needed to update the per-task mems_allowed + * data, within the tasks context, when it is trying to allocate + * memory (in various mm/mempolicy.c routines) and notices + * that some other task has been modifying its cpuset. */ static void refresh_mems(void) @@ -628,13 +655,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ -/* - * Hack to avoid 2.6.13 partial node dynamic sched domain bug. - * Disable letting 'cpu_exclusive' cpusets define dynamic sched - * domains, until the sched domain can handle partial nodes. - * Remove this #if hackery when sched domains fixed. 
- */ -#if 0 static void update_cpu_domains(struct cpuset *cur) { struct cpuset *c, *par = cur->parent; @@ -675,11 +695,6 @@ static void update_cpu_domains(struct cpuset *cur) partition_sched_domains(&pspan, &cspan); unlock_cpu_hotplug(); } -#else -static void update_cpu_domains(struct cpuset *cur) -{ -} -#endif static int update_cpumask(struct cpuset *cs, char *buf) { @@ -852,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - down(&cpuset_sem); + cpuset_down(&cpuset_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -886,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - up(&cpuset_sem); + cpuset_up(&cpuset_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -926,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); mask = cs->cpus_allowed; - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -937,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); mask = cs->mems_allowed; - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -984,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, *s++ = '\n'; *s = '\0'; + /* Do nothing if *ppos is at the eof or beyond the eof. */ + if (s - page <= *ppos) + return 0; + start = page + *ppos; n = s - start; retval = n - copy_to_user(buf, start, min(n, nbytes)); @@ -1342,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - down(&cpuset_sem); - refresh_mems(); + cpuset_down(&cpuset_sem); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1368,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. 
*/ - up(&cpuset_sem); + cpuset_up(&cpuset_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); kfree(cs); return err; } @@ -1397,14 +1415,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - down(&cpuset_sem); - refresh_mems(); + cpuset_down(&cpuset_sem); if (atomic_read(&cs->count) > 0) { - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return -EBUSY; } parent = cs->parent; @@ -1420,7 +1437,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1523,10 +1540,10 @@ void cpuset_exit(struct task_struct *tsk) if (notify_on_release(cs)) { char *pathbuf = NULL; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -1547,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return mask; } @@ -1576,9 +1593,9 @@ void cpuset_update_current_mems_allowed(void) if (!cs) return; /* task is exiting */ if (current->cpuset_mems_generation != cs->mems_generation) { - down(&cpuset_sem); + cpuset_down(&cpuset_sem); refresh_mems(); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); } } @@ -1611,17 +1628,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 0; } +/* + * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive + * ancestor to the specified cpuset. Call while holding cpuset_sem. + * If no ancestor is mem_exclusive (an unusual configuration), then + * returns the root cpuset. + */ +static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) +{ + while (!is_mem_exclusive(cs) && cs->parent) + cs = cs->parent; + return cs; +} + /** - * cpuset_zone_allowed - is zone z allowed in current->mems_allowed - * @z: zone in question + * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * @z: is this zone on an allowed node? + * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) * - * Is zone z allowed in current->mems_allowed, or is - * the CPU in interrupt context? (zone is always allowed in this case) - */ -int cpuset_zone_allowed(struct zone *z) + * If we're in interrupt, yes, we can always allocate. If zone + * z's node is in our tasks mems_allowed, yes. If it's not a + * __GFP_HARDWALL request and this zone's nodes is in the nearest + * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * Otherwise, no. + * + * GFP_USER allocations are marked with the __GFP_HARDWALL bit, + * and do not allow allocations outside the current tasks cpuset. + * GFP_KERNEL allocations are not so marked, so can escape to the + * nearest mem_exclusive ancestor cpuset. + * + * Scanning up parent cpusets requires cpuset_sem. 
The __alloc_pages() + * routine only calls here with __GFP_HARDWALL bit _not_ set if + * it's a GFP_KERNEL allocation, and all nodes in the current tasks + * mems_allowed came up empty on the first pass over the zonelist. + * So only GFP_KERNEL allocations, if all nodes in the cpuset are + * short of memory, might require taking the cpuset_sem semaphore. + * + * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() + * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing + * hardwall cpusets - no allocation on a node outside the cpuset is + * allowed (unless in interrupt, of course). + * + * The second loop doesn't even call here for GFP_ATOMIC requests + * (if the __alloc_pages() local variable 'wait' is set). That check + * and the checks below have the combined affect in the second loop of + * the __alloc_pages() routine that: + * in_interrupt - any node ok (current task context irrelevant) + * GFP_ATOMIC - any node ok + * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok + * GFP_USER - only nodes in current tasks mems allowed ok. + **/ + +int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) +{ + int node; /* node that zone z is on */ + const struct cpuset *cs; /* current cpuset ancestors */ + int allowed = 1; /* is allocation in zone z allowed? */ + + if (in_interrupt()) + return 1; + node = z->zone_pgdat->node_id; + if (node_isset(node, current->mems_allowed)) + return 1; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return 0; + + /* Not hardwall and node outside mems_allowed: scan up cpusets */ + cpuset_down(&cpuset_sem); + cs = current->cpuset; + if (!cs) + goto done; /* current task exiting */ + cs = nearest_exclusive_ancestor(cs); + allowed = node_isset(node, cs->mems_allowed); +done: + cpuset_up(&cpuset_sem); + return allowed; +} + +/** + * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? + * @p: pointer to task_struct of some other task. + * + * Description: Return true if the nearest mem_exclusive ancestor + * cpusets of tasks @p and current overlap. Used by oom killer to + * determine if task @p's memory usage might impact the memory + * available to the current task. + * + * Acquires cpuset_sem - not suitable for calling from a fast path. + **/ + +int cpuset_excl_nodes_overlap(const struct task_struct *p) { - return in_interrupt() || - node_isset(z->zone_pgdat->node_id, current->mems_allowed); + const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ + int overlap = 0; /* do cpusets overlap? 
*/ + + cpuset_down(&cpuset_sem); + cs1 = current->cpuset; + if (!cs1) + goto done; /* current task exiting */ + cs2 = p->cpuset; + if (!cs2) + goto done; /* task p is exiting */ + cs1 = nearest_exclusive_ancestor(cs1); + cs2 = nearest_exclusive_ancestor(cs2); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); +done: + cpuset_up(&cpuset_sem); + + return overlap; } /* @@ -1642,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); task_lock(tsk); cs = tsk->cpuset; task_unlock(tsk); @@ -1657,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - up(&cpuset_sem); + cpuset_up(&cpuset_sem); kfree(buf); return retval; } diff --git a/kernel/exit.c b/kernel/exit.c index 5b0fb9f09f21..6d2089a1bce7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize); static inline void close_files(struct files_struct * files) { int i, j; + struct fdtable *fdt; j = 0; + fdt = files_fdtable(files); for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= files->max_fdset || i >= files->max_fds) + if (i >= fdt->max_fdset || i >= fdt->max_fds) break; - set = files->open_fds->fds_bits[j++]; + set = fdt->open_fds->fds_bits[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&files->fd[i], NULL); + struct file * file = xchg(&fdt->fd[i], NULL); if (file) filp_close(file, files); } @@ -403,18 +405,22 @@ struct files_struct *get_files_struct(struct task_struct *task) void fastcall put_files_struct(struct files_struct *files) { + struct fdtable *fdt; + if (atomic_dec_and_test(&files->count)) { close_files(files); /* * Free the fd and fdset arrays if we expanded them. + * If the fdtable was embedded, pass files for freeing + * at the end of the RCU grace period. Otherwise, + * you can free files immediately. 
*/ - if (files->fd != &files->fd_array[0]) - free_fd_array(files->fd, files->max_fds); - if (files->max_fdset > __FD_SETSIZE) { - free_fdset(files->open_fds, files->max_fdset); - free_fdset(files->close_on_exec, files->max_fdset); - } - kmem_cache_free(files_cachep, files); + fdt = files_fdtable(files); + if (fdt == &files->fdtab) + fdt->free_files = files; + else + kmem_cache_free(files_cachep, files); + free_fdtable(fdt); } } diff --git a/kernel/fork.c b/kernel/fork.c index b65187f0c74e..8149f3602881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -35,6 +35,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/futex.h> +#include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> @@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); return tsk; } @@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) return 0; } -static int count_open_files(struct files_struct *files, int size) +static int count_open_files(struct fdtable *fdt) { + int size = fdt->max_fdset; int i; /* Find the last open fd */ for (i = size/(8*sizeof(long)); i > 0; ) { - if (files->open_fds->fds_bits[--i]) + if (fdt->open_fds->fds_bits[--i]) break; } i = (i+1) * 8 * sizeof(long); return i; } +static struct files_struct *alloc_files(void) +{ + struct files_struct *newf; + struct fdtable *fdt; + + newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + fdt = &newf->fdtab; + fdt->next_fd = 0; + fdt->max_fds = NR_OPEN_DEFAULT; + fdt->max_fdset = __FD_SETSIZE; + fdt->close_on_exec = &newf->close_on_exec_init; + fdt->open_fds = &newf->open_fds_init; + fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&fdt->rcu); + fdt->free_files = NULL; + fdt->next = NULL; + rcu_assign_pointer(newf->fdt, fdt); +out: + return newf; +} + static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; struct file **old_fds, **new_fds; int open_files, size, i, error = 0, expand; + struct fdtable *old_fdt, *new_fdt; /* * A background process may not have any files ... @@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) */ tsk->files = NULL; error = -ENOMEM; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); - if (!newf) + newf = alloc_files(); + if (!newf) goto out; - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - newf->max_fds = NR_OPEN_DEFAULT; - newf->max_fdset = __FD_SETSIZE; - newf->close_on_exec = &newf->close_on_exec_init; - newf->open_fds = &newf->open_fds_init; - newf->fd = &newf->fd_array[0]; - spin_lock(&oldf->file_lock); - - open_files = count_open_files(oldf, oldf->max_fdset); + old_fdt = files_fdtable(oldf); + new_fdt = files_fdtable(newf); + size = old_fdt->max_fdset; + open_files = count_open_files(old_fdt); expand = 0; /* * Check whether we need to allocate a larger fd array or fd set. * Note: we're not a clone task, so the open count won't change. 
*/ - if (open_files > newf->max_fdset) { - newf->max_fdset = 0; + if (open_files > new_fdt->max_fdset) { + new_fdt->max_fdset = 0; expand = 1; } - if (open_files > newf->max_fds) { - newf->max_fds = 0; + if (open_files > new_fdt->max_fds) { + new_fdt->max_fds = 0; expand = 1; } @@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) spin_unlock(&newf->file_lock); if (error < 0) goto out_release; + new_fdt = files_fdtable(newf); + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); } - old_fds = oldf->fd; - new_fds = newf->fd; + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; - memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); - memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * is partway through open(). So make sure that this * fd is available to the new process. */ - FD_CLR(open_files - i, newf->open_fds); + FD_CLR(open_files - i, new_fdt->open_fds); } - *new_fds++ = f; + rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* compute the remainder to be cleared */ - size = (newf->max_fds - open_files) * sizeof(struct file *); + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (newf->max_fdset > open_files) { - int left = (newf->max_fdset-open_files)/8; + if (new_fdt->max_fdset > open_files) { + int left = (new_fdt->max_fdset-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); - memset(&newf->open_fds->fds_bits[start], 0, left); - memset(&newf->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } tsk->files = newf; @@ -688,9 +718,9 @@ out: return error; out_release: - free_fdset (newf->close_on_exec, newf->max_fdset); - free_fdset (newf->open_fds, newf->max_fdset); - free_fd_array(newf->fd, newf->max_fds); + free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); + free_fdset (new_fdt->open_fds, new_fdt->max_fdset); + free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); goto out; } @@ -994,6 +1024,9 @@ static task_t *copy_process(unsigned long clone_flags, * of CLONE_PTRACE. 
*/ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#endif /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ @@ -1112,6 +1145,9 @@ static task_t *copy_process(unsigned long clone_flags, __get_cpu_var(process_counts)++; } + if (!current->signal->tty && p->signal->tty) + p->signal->tty = NULL; + nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); diff --git a/kernel/futex.c b/kernel/futex.c index c7130f86106c..ca05fe6a70b2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -40,6 +40,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <asm/futex.h> #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) @@ -327,6 +328,118 @@ out: } /* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) +{ + union futex_key key1, key2; + struct futex_hash_bucket *bh1, *bh2; + struct list_head *head; + struct futex_q *this, *next; + int ret, op_ret, attempt = 0; + +retryfull: + down_read(&current->mm->mmap_sem); + + ret = get_futex_key(uaddr1, &key1); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, &key2); + if (unlikely(ret != 0)) + goto out; + + bh1 = hash_futex(&key1); + bh2 = hash_futex(&key2); + +retry: + if (bh1 < bh2) + spin_lock(&bh1->lock); + spin_lock(&bh2->lock); + if (bh1 > bh2) + spin_lock(&bh1->lock); + + op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); + if (unlikely(op_ret < 0)) { + int dummy; + + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); + + /* futex_atomic_op_inuser needs to both read and write + * *(int __user *)uaddr2, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. */ + if (attempt++) { + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + ret = -EFAULT; + if (attempt >= 2 || + !(vma = find_vma(mm, uaddr2)) || + vma->vm_start > uaddr2 || + !(vma->vm_flags & VM_WRITE)) + goto out; + + switch (handle_mm_fault(mm, vma, uaddr2, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + goto out; + } + goto retry; + } + + /* If we would have faulted, release mmap_sem, + * fault it in and start all over again. */ + up_read(&current->mm->mmap_sem); + + ret = get_user(dummy, (int __user *)uaddr2); + if (ret) + return ret; + + goto retryfull; + } + + head = &bh1->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key1)) { + wake_futex(this); + if (++ret >= nr_wake) + break; + } + } + + if (op_ret > 0) { + head = &bh2->chain; + + op_ret = 0; + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key2)) { + wake_futex(this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } + + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); +out: + up_read(&current->mm->mmap_sem); + return ret; +} + +/* + * Requeue all waiters hashed on one physical page to another + * physical page.
*/ @@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal) filp->f_mapping = filp->f_dentry->d_inode->i_mapping; if (signal) { - int err; err = f_setown(filp, current->pid, 1); if (err < 0) { - put_unused_fd(ret); - put_filp(filp); - ret = err; - goto out; + goto error; } filp->f_owner.signum = signal; } q = kmalloc(sizeof(*q), GFP_KERNEL); if (!q) { - put_unused_fd(ret); - put_filp(filp); - ret = -ENOMEM; - goto out; + err = -ENOMEM; + goto error; } down_read(&current->mm->mmap_sem); @@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal) if (unlikely(err != 0)) { up_read(&current->mm->mmap_sem); - put_unused_fd(ret); - put_filp(filp); kfree(q); - return err; + goto error; } /* @@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal) fd_install(ret, filp); out: return ret; +error: + put_unused_fd(ret); + put_filp(filp); + ret = err; + goto out; } long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, @@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, case FUTEX_CMP_REQUEUE: ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); break; + case FUTEX_WAKE_OP: + ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); + break; default: ret = -ENOSYS; } diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 388977f3e9b7..0cbe633420fb 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c @@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void struct list_head *tmp; struct inter_module_entry *ime, *ime_new; - if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { + if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { /* Overloaded kernel, not fatal */ printk(KERN_ERR "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", @@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void kmalloc_failed = 1; return; } - memset(ime_new, 0, sizeof(*ime_new)); ime_new->im_name = im_name; ime_new->owner = owner; ime_new->userdata = userdata; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c29f83c16497..3ff7b925c387 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) unsigned int status; kstat_this_cpu.irqs[irq]++; - if (desc->status & IRQ_PER_CPU) { + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac6700985705..1cfdb08ddf20 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -18,6 +18,10 @@ cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; +#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) +cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 85d08daa6600..f26e534c6585 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; */ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; -void __attribute__((weak)) -proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +#ifdef CONFIG_GENERIC_PENDING_IRQ +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + /* + * Save these away for later use.
Re-progam when the + * interrupt is pending + */ + set_pending_irq(irq, mask_val); +} +#else +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } +#endif static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b0237122b24e..f3ea492ab44d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -37,6 +37,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> #include <asm/kdebug.h> @@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages; * get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t *get_insn_slot(void) +kprobe_opcode_t __kprobes *get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void) return kip->insns; } -void free_insn_slot(kprobe_opcode_t *slot) +void __kprobes free_insn_slot(kprobe_opcode_t *slot) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot) } /* Locks kprobe: irqs must be disabled */ -void lock_kprobes(void) +void __kprobes lock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we take the kprobe_lock + * and before we get a chance to update kprobe_cpu, this to prevent + * deadlock when we have a kprobe on ISR routine and a kprobe on task + * routine + */ + local_irq_save(flags); + spin_lock(&kprobe_lock); kprobe_cpu = smp_processor_id(); + + local_irq_restore(flags); } -void unlock_kprobes(void) +void __kprobes unlock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we update + * kprobe_cpu and before we get a a chance to release kprobe_lock, + * this to prevent deadlock when we have a kprobe on ISR routine and + * a kprobe on task routine + */ + local_irq_save(flags); + kprobe_cpu = NR_CPUS; spin_unlock(&kprobe_lock); + + local_irq_restore(flags); } /* You have to be holding the kprobe_lock */ -struct kprobe *get_kprobe(void *addr) +struct kprobe __kprobes *get_kprobe(void *addr) { struct hlist_head *head; struct hlist_node *node; @@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, return; } -static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) +static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { /* * if we faulted "during" the execution of a user specified @@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, 
struct pt_regs *regs, return 0; } -static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp = curr_kprobe; if (curr_kprobe && kp->break_handler) { @@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) +struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) { struct hlist_node *node; struct kretprobe_instance *ri; @@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) return NULL; } -static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) +static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe + *rp) { struct hlist_node *node; struct kretprobe_instance *ri; @@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) return NULL; } -void add_rp_inst(struct kretprobe_instance *ri) +void __kprobes add_rp_inst(struct kretprobe_instance *ri) { /* * Remove rp inst off the free list - @@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->used_instances); } -void recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri) kfree(ri); } -struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) +struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) { return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; } @@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) * instances associated with this task. These left over instances represent * probed functions that have been called but will never return. */ -void kprobe_flush_task(struct task_struct *tk) +void __kprobes kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head; @@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk) * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. */ -static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); @@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) * Add the new probe to old_p->list. 
Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) { struct kprobe *kp; @@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * the intricacies * TODO: Move kcalloc outside the spinlock */ -static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes register_aggr_kprobe(struct kprobe *old_p, + struct kprobe *p) { int ret = 0; struct kprobe *ap; @@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p, spin_unlock_irqrestore(&kprobe_lock, flags); } -int register_kprobe(struct kprobe *p) +static int __kprobes in_kprobes_functions(unsigned long addr) +{ + if (addr >= (unsigned long)__kprobes_text_start + && addr < (unsigned long)__kprobes_text_end) + return -EINVAL; + return 0; +} + +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; unsigned long flags = 0; struct kprobe *old_p; - if ((ret = arch_prepare_kprobe(p)) != 0) { + if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) + return ret; + if ((ret = arch_prepare_kprobe(p)) != 0) goto rm_kprobe; - } + spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); p->nmissed = 0; @@ -466,7 +502,7 @@ rm_kprobe: return ret; } -void unregister_kprobe(struct kprobe *p) +void __kprobes unregister_kprobe(struct kprobe *p) { unsigned long flags; struct kprobe *old_p; @@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to notified first */ }; -int register_jprobe(struct jprobe *jp) +int __kprobes register_jprobe(struct jprobe *jp) { /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; @@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp) return register_kprobe(&jp->kp); } -void unregister_jprobe(struct jprobe *jp) +void __kprobes unregister_jprobe(struct jprobe *jp) { unregister_kprobe(&jp->kp); } #ifdef ARCH_SUPPORTS_KRETPROBES -int register_kretprobe(struct kretprobe *rp) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp) #else /* ARCH_SUPPORTS_KRETPROBES */ -int register_kretprobe(struct kretprobe *rp) +int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } #endif /* ARCH_SUPPORTS_KRETPROBES */ -void unregister_kretprobe(struct kretprobe *rp) +void __kprobes unregister_kretprobe(struct kretprobe *rp) { unsigned long flags; struct kretprobe_instance *ri; diff --git a/kernel/module.c b/kernel/module.c index c32995fbd8fd..ff5c500ab625 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/elf.h> @@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); return ret; } #else @@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs, if (!(tainted & TAINT_FORCED_MODULE)) { printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); } return 1; } @@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const 
char *license) if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", mod->name, license); - tainted |= TAINT_PROPRIETARY_MODULE; + add_taint(TAINT_PROPRIETARY_MODULE); } } @@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod, long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ struct exception_table_entry *extable; + mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); @@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod, (mod->num_gpl_syms && !gplcrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); } #endif @@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod, if (err < 0) goto cleanup; + /* flush the icache in correct context */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Flush the instruction cache, since we've played with text. + * Do it before processing of module parameters, so the module + * can provide parameter accessor functions of its own. + */ + if (mod->module_init) + flush_icache_range((unsigned long)mod->module_init, + (unsigned long)mod->module_init + + mod->init_size); + flush_icache_range((unsigned long)mod->module_core, + (unsigned long)mod->module_core + mod->core_size); + + set_fs(old_fs); + mod->args = args; if (obsparmindex) { err = obsolete_params(mod->name, mod->args, @@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; - mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } - /* flush the icache in correct context */ - set_fs(KERNEL_DS); - - /* Flush the instruction cache, since we've played with text */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); - - set_fs(old_fs); - /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. 
*/ stop_machine_run(__link_module, mod, NR_CPUS); diff --git a/kernel/params.c b/kernel/params.c index d586c35ef8fc..fbf173215fd2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name, { struct module_kobject *mk; - mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); - memset(mk, 0, sizeof(struct module_kobject)); + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + BUG_ON(!mk); mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 38798a2ff994..b7b532acd9fc 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private) timr->sigq->info.si_code = SI_TIMER; timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; + if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - if (unlikely(timr->it_process->flags & PF_EXITING)) { - timr->it_sigev_notify = SIGEV_SIGNAL; - put_task_struct(timr->it_process); - timr->it_process = timr->it_process->group_leader; - goto group; - } - return send_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); - } - else { - group: - return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); + struct task_struct *leader; + int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); + + if (likely(ret >= 0)) + return ret; + + timr->it_sigev_notify = SIGEV_SIGNAL; + leader = timr->it_process->group_leader; + put_task_struct(timr->it_process); + timr->it_process = leader; } + + return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); } EXPORT_SYMBOL_GPL(posix_timer_event); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2c7121d9bff1..396c7873e804 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,5 +1,6 @@ config PM bool "Power Management support" + depends on !IA64_HP_SIM ---help--- "Power Management" means that parts of your computer are shut off or put into a power conserving "sleep" mode if they are not @@ -28,7 +29,7 @@ config PM_DEBUG config SOFTWARE_SUSPEND bool "Software Suspend" - depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) + depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need APM. @@ -72,6 +73,18 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config SWSUSP_ENCRYPT + bool "Encrypt suspend image" + depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) + default "" + ---help--- + To prevent data gathering from swap after resume you can encrypt + the suspend image with a temporary key that is deleted on + resume. + + Note that the temporary key is stored unencrypted on disk while the + system is suspended. 
+ config SUSPEND_SMP bool depends on HOTPLUG_CPU && X86 && PM diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 664eb0469b6e..2d8bf054d036 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -112,24 +112,12 @@ static inline void platform_finish(void) } } -static void finish(void) -{ - device_resume(); - platform_finish(); - thaw_processes(); - enable_nonboot_cpus(); - pm_restore_console(); -} - - static int prepare_processes(void) { int error; pm_prepare_console(); - sys_sync(); - disable_nonboot_cpus(); if (freeze_processes()) { @@ -162,15 +150,6 @@ static void unprepare_processes(void) pm_restore_console(); } -static int prepare_devices(void) -{ - int error; - - if ((error = device_suspend(PMSG_FREEZE))) - printk("Some devices failed to suspend\n"); - return error; -} - /** * pm_suspend_disk - The granpappy of power management. * @@ -187,17 +166,14 @@ int pm_suspend_disk(void) error = prepare_processes(); if (error) return error; - error = prepare_devices(); + error = device_suspend(PMSG_FREEZE); if (error) { + printk("Some devices failed to suspend\n"); unprepare_processes(); return error; } - pr_debug("PM: Attempting to suspend to disk.\n"); - if (pm_disk_mode == PM_DISK_FIRMWARE) - return pm_ops->enter(PM_SUSPEND_DISK); - pr_debug("PM: snapshotting memory.\n"); in_suspend = 1; if ((error = swsusp_suspend())) @@ -208,11 +184,20 @@ int pm_suspend_disk(void) error = swsusp_write(); if (!error) power_down(pm_disk_mode); + else { + /* swsusp_write can not fail in device_resume, + no need to do second device_resume */ + swsusp_free(); + unprepare_processes(); + return error; + } } else pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); Done: - finish(); + device_resume(); + unprepare_processes(); return error; } @@ -233,9 +218,12 @@ static int software_resume(void) { int error; + down(&pm_sem); if (!swsusp_resume_device) { - if (!strlen(resume_file)) + if (!strlen(resume_file)) { + up(&pm_sem); return -ENOENT; + } swsusp_resume_device = name_to_dev_t(resume_file); pr_debug("swsusp: Resume From Partition %s\n", resume_file); } else { @@ -248,6 +236,7 @@ static int software_resume(void) * FIXME: If noresume is specified, we need to find the partition * and reset it back to normal swap space. 
*/ + up(&pm_sem); return 0; } @@ -270,20 +259,24 @@ static int software_resume(void) pr_debug("PM: Preparing devices for restore.\n"); - if ((error = prepare_devices())) + if ((error = device_suspend(PMSG_FREEZE))) { + printk("Some devices failed to suspend\n"); goto Free; + } mb(); pr_debug("PM: Restoring saved image.\n"); swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); - finish(); + device_resume(); Free: swsusp_free(); Cleanup: unprepare_processes(); Done: + /* For success case, the suspend path will release the lock */ + up(&pm_sem); pr_debug("PM: Resume from disk failed.\n"); return 0; } @@ -390,7 +383,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t if (sscanf(buf, "%u:%u", &maj, &min) == 2) { res = MKDEV(maj,min); if (maj == MAJOR(res) && min == MINOR(res)) { + down(&pm_sem); swsusp_resume_device = res; + up(&pm_sem); printk("Attempting manual resume\n"); noresume = 0; software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 71aa0fd22007..22bdc93cc038 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state) -static char * pm_states[] = { +static char *pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", +#ifdef CONFIG_SOFTWARE_SUSPEND [PM_SUSPEND_DISK] = "disk", - NULL, +#endif }; diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 61deda04e39e..159149321b3c 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type, unsigned long id, pm_callback callback) { - struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); + struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); if (dev) { - memset(dev, 0, sizeof(*dev)); dev->type = type; dev->id = id; dev->callback = callback; diff --git a/kernel/power/process.c b/kernel/power/process.c index 3bd0d261818f..28de118f7a0b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -38,7 +38,6 @@ void refrigerator(void) processes around? */ long save; save = current->state; - current->state = TASK_UNINTERRUPTIBLE; pr_debug("%s entered refrigerator\n", current->comm); printk("="); @@ -47,8 +46,10 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); - while (frozen(current)) + while (frozen(current)) { + current->state = TASK_UNINTERRUPTIBLE; schedule(); + } pr_debug("%s left refrigerator\n", current->comm); current->state = save; } @@ -80,13 +81,33 @@ int freeze_processes(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ - if (time_after(jiffies, start_time + TIMEOUT)) { + if (todo && time_after(jiffies, start_time + TIMEOUT)) { printk( "\n" ); printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); - return todo; + break; } } while(todo); + /* This does not unfreeze processes that are already frozen + * (we have slightly ugly calling convention in that respect, + * and caller must call thaw_processes() if something fails), + * but it cleans up leftover PF_FREEZE requests. 
+ */ + if (todo) { + read_lock(&tasklist_lock); + do_each_thread(g, p) + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + p->flags &= ~PF_FREEZE; + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } + while_each_thread(g, p); + read_unlock(&tasklist_lock); + return todo; + } + printk( "|\n" ); BUG_ON(in_atomic()); return 0; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f2bc71b9fe8b..d967e875ee82 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -31,6 +31,9 @@ * Alex Badea <vampire@go.ro>: * Fixed runaway init * + * Andreas Steinmetz <ast@domdv.de>: + * Added encrypted suspend option + * * More state savers are welcome. Especially for the scsi layer... * * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt @@ -71,8 +74,16 @@ #include <asm/tlbflush.h> #include <asm/io.h> +#include <linux/random.h> +#include <linux/crypto.h> +#include <asm/scatterlist.h> + #include "power.h" +#define CIPHER "aes" +#define MAXKEY 32 +#define MAXIV 32 + /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; @@ -103,7 +114,8 @@ static suspend_pagedir_t *pagedir_save; #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; + u8 key_iv[MAXKEY+MAXIV]; swp_entry_t swsusp_info; char orig_sig[10]; char sig[10]; @@ -129,6 +141,131 @@ static struct swsusp_info swsusp_info; static unsigned short swapfile_used[MAX_SWAPFILES]; static unsigned short root_swap; +static int write_page(unsigned long addr, swp_entry_t * loc); +static int bio_read_page(pgoff_t page_off, void * page); + +static u8 key_iv[MAXKEY+MAXIV]; + +#ifdef CONFIG_SWSUSP_ENCRYPT + +static int crypto_init(int mode, void **mem) +{ + int error = 0; + int len; + char *modemsg; + struct crypto_tfm *tfm; + + modemsg = mode ? 
"suspend not possible" : "resume not possible"; + + tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); + if(!tfm) { + printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); + error = -EINVAL; + goto out; + } + + if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { + printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); + error = -ENOKEY; + goto fail; + } + + if (mode) + get_random_bytes(key_iv, MAXKEY+MAXIV); + + len = crypto_tfm_alg_max_keysize(tfm); + if (len > MAXKEY) + len = MAXKEY; + + if (crypto_cipher_setkey(tfm, key_iv, len)) { + printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); + error = -EKEYREJECTED; + goto fail; + } + + len = crypto_tfm_alg_ivsize(tfm); + + if (MAXIV < len) { + printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); + error = -EOVERFLOW; + goto fail; + } + + crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); + + *mem=(void *)tfm; + + goto out; + +fail: crypto_free_tfm(tfm); +out: return error; +} + +static __inline__ void crypto_exit(void *mem) +{ + crypto_free_tfm((struct crypto_tfm *)mem); +} + +static __inline__ int crypto_write(struct pbe *p, void *mem) +{ + int error = 0; + struct scatterlist src, dst; + + src.page = virt_to_page(p->address); + src.offset = 0; + src.length = PAGE_SIZE; + dst.page = virt_to_page((void *)&swsusp_header); + dst.offset = 0; + dst.length = PAGE_SIZE; + + error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, + PAGE_SIZE); + + if (!error) + error = write_page((unsigned long)&swsusp_header, + &(p->swap_address)); + return error; +} + +static __inline__ int crypto_read(struct pbe *p, void *mem) +{ + int error = 0; + struct scatterlist src, dst; + + error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); + if (!error) { + src.offset = 0; + src.length = PAGE_SIZE; + dst.offset = 0; + dst.length = PAGE_SIZE; + src.page = dst.page = virt_to_page((void *)p->address); + + error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, + &src, PAGE_SIZE); + } + return error; +} +#else +static __inline__ int crypto_init(int mode, void *mem) +{ + return 0; +} + +static __inline__ void crypto_exit(void *mem) +{ +} + +static __inline__ int crypto_write(struct pbe *p, void *mem) +{ + return write_page(p->address, &(p->swap_address)); +} + +static __inline__ int crypto_read(struct pbe *p, void *mem) +{ + return bio_read_page(swp_offset(p->swap_address), (void *)p->address); +} +#endif + static int mark_swapfiles(swp_entry_t prev) { int error; @@ -140,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); + memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); swsusp_header.swsusp_info = prev; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), @@ -179,9 +317,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */ len=strlen(resume_file); root_swap = 0xFFFF; - swap_list_lock(); + spin_lock(&swap_lock); for (i=0; i<MAX_SWAPFILES; i++) { - if (swap_info[i].flags == 0) { + if (!(swap_info[i].flags & SWP_WRITEOK)) { swapfile_used[i]=SWAPFILE_UNUSED; } else { if (!len) { @@ -202,7 +340,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ } } } - swap_list_unlock(); + spin_unlock(&swap_lock); return (root_swap != 0xffff) ? 
0 : -ENODEV; } @@ -216,12 +354,12 @@ static void lock_swapdevices(void) { int i; - swap_list_lock(); + spin_lock(&swap_lock); for (i = 0; i< MAX_SWAPFILES; i++) if (swapfile_used[i] == SWAPFILE_IGNORED) { - swap_info[i].flags ^= 0xFF; + swap_info[i].flags ^= SWP_WRITEOK; } - swap_list_unlock(); + spin_unlock(&swap_lock); } /** @@ -286,6 +424,10 @@ static int data_write(void) int error = 0, i = 0; unsigned int mod = nr_copy_pages / 100; struct pbe *p; + void *tfm; + + if ((error = crypto_init(1, &tfm))) + return error; if (!mod) mod = 1; @@ -294,11 +436,14 @@ static int data_write(void) for_each_pbe (p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = write_page(p->address, &(p->swap_address)))) + if ((error = crypto_write(p, tfm))) { + crypto_exit(tfm); return error; + } i++; } printk("\b\b\b\bdone\n"); + crypto_exit(tfm); return error; } @@ -385,7 +530,6 @@ static int write_pagedir(void) * write_suspend_image - Write entire image and metadata. * */ - static int write_suspend_image(void) { int error; @@ -400,6 +544,7 @@ static int write_suspend_image(void) if ((error = close_swap())) goto FreePagedir; Done: + memset(key_iv, 0, MAXKEY+MAXIV); return error; FreePagedir: free_pagedir_entries(); @@ -591,18 +736,7 @@ static void copy_data_pages(void) static int calc_nr(int nr_copy) { - int extra = 0; - int mod = !!(nr_copy % PBES_PER_PAGE); - int diff = (nr_copy / PBES_PER_PAGE) + mod; - - do { - extra += diff; - nr_copy += diff; - mod = !!(nr_copy % PBES_PER_PAGE); - diff = (nr_copy / PBES_PER_PAGE) + mod - extra; - } while (diff > 0); - - return nr_copy; + return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); } /** @@ -886,20 +1020,21 @@ int swsusp_suspend(void) * at resume time, and evil weirdness ensues. */ if ((error = device_power_down(PMSG_FREEZE))) { + printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); local_irq_enable(); return error; } if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " - "swapon -a!\n"); + printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); + device_power_up(); local_irq_enable(); return error; } save_processor_state(); if ((error = swsusp_arch_suspend())) - printk("Error %d suspending\n", error); + printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); BUG_ON (nr_copy_pages_check != nr_copy_pages); @@ -924,6 +1059,7 @@ int swsusp_resume(void) BUG_ON(!error); restore_processor_state(); restore_highmem(); + touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); return error; @@ -1179,7 +1315,8 @@ static const char * sanity_check(void) if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) return "machine"; #if 0 - if(swsusp_info.cpus != num_online_cpus()) + /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ + if (swsusp_info.cpus != num_possible_cpus()) return "number of cpus"; #endif return NULL; @@ -1212,13 +1349,14 @@ static int check_sig(void) return error; if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); + memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); /* * Reset swap signature now. 
*/ error = bio_write_page(0, &swsusp_header); } else { - printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); return -EINVAL; } if (!error) @@ -1239,6 +1377,10 @@ static int data_read(struct pbe *pblist) int error = 0; int i = 0; int mod = swsusp_info.image_pages / 100; + void *tfm; + + if ((error = crypto_init(0, &tfm))) + return error; if (!mod) mod = 1; @@ -1250,14 +1392,15 @@ static int data_read(struct pbe *pblist) if (!(i % mod)) printk("\b\b\b\b%3d%%", i / mod); - error = bio_read_page(swp_offset(p->swap_address), - (void *)p->address); - if (error) + if ((error = crypto_read(p, tfm))) { + crypto_exit(tfm); return error; + } i++; } printk("\b\b\b\bdone\n"); + crypto_exit(tfm); return error; } @@ -1385,6 +1528,7 @@ int swsusp_read(void) error = read_suspend_image(); blkdev_put(resume_bdev); + memset(key_iv, 0, MAXKEY+MAXIV); if (!error) pr_debug("swsusp: Reading resume file was successful\n"); diff --git a/kernel/printk.c b/kernel/printk.c index 5092397fac29..a967605bc2e3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...) return r; } +/* cpu currently holding logbuf_lock */ +static volatile unsigned int printk_cpu = UINT_MAX; + asmlinkage int vprintk(const char *fmt, va_list args) { unsigned long flags; @@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args) static char printk_buf[1024]; static int log_level_unknown = 1; - if (unlikely(oops_in_progress)) + preempt_disable(); + if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) + /* If a crash is occurring during printk() on this CPU, + * make sure we can't deadlock */ zap_locks(); /* This stops the holder of console_sem just where we want him */ spin_lock_irqsave(&logbuf_lock, flags); + printk_cpu = smp_processor_id(); /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); @@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * CPU until it is officially up. We shouldn't be calling into * random console drivers on a CPU which doesn't exist yet.. */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); goto out; } @@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * We own the drivers. We can drop the spinlock and let * release_console_sem() print the text */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); console_may_schedule = 0; release_console_sem(); @@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) * allows the semaphore holder to proceed and to call the * console drivers with the output which we just produced. 
*/ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } out: + preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8dcb8f6288bc..019e04ec065a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } +static int may_attach(struct task_struct *task) +{ + if (!task->mm) + return -EPERM; + if (((current->uid != task->euid) || + (current->uid != task->suid) || + (current->uid != task->uid) || + (current->gid != task->egid) || + (current->gid != task->sgid) || + (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + return -EPERM; + smp_rmb(); + if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + return -EPERM; + + return security_ptrace(current, task); +} + +int ptrace_may_attach(struct task_struct *task) +{ + int err; + task_lock(task); + err = may_attach(task); + task_unlock(task); + return !err; +} + int ptrace_attach(struct task_struct *task) { int retval; @@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task) goto bad; if (task == current) goto bad; - if (!task->mm) - goto bad; - if(((current->uid != task->euid) || - (current->uid != task->suid) || - (current->uid != task->uid) || - (current->gid != task->egid) || - (current->gid != task->sgid) || - (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) - goto bad; - smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) - goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = security_ptrace(current, task); + retval = may_attach(task); if (retval) goto bad; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f436993bd590..bef3b6901b76 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/rcupdate.h> +#include <linux/rcuref.h> #include <linux/cpu.h> /* Definition for rcupdate control block. */ @@ -72,6 +73,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10; +#ifndef __HAVE_ARCH_CMPXCHG +/* + * We use an array of spinlocks for the rcurefs -- similar to ones in sparc + * 32 bit atomic_t implementations, and a hash function similar to that + * for our refcounting needs. + * Can't help multiprocessors which donot have cmpxchg :( + */ + +spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { + [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED +}; +#endif + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. 
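[Editor's note] The rcupdate.c hunk above falls back to a small hash of spinlocks when the architecture has no cmpxchg: the rcuref helpers hash each counter's address to one of the __rcuref_hash locks, so an atomic get/put can be emulated under a lock that unrelated counters rarely share. The following is a minimal userspace sketch of that same hashed-lock idea, assuming pthread mutexes in place of kernel spinlocks; REF_HASH_SIZE, ref_hash, ref_get and ref_put are names invented for the illustration and are not the kernel's rcuref API.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define REF_HASH_SIZE 16   /* illustrative size; the kernel uses RCUREF_HASH_SIZE */

/* GCC range-designator initializer, the same idiom as the hunk above. */
static pthread_mutex_t ref_hash[REF_HASH_SIZE] = {
	[0 ... REF_HASH_SIZE - 1] = PTHREAD_MUTEX_INITIALIZER
};

/* Hash the counter's address to one of the locks; dropping the low bits
 * keeps aligned pointers from all landing in the same few buckets. */
static pthread_mutex_t *ref_lock(int *ref)
{
	return &ref_hash[((uintptr_t)ref >> 4) & (REF_HASH_SIZE - 1)];
}

static void ref_get(int *ref)
{
	pthread_mutex_t *lock = ref_lock(ref);

	pthread_mutex_lock(lock);
	(*ref)++;
	pthread_mutex_unlock(lock);
}

/* Returns 1 when the count drops to zero, i.e. the caller may free. */
static int ref_put(int *ref)
{
	pthread_mutex_t *lock = ref_lock(ref);
	int zero;

	pthread_mutex_lock(lock);
	zero = (--*ref == 0);
	pthread_mutex_unlock(lock);
	return zero;
}

int main(void)
{
	int refcount = 1;

	ref_get(&refcount);
	printf("after get: %d\n", refcount);      /* prints 2 */
	printf("zero? %d\n", ref_put(&refcount)); /* prints 0 */
	printf("zero? %d\n", ref_put(&refcount)); /* prints 1 */
	return 0;
}

Compile with gcc -pthread. The point of the hash, here as in the patch, is only to reduce contention: a single global lock would be correct but would serialize every refcount in the system on machines without cmpxchg.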
diff --git a/kernel/resource.c b/kernel/resource.c index 26967e042201..92285d822de6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource); */ struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) { - struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (res) { - memset(res, 0, sizeof(*res)); res->name = name; res->start = start; res->end = start + n - 1; diff --git a/kernel/sched.c b/kernel/sched.c index 5f889d0cbfcc..81b3a96ed2d0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t * p) +void wait_task_inactive(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; int i; + /* Skip over this group if it has no CPUs allowed */ + if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + goto nextgroup; + local_group = cpu_isset(this_cpu, group->cpumask); - /* XXX: put a cpus allowed check */ /* Tally up the load of all CPUs in the group */ avg_load = 0; @@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) min_load = avg_load; idlest = group; } +nextgroup: group = group->next; } while (group != sd->groups); @@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* * find_idlest_queue - find the idlest runqueue among the cpus in group. */ -static int find_idlest_cpu(struct sched_group *group, int this_cpu) +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { + cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; - for_each_cpu_mask(i, group->cpumask) { + /* Traverse only the allowed CPUs */ + cpus_and(tmp, group->cpumask, p->cpus_allowed); + + for_each_cpu_mask(i, tmp) { load = source_load(i, 0); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag) if (!group) goto nextlevel; - new_cpu = find_idlest_cpu(group, cpu); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) goto nextlevel; @@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p) * * returns failure only if the task is already active. */ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int try_to_wake_up(task_t *p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1252,6 +1261,16 @@ out_activate: } /* + * Tasks that have marked their sleep as noninteractive get + * woken up without updating their sleep average. (i.e. their + * sleep is handled in a priority-neutral manner, no priority + * boost and no penalty.) + */ + if (old_state & TASK_NONINTERACTIVE) + __activate_task(p, rq); + else + activate_task(p, rq, cpu == this_cpu); + /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) * don't trigger a preemption, if the woken up task will run on @@ -1259,7 +1278,6 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) 
*/ - activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1274,7 +1292,7 @@ out: return success; } -int fastcall wake_up_process(task_t * p) +int fastcall wake_up_process(task_t *p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); @@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t * p) +void fastcall sched_exit(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -1478,6 +1496,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) /** * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -1510,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_task_flags = prev->flags; +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif finish_arch_switch(prev); finish_lock_switch(rq, prev); if (mm) @@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, int *all_pinned) + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { /* * We do not migrate tasks that are: @@ -1882,10 +1906,11 @@ out: */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) + unsigned long *imbalance, enum idle_type idle, int *sd_idle) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + unsigned long max_pull; int load_idx; max_load = this_load = total_load = total_pwr = 0; @@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + if (*sd_idle && !idle_cpu(i)) + *sd_idle = 0; + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load) + if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; @@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. 
*/ + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); + /* How much load to actually move to equalise the imbalance */ - *imbalance = min((max_load - avg_load) * busiest->cpu_power, + *imbalance = min(max_pull * busiest->cpu_power, (avg_load - this_load) * this->cpu_power) / SCHED_LOAD_SCALE; @@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, unsigned long imbalance; int nr_moved, all_pinned = 0; int active_balance = 0; + int sd_idle = 0; + + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); - group = find_busiest_group(sd, this_cpu, &imbalance, idle); + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, - &all_pinned); - spin_unlock(&busiest->lock); + imbalance, sd, idle, &all_pinned); + double_rq_unlock(this_rq, busiest); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) goto out_balanced; } - spin_unlock(&this_rq->lock); - if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { spin_lock(&busiest->lock); + + /* don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + spin_unlock(&busiest->lock); + all_pinned = 1; + goto out_one_pinned; + } + if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; @@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->balance_interval *= 2; } + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; + +out_one_pinned: /* tune up the balancing interval */ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; return 0; } @@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, runqueue_t *busiest = NULL; unsigned long imbalance; int nr_moved = 0; + int sd_idle = 0; + + if (sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, BUG_ON(busiest == this_rq); - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); - nr_moved = move_tasks(this_rq, this_cpu, busiest, + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* Attempt to move tasks */ + 
double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, NEWLY_IDLE, NULL); - if (!nr_moved) + spin_unlock(&busiest->lock); + } + + if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - else + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; + } else sd->nr_balance_failed = 0; - spin_unlock(&busiest->lock); return nr_moved; out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; sd->nr_balance_failed = 0; return 0; } @@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, if (j - sd->last_balance >= interval) { if (load_balance(this_cpu, this_rq, sd, idle)) { - /* We've pulled tasks over so no longer idle */ + /* + * We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is + * not idle. + */ idle = NOT_IDLE; } sd->last_balance += interval; @@ -2575,6 +2637,13 @@ out: } #ifdef CONFIG_SCHED_SMT +static inline void wakeup_busy_runqueue(runqueue_t *rq) +{ + /* If an SMT runqueue is sleeping due to priority reasons wake it up */ + if (rq->curr == rq->idle && rq->nr_running) + resched_task(rq->idle); +} + static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); - /* - * If an SMT sibling task is sleeping due to priority - * reasons wake it up now. - */ - if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) - resched_task(smt_rq->idle); + wakeup_busy_runqueue(smt_rq); } for_each_cpu_mask(i, sibling_map) @@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) */ } +/* + * number of 'lost' timeslices this task wont be able to fully + * utilize, if another task runs on a sibling. This models the + * slowdown effect of other tasks running on siblings: + */ +static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) +{ + return p->time_slice * (100 - sd->per_cpu_gain) / 100; +} + static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) runqueue_t *smt_rq = cpu_rq(i); task_t *smt_curr = smt_rq->curr; + /* Kernel threads do not participate in dependent sleeping */ + if (!p->mm || !smt_curr->mm || rt_task(p)) + goto check_smt_task; + /* * If a user task with lower static priority than the * running task on the SMT sibling is trying to schedule, @@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) - ret = 1; + if (rt_task(smt_curr)) { + /* + * With real time tasks we run non-rt tasks only + * per_cpu_gain% of the time. 
+ */ + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + ret = 1; + } else + if (smt_curr->static_prio < p->static_prio && + !TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(smt_curr, sd) > task_timeslice(p)) + ret = 1; + +check_smt_task: + if ((!smt_curr->mm && smt_curr != smt_rq->idle) || + rt_task(smt_curr)) + continue; + if (!p->mm) { + wakeup_busy_runqueue(smt_rq); + continue; + } /* - * Reschedule a lower priority task on the SMT sibling, - * or wake it up if it has been put to sleep for priority - * reasons. + * Reschedule a lower priority task on the SMT sibling for + * it to be put to sleep, or wake it up if it has been put to + * sleep for priority reasons to see if it should run now. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || - (smt_curr == smt_rq->idle && smt_rq->nr_running)) - resched_task(smt_curr); + if (rt_task(p)) { + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + resched_task(smt_curr); + } else { + if (TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(p, sd) > task_timeslice(smt_curr)) + resched_task(smt_curr); + else + wakeup_busy_runqueue(smt_rq); + } } out_unlock: for_each_cpu_mask(i, sibling_map) @@ -2887,6 +2989,7 @@ switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); prefetch(next); + prefetch_stack(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); @@ -3014,7 +3117,8 @@ need_resched: #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) { task_t *p = curr->private; return try_to_wake_up(p, mode, sync); @@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @key: is directly passed to the wakeup function */ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) + int nr_exclusive, void *key) { unsigned long flags; @@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void fastcall +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; int sync = 1; @@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) * @policy: new policy. * @param: structure containing the new RT priority. */ -int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) { int retval; int oldprio, oldpolicy = -1; @@ -3518,7 +3625,7 @@ recheck: * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 
*/ if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) @@ -3581,7 +3688,8 @@ recheck: } EXPORT_SYMBOL_GPL(sched_setscheduler); -static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { int retval; struct sched_param lparam; @@ -3848,7 +3956,7 @@ asmlinkage long sys_sched_yield(void) if (rt_task(current)) target = rq->active; - if (current->array->nr_active == 1) { + if (array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); @@ -3912,7 +4020,7 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t * lock) +int cond_resched_lock(spinlock_t *lock) { int ret = 0; @@ -4095,7 +4203,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) return list_entry(p->sibling.next,struct task_struct,sibling); } -static void show_task(task_t * p) +static void show_task(task_t *p) { task_t *relative; unsigned state; @@ -4121,7 +4229,7 @@ static void show_task(task_t * p) #endif #ifdef CONFIG_DEBUG_STACK_USAGE { - unsigned long * n = (unsigned long *) (p->thread_info+1); + unsigned long *n = (unsigned long *) (p->thread_info+1); while (!*n) n++; free = (unsigned long) n - (unsigned long)(p->thread_info+1); @@ -4330,7 +4438,7 @@ out: * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. */ -static int migration_thread(void * data) +static int migration_thread(void *data) { runqueue_t *rq; int cpu = (long)data; @@ -4779,7 +4887,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void cpu_attach_domain(struct sched_domain *sd, int cpu) +static void cpu_attach_domain(struct sched_domain *sd, int cpu) { runqueue_t *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -4802,7 +4910,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4830,8 +4938,8 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. 
*/ -void init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)) +static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, + int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; @@ -4864,12 +4972,85 @@ void init_sched_build_groups(struct sched_group groups[], last->next = first; } +#define SD_NODES_PER_DOMAIN 16 -#ifdef ARCH_HAS_SCHED_DOMAIN -extern void build_sched_domains(const cpumask_t *cpu_map); -extern void arch_init_sched_domains(const cpumask_t *cpu_map); -extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); -#else +#ifdef CONFIG_NUMA +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Start at @node */ + n = (node + i) % MAX_NUMNODES; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static cpumask_t sched_domain_node_span(int node) +{ + int i; + cpumask_t span, nodemask; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + nodemask = node_to_cpumask(node); + cpus_or(span, span, nodemask); + set_bit(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} +#endif + +/* + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we + * can switch it on easily if needed. + */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; @@ -4891,36 +5072,20 @@ static int cpu_to_phys_group(int cpu) } #ifdef CONFIG_NUMA - -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int cpu_to_node_group(int cpu) -{ - return cpu_to_node(cpu); -} -#endif - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) /* - * The domains setup code relies on siblings not spanning - * multiple nodes. Make sure the architecture has a proper - * siblings map: + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. 
*/ -static void check_sibling_maps(void) -{ - int i, j; +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; - for_each_online_cpu(i) { - for_each_cpu_mask(j, cpu_sibling_map[i]) { - if (cpu_to_node(i) != cpu_to_node(j)) { - printk(KERN_INFO "warning: CPU %d siblings map " - "to different node - isolating " - "them.\n", i); - cpu_sibling_map[i] = cpumask_of_cpu(i); - break; - } - } - } +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; + +static int cpu_to_allnodes_group(int cpu) +{ + return cpu_to_node(cpu); } #endif @@ -4928,9 +5093,24 @@ static void check_sibling_maps(void) * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static void build_sched_domains(const cpumask_t *cpu_map) +void build_sched_domains(const cpumask_t *cpu_map) { int i; +#ifdef CONFIG_NUMA + struct sched_group **sched_group_nodes = NULL; + struct sched_group *sched_group_allnodes = NULL; + + /* + * Allocate the per-node list of sched groups + */ + sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + GFP_ATOMIC); + if (!sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return; + } + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif /* * Set up domains for cpus specified by the cpu_map. @@ -4943,11 +5123,35 @@ static void build_sched_domains(const cpumask_t *cpu_map) cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA + if (cpus_weight(*cpu_map) + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (!sched_group_allnodes) { + sched_group_allnodes + = kmalloc(sizeof(struct sched_group) + * MAX_NUMNODES, + GFP_KERNEL); + if (!sched_group_allnodes) { + printk(KERN_WARNING + "Can not alloc allnodes sched group\n"); + break; + } + sched_group_allnodes_bycpu[i] + = sched_group_allnodes; + } + sd = &per_cpu(allnodes_domains, i); + *sd = SD_ALLNODES_INIT; + sd->span = *cpu_map; + group = cpu_to_allnodes_group(i); + sd->groups = &sched_group_allnodes[group]; + p = sd; + } else + p = NULL; + sd = &per_cpu(node_domains, i); - group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = *cpu_map; - sd->groups = &sched_group_nodes[group]; + sd->span = sched_domain_node_span(cpu_to_node(i)); + sd->parent = p; + cpus_and(sd->span, sd->span, *cpu_map); #endif p = sd; @@ -4972,7 +5176,7 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) @@ -4997,8 +5201,77 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, *cpu_map, - &cpu_to_node_group); + if (sched_group_allnodes) + init_sched_build_groups(sched_group_allnodes, *cpu_map, + &cpu_to_allnodes_group); + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + cpumask_t nodemask = node_to_cpumask(i); + cpumask_t domainspan; + cpumask_t covered = CPU_MASK_NONE; + int j; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) { + sched_group_nodes[i] = NULL; + continue; + } + + domainspan = sched_domain_node_span(i); + cpus_and(domainspan, domainspan, *cpu_map); + + sg = 
kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sched_group_nodes[i] = sg; + for_each_cpu_mask(j, nodemask) { + struct sched_domain *sd; + sd = &per_cpu(node_domains, j); + sd->groups = sg; + if (sd->groups == NULL) { + /* Turn off balancing if we have no groups */ + sd->flags = 0; + } + } + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", i); + continue; + } + sg->cpu_power = 0; + sg->cpumask = nodemask; + cpus_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < MAX_NUMNODES; j++) { + cpumask_t tmp, notcovered; + int n = (i + j) % MAX_NUMNODES; + + cpus_complement(notcovered, covered); + cpus_and(tmp, notcovered, *cpu_map); + cpus_and(tmp, tmp, domainspan); + if (cpus_empty(tmp)) + break; + + nodemask = node_to_cpumask(n); + cpus_and(tmp, tmp, nodemask); + if (cpus_empty(tmp)) + continue; + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + break; + } + sg->cpu_power = 0; + sg->cpumask = tmp; + cpus_or(covered, covered, tmp); + prev->next = sg; + prev = sg; + } + prev->next = sched_group_nodes[i]; + } #endif /* Calculate CPU power for physical packages and nodes */ @@ -5017,14 +5290,46 @@ static void build_sched_domains(const cpumask_t *cpu_map) sd->groups->cpu_power = power; #ifdef CONFIG_NUMA - if (i == first_cpu(sd->groups->cpumask)) { - /* Only add "power" once for each physical package. */ - sd = &per_cpu(node_domains, i); - sd->groups->cpu_power += power; + sd = &per_cpu(allnodes_domains, i); + if (sd->groups) { + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; } #endif } +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *sg = sched_group_nodes[i]; + int j; + + if (sg == NULL) + continue; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + int power; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + + sg->cpu_power += power; + } + sg = sg->next; + if (sg != sched_group_nodes[i]) + goto next_sg; + } +#endif + /* Attach the domains */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; @@ -5039,13 +5344,10 @@ static void build_sched_domains(const cpumask_t *cpu_map) /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void arch_init_sched_domains(cpumask_t *cpu_map) +static void arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to @@ -5058,10 +5360,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map) static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { - /* Do nothing: everything is statically allocated. 
*/ -} +#ifdef CONFIG_NUMA + int i; + int cpu; -#endif /* ARCH_HAS_SCHED_DOMAIN */ + for_each_cpu_mask(cpu, *cpu_map) { + struct sched_group *sched_group_allnodes + = sched_group_allnodes_bycpu[cpu]; + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (sched_group_allnodes) { + kfree(sched_group_allnodes); + sched_group_allnodes_bycpu[cpu] = NULL; + } + + if (!sched_group_nodes) + continue; + + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +#endif +} /* * Detach sched domains from a group of cpus specified in cpu_map @@ -5263,3 +5602,47 @@ void normalize_rt_tasks(void) } #endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_IA64 +/* + * These functions are only useful for the IA64 MCA handling. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +task_t *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
+ */ +void set_curr_task(int cpu, task_t *p) +{ + cpu_curr(cpu) = p; +} + +#endif diff --git a/kernel/signal.c b/kernel/signal.c index d282fea81138..b92c3c9f8b9a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info, /* forward decl */ static void do_notify_parent_cldstop(struct task_struct *tsk, - struct task_struct *parent, + int to_self, int why); /* @@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->group_stop_count = 0; p->signal->flags = SIGNAL_STOP_CONTINUED; spin_unlock(&p->sighand->siglock); - if (p->ptrace & PT_PTRACED) - do_notify_parent_cldstop(p, p->parent, - CLD_STOPPED); - else - do_notify_parent_cldstop( - p->group_leader, - p->group_leader->real_parent, - CLD_STOPPED); + do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); spin_lock(&p->sighand->siglock); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); @@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->flags = SIGNAL_STOP_CONTINUED; p->signal->group_exit_code = 0; spin_unlock(&p->sighand->siglock); - if (p->ptrace & PT_PTRACED) - do_notify_parent_cldstop(p, p->parent, - CLD_CONTINUED); - else - do_notify_parent_cldstop( - p->group_leader, - p->group_leader->real_parent, - CLD_CONTINUED); + do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); spin_lock(&p->sighand->siglock); } else { /* @@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) unsigned long flags; int ret = 0; - /* - * We need the tasklist lock even for the specific - * thread case (when we don't need to follow the group - * lists) in order to avoid races with "p->sighand" - * going away or changing from under us. - */ BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); + read_lock(&tasklist_lock); + + if (unlikely(p->flags & PF_EXITING)) { + ret = -1; + goto out_err; + } + spin_lock_irqsave(&p->sighand->siglock, flags); - + if (unlikely(!list_empty(&q->list))) { /* * If an SI_TIMER entry is already queue just increment @@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) BUG(); q->info.si_overrun++; goto out; - } + } /* Short-circuit ignored signals. 
*/ if (sig_ignored(p, sig)) { ret = 1; @@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) out: spin_unlock_irqrestore(&p->sighand->siglock, flags); +out_err: read_unlock(&tasklist_lock); - return(ret); + + return ret; } int @@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig) spin_unlock_irqrestore(&psig->siglock, flags); } -static void -do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, - int why) +static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) { struct siginfo info; unsigned long flags; + struct task_struct *parent; struct sighand_struct *sighand; + if (to_self) + parent = tsk->parent; + else { + tsk = tsk->group_leader; + parent = tsk->real_parent; + } + info.si_signo = SIGCHLD; info.si_errno = 0; info.si_pid = tsk->pid; @@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) !(current->ptrace & PT_ATTACHED)) && (likely(current->parent->signal != current->signal) || !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { - do_notify_parent_cldstop(current, current->parent, - CLD_TRAPPED); + do_notify_parent_cldstop(current, 1, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); } else { @@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code) static void finish_stop(int stop_count) { + int to_self; + /* * If there are no other threads in the group, or if there is * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, current->parent, - CLD_STOPPED); - read_unlock(&tasklist_lock); - } - else if (stop_count == 0) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, - current->group_leader->real_parent, - CLD_STOPPED); - read_unlock(&tasklist_lock); - } + if (stop_count < 0 || (current->ptrace & PT_PTRACED)) + to_self = 1; + else if (stop_count == 0) + to_self = 0; + else + goto out; + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, to_self, CLD_STOPPED); + read_unlock(&tasklist_lock); + +out: schedule(); /* * Now we don't run again until continued. @@ -2228,8 +2221,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); try_to_freeze(); spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/softirq.c b/kernel/softirq.c index b4ab6af1dea8..f766b2fc48be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void) cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ - local_softirq_pending() = 0; + set_softirq_pending(0); local_irq_enable(); diff --git a/kernel/softlockup.c b/kernel/softlockup.c new file mode 100644 index 000000000000..75976209cea7 --- /dev/null +++ b/kernel/softlockup.c @@ -0,0 +1,151 @@ +/* + * Detect Soft Lockups + * + * started by Ingo Molnar, (C) 2005, Red Hat + * + * this code detects soft lockups: incidents in where on a CPU + * the kernel does not reschedule for 10 seconds or more. 
+ */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/notifier.h> +#include <linux/module.h> + +static DEFINE_SPINLOCK(print_lock); + +static DEFINE_PER_CPU(unsigned long, timestamp) = 0; +static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; +static DEFINE_PER_CPU(struct task_struct *, watchdog_task); + +static int did_panic = 0; +static int softlock_panic(struct notifier_block *this, unsigned long event, + void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = softlock_panic, +}; + +void touch_softlockup_watchdog(void) +{ + per_cpu(timestamp, raw_smp_processor_id()) = jiffies; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +/* + * This callback runs from the timer interrupt, and checks + * whether the watchdog thread has hung or not: + */ +void softlockup_tick(struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long timestamp = per_cpu(timestamp, this_cpu); + + if (per_cpu(print_timestamp, this_cpu) == timestamp) + return; + + /* Do not cause a second panic when there already was one */ + if (did_panic) + return; + + if (time_after(jiffies, timestamp + 10*HZ)) { + per_cpu(print_timestamp, this_cpu) = timestamp; + + spin_lock(&print_lock); + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", + this_cpu); + show_regs(regs); + spin_unlock(&print_lock); + } +} + +/* + * The watchdog thread - runs every second and touches the timestamp. + */ +static int watchdog(void * __bind_cpu) +{ + struct sched_param param = { .sched_priority = 99 }; + int this_cpu = (long) __bind_cpu; + + printk("softlockup thread %d started up.\n", this_cpu); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Run briefly once per second - if this gets delayed for + * more than 10 seconds then the debug-printout triggers + * in softlockup_tick(): + */ + while (!kthread_should_stop()) { + msleep_interruptible(1000); + touch_softlockup_watchdog(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __devinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(per_cpu(watchdog_task, hotcpu)); + p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); + if (IS_ERR(p)) { + printk("watchdog for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(watchdog_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(watchdog_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + p = per_cpu(watchdog_task, hotcpu); + per_cpu(watchdog_task, hotcpu) = NULL; + kthread_stop(p); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init void spawn_softlockup_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + notifier_chain_register(&panic_notifier_list, &panic_block); +} + diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0c3f9d8bbe17..0375fcd5921d 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -3,7 +3,10 @@ * * Author: Zwane Mwaikambo <zwane@fsmlabs.com> * - * Copyright (2004) Ingo Molnar + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) */ #include <linux/config.h> @@ -17,12 +20,12 @@ * Generic declaration of the raw read_trylock() function, * architectures are supposed to optimize this: */ -int __lockfunc generic_raw_read_trylock(rwlock_t *lock) +int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) { - _raw_read_lock(lock); + __raw_read_lock(lock); return 1; } -EXPORT_SYMBOL(generic_raw_read_trylock); +EXPORT_SYMBOL(generic__raw_read_trylock); int __lockfunc _spin_trylock(spinlock_t *lock) { @@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock) } EXPORT_SYMBOL(_write_trylock); -#ifndef CONFIG_PREEMPT +#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) void __lockfunc _read_lock(rwlock_t *lock) { @@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) local_irq_save(flags); preempt_disable(); - _raw_spin_lock_flags(lock, flags); + _raw_spin_lock_flags(lock, &flags); return flags; } EXPORT_SYMBOL(_spin_lock_irqsave); diff --git a/kernel/sys.c b/kernel/sys.c index 0bcaed6560ac..c80412be2302 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1711,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { long error; - int sig; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error) @@ -1719,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, switch (option) { case PR_SET_PDEATHSIG: - sig = arg2; - if (!valid_signal(sig)) { + if (!valid_signal(arg2)) { error = -EINVAL; break; } - current->pdeath_signal = sig; + current->pdeath_signal = arg2; break; case PR_GET_PDEATHSIG: error = put_user(current->pdeath_signal, (int __user *)arg2); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e0bbee549ea..8e56e2495542 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/net.h> #include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> @@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -#ifdef CONFIG_NET -extern ctl_table net_table[]; -#endif static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; diff --git a/kernel/timer.c b/kernel/timer.c index 5377f40723ff..3ba10fa35b60 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs) { 
jiffies_64++; update_times(); + softlockup_tick(regs); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout) out: return timeout < 0 ? 0 : timeout; } - EXPORT_SYMBOL(schedule_timeout); +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { @@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) if (!time_after(expire, now)) return 0; - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire - now); + expire = schedule_timeout_interruptible(expire - now); ret = 0; if (expire) { @@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us return -EINVAL; expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + expire = schedule_timeout_interruptible(expire); ret = 0; if (expire) { @@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src) } } -static inline u64 time_interpolator_get_counter(void) +static inline u64 time_interpolator_get_counter(int writelock) { unsigned int src = time_interpolator->source; @@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void) now = time_interpolator_get_cycles(src); if (lcycle && time_after(lcycle, now)) return lcycle; + + /* When holding the xtime write lock, there's no need + * to add the overhead of the cmpxchg. Readers are + * force to retry until the write lock is released. + */ + if (writelock) { + time_interpolator->last_cycle = now; + return now; + } /* Keep track of the last timer value returned. The use of cmpxchg here * will cause contention in an SMP environment. */ @@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void) void time_interpolator_reset(void) { time_interpolator->offset = 0; - time_interpolator->last_counter = time_interpolator_get_counter(); + time_interpolator->last_counter = time_interpolator_get_counter(1); } #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) @@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void) return 0; return time_interpolator->offset + - GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); + GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); } #define INTERPOLATOR_ADJUST 65536 @@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec) * and the tuning logic insures that. 
*/ - counter = time_interpolator_get_counter(); + counter = time_interpolator_get_counter(1); offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) @@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; - while (timeout) { - set_current_state(TASK_UNINTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); @@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; - while (timeout && !signal_pending(current)) { - set_current_state(TASK_INTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c7e36d4a70ca..91bacb13a7e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name, struct workqueue_struct *wq; struct task_struct *p; - wq = kmalloc(sizeof(*wq), GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; - memset(wq, 0, sizeof(*wq)); wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ @@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { - if (create_workqueue_thread(wq, hotcpu) < 0) { + if (!create_workqueue_thread(wq, hotcpu)) { printk("workqueue for %i failed\n", hotcpu); return NOTIFY_BAD; } |