Diffstat (limited to 'kernel')
35 files changed, 586 insertions, 290 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..85cbfb31e73e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
 
 void foo(void)
 {
 	/* The enum constants to put into include/generated/bounds.h */
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
 	/* End of constants */
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c8..bf0c734d0c12 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 
 /*
@@ -290,6 +291,60 @@ error:
 }
 
 /**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+	int ret = security_real_capable(t, &init_user_ns, cap);
+
+	return (ret == 0);
+}
+
+/**
+ * has_ns_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+		       struct user_namespace *ns, int cap)
+{
+	int ret = security_real_capable(t, ns, cap);
+
+	return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited)
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not.  Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+	int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+
+	return (ret == 0);
+}
+
+/**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
  *
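The shape of the new capability API is worth spelling out: capable() becomes ns_capable() against init_user_ns, while task_ns_capable() and has_ns_capability() target the user namespace of some other task. A minimal kernel-style sketch of a caller — frob_target() is hypothetical, not part of this patch:

    static int frob_target(struct task_struct *target)
    {
    	/*
    	 * Root in the *target's* user namespace is sufficient; a bare
    	 * capable(CAP_SYS_ADMIN) would test only the initial namespace.
    	 */
    	if (!task_ns_capable(target, CAP_SYS_ADMIN))
    		return -EPERM;
    	return 0;
    }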
@@ -299,17 +354,48 @@ error:
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
-int capable(int cap)
+bool capable(int cap)
+{
+	return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The user namespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable(struct user_namespace *ns, int cap)
 {
 	if (unlikely(!cap_valid(cap))) {
 		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
 		BUG();
 	}
 
-	if (security_capable(current_cred(), cap) == 0) {
+	if (security_capable(ns, current_cred(), cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
-		return 1;
+		return true;
 	}
-	return 0;
+	return false;
 }
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * task_ns_capable - Determine whether current task has a superior
+ * capability targeted at a specific task's user namespace.
+ * @t: The task whose user namespace is targeted.
+ * @cap: The capability in question.
+ *
+ * Return true if it does, false otherwise.
+ */
+bool task_ns_capable(struct task_struct *t, int cap)
+{
+	return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+}
+EXPORT_SYMBOL(task_ns_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 95362d15128c..e31b220a743d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	/* Update the css_set linked lists if we're using them */
 	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list)) {
-		list_del(&tsk->cg_list);
-		list_add(&tsk->cg_list, &newcg->tasks);
-	}
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
 	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
@@ -3655,12 +3653,12 @@ again:
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
-		list_del(&cgrp->release_list);
+		list_del_init(&cgrp->release_list);
 	spin_unlock(&release_list_lock);
 
 	cgroup_lock_hierarchy(cgrp->root);
 	/* delete this cgroup from parent->children */
-	list_del(&cgrp->sibling);
+	list_del_init(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
 	d = dget(cgrp->dentry);
@@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	subsys[ss->subsys_id] = NULL;
 
 	/* remove subsystem from rootnode's list of subsystems */
-	list_del(&ss->sibling);
+	list_del_init(&ss->sibling);
 
 	/*
 	 * disentangle the css from all css_sets attached to the dummytop. as
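The list_del() -> list_del_init() conversions above matter because several of these paths re-test the entry with list_empty() later (see cgroup_exit() below): list_del() leaves the node's pointers poisoned, while list_del_init() relinks the node to itself so the test stays well defined. A self-contained user-space model of the two primitives (a minimal re-implementation for illustration, not the kernel headers):

    #include <assert.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
    static int list_empty(const struct list_head *h) { return h->next == h; }

    static void list_add(struct list_head *new, struct list_head *head)
    {
    	new->next = head->next;
    	new->prev = head;
    	head->next->prev = new;
    	head->next = new;
    }

    /* list_del() would leave next/prev dangling; re-initialising keeps a
     * later list_empty(&node) check valid, which cgroup relies on above */
    static void list_del_init(struct list_head *e)
    {
    	e->prev->next = e->next;
    	e->next->prev = e->prev;
    	INIT_LIST_HEAD(e);
    }

    int main(void)
    {
    	struct list_head head, node;
    	INIT_LIST_HEAD(&head);
    	list_add(&node, &head);
    	list_del_init(&node);
    	assert(list_empty(&node));	/* safe to ask again */
    	assert(list_empty(&head));
    	return 0;
    }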
@@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	if (!list_empty(&tsk->cg_list)) {
 		write_lock(&css_set_lock);
 		if (!list_empty(&tsk->cg_list))
-			list_del(&tsk->cg_list);
+			list_del_init(&tsk->cg_list);
 		write_unlock(&css_set_lock);
 	}
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..c95fc4df0faa 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 {
 	BUG_ON(cpu_notify(val, v));
 }
-
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
 		return err;
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
-
 	return 0;
 }
 
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
+
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
 		nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (ret) {
 		nr_calls--;
-		printk("%s: attempt to bring up CPU %u failed\n",
+		printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
 				__func__, cpu);
 		goto out_notify;
 	}
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
 	if (cpumask_empty(frozen_cpus))
 		goto out;
 
-	printk("Enabling non-boot CPUs ...\n");
+	printk(KERN_INFO "Enabling non-boot CPUs ...\n");
 
 	arch_enable_nonboot_cpus_begin();
 
 	for_each_cpu(cpu, frozen_cpus) {
 		error = _cpu_up(cpu, 1);
 		if (!error) {
-			printk("CPU%d is up\n", cpu);
+			printk(KERN_INFO "CPU%d is up\n", cpu);
 			continue;
 		}
 		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
 */
 
 /* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x)	[x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x)	[x+1][0] = (1UL << (x))
 #define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
 #define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
 #define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e92e98189032..33eee16addb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
-	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
-	if (!newmems)
-		return;
+	static nodemask_t newmems;	/* protected by cgroup_mutex */
 
 	cs = cgroup_cs(scan->cg);
-	guarantee_online_mems(cs, newmems);
-
-	cpuset_change_task_nodemask(p, newmems);
+	guarantee_online_mems(cs, &newmems);
 
-	NODEMASK_FREE(newmems);
+	cpuset_change_task_nodemask(p, &newmems);
 
 	mm = get_task_mm(p);
 	if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
-	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
-	if (from == NULL || to == NULL)
-		goto alloc_fail;
+	static nodemask_t to;		/* protected by cgroup_mutex */
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
 	} else {
 		guarantee_online_cpus(cs, cpus_attach);
 	}
-	guarantee_online_mems(cs, to);
+	guarantee_online_mems(cs, &to);
 
 	/* do per-task migration stuff possibly for each in the threadgroup */
-	cpuset_attach_task(tsk, to, cs);
+	cpuset_attach_task(tsk, &to, cs);
 	if (threadgroup) {
 		struct task_struct *c;
 		rcu_read_lock();
 		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			cpuset_attach_task(c, to, cs);
+			cpuset_attach_task(c, &to, cs);
 		}
 		rcu_read_unlock();
 	}
 
 	/* change mm; only needs to be done once even if threadgroup */
-	*from = oldcs->mems_allowed;
-	*to = cs->mems_allowed;
+	to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
-		mpol_rebind_mm(mm, to);
+		mpol_rebind_mm(mm, &to);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, from, to);
+			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
 		mmput(mm);
 	}
-
-alloc_fail:
-	NODEMASK_FREE(from);
-	NODEMASK_FREE(to);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -1610,34 +1596,26 @@ out:
 * across a page fault.
 */
 
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
-	int ret;
+	size_t count;
 
 	mutex_lock(&callback_mutex);
-	ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+	count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
-	return ret;
+	return count;
 }
 
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
-	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
-	int retval;
-
-	if (mask == NULL)
-		return -ENOMEM;
+	size_t count;
 
 	mutex_lock(&callback_mutex);
-	*mask = cs->mems_allowed;
+	count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
 	mutex_unlock(&callback_mutex);
 
-	retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
-	NODEMASK_FREE(mask);
-
-	return retval;
+	return count;
 }
 
 static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1862,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
 	cs = cgroup_cs(cgroup);
 	parent_cs = cgroup_cs(parent);
 
+	mutex_lock(&callback_mutex);
 	cs->mems_allowed = parent_cs->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+	mutex_unlock(&callback_mutex);
 	return;
 }
 
@@ -2066,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
 	struct cgroup *cont;
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return;
+	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
@@ -2086,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
-		*oldmems = cp->mems_allowed;
+		oldmems = cp->mems_allowed;
 
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
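The cpuset conversions above trade a NODEMASK_ALLOC()/NODEMASK_FREE() pair per call for one function-local static mask, which is correct only because cgroup_mutex already serializes every caller. A sketch of the general pattern in portable C with pthreads — the type and names are illustrative, not kernel code:

    #include <pthread.h>
    #include <stdint.h>

    /* stand-in for nodemask_t: too big for the stack on large systems,
     * not worth a heap allocation on every call */
    typedef struct { uint64_t bits[16]; } bigmask_t;

    static pthread_mutex_t subsystem_lock = PTHREAD_MUTEX_INITIALIZER;

    void update_with_scratch(void)
    {
    	static bigmask_t scratch;	/* protected by subsystem_lock */

    	pthread_mutex_lock(&subsystem_lock);
    	/* ... fill scratch, consume it, forget it ... */
    	pthread_mutex_unlock(&subsystem_lock);
    }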
@@ -2102,10 +2079,9 @@
 			remove_tasks_in_empty_cpuset(cp);
 		else {
 			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, oldmems, NULL);
+			update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
-	NODEMASK_FREE(oldmems);
 }
 
 /*
@@ -2147,19 +2123,16 @@ void cpuset_update_active_cpus(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return NOTIFY_DONE;
+	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	cgroup_lock();
 	switch (action) {
 	case MEM_ONLINE:
-		*oldmems = top_cpuset.mems_allowed;
+		oldmems = top_cpuset.mems_allowed;
 		mutex_lock(&callback_mutex);
 		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 		mutex_unlock(&callback_mutex);
-		update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
 		break;
 	case MEM_OFFLINE:
 		/*
@@ -2173,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	}
 	cgroup_unlock();
 
-	NODEMASK_FREE(oldmems);
 	return NOTIFY_OK;
 }
 #endif
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+	char *end;
+	if (!arg)
+		return -EINVAL;
+	elfcorehdr_addr = memparse(arg, &end);
+	return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
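setup_elfcorehdr() leans on memparse(), so elfcorehdr= accepts the usual binary suffixes (e.g. elfcorehdr=16M as well as a raw hex address). A user-space approximation of that parsing, for illustration only:

    #include <stdio.h>
    #include <stdlib.h>

    /* rough equivalent of the kernel's memparse(): a number in any base
     * strtoull accepts, with an optional K/M/G binary suffix */
    static unsigned long long memparse_approx(const char *s, char **end)
    {
    	unsigned long long v = strtoull(s, end, 0);

    	switch (**end) {
    	case 'G': case 'g': v <<= 10; /* fall through */
    	case 'M': case 'm': v <<= 10; /* fall through */
    	case 'K': case 'k': v <<= 10; (*end)++; break;
    	default: break;
    	}
    	return v;
    }

    int main(void)
    {
    	char *end;
    	printf("%llu\n", memparse_approx("16M", &end));	/* 16777216 */
    	return 0;
    }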
diff --git a/kernel/cred.c b/kernel/cred.c
index 2343c132c5a7..5557b55048df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 }
 EXPORT_SYMBOL(set_create_files_as);
 
+struct user_namespace *current_user_ns(void)
+{
+	return _current_user_ns();
+}
+EXPORT_SYMBOL(current_user_ns);
+
 #ifdef CONFIG_DEBUG_CREDENTIALS
 
 bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/fork.c b/kernel/fork.c
index 05b92c457010..457fff2e17e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
 #include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/kthread.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
@@ -109,20 +110,25 @@ int nr_processes(void)
 }
 
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node)		\
+		kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk)			\
+		kmem_cache_free(task_struct_cachep, (tsk))
 static struct kmem_cache *task_struct_cachep;
 #endif
 
 #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+						  int node)
 {
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	gfp_t mask = GFP_KERNEL | __GFP_ZERO;
 #else
 	gfp_t mask = GFP_KERNEL;
 #endif
-	return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+	struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+	return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_info(struct thread_info *ti)
@@ -249,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	struct task_struct *tsk;
 	struct thread_info *ti;
 	unsigned long *stackend;
-
+	int node = tsk_fork_get_node(orig);
 	int err;
 
 	prepare_to_copy(orig);
 
-	tsk = alloc_task_struct();
+	tsk = alloc_task_struct_node(node);
 	if (!tsk)
 		return NULL;
 
-	ti = alloc_thread_info(tsk);
+	ti = alloc_thread_info_node(tsk, node);
 	if (!ti) {
 		free_task_struct(tsk);
 		return NULL;
@@ -1181,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		pid = alloc_pid(p->nsproxy->pid_ns);
 		if (!pid)
 			goto bad_fork_cleanup_io;
-
-		if (clone_flags & CLONE_NEWPID) {
-			retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
-			if (retval < 0)
-				goto bad_fork_free_pid;
-		}
 	}
 
 	p->pid = pid_nr(pid);
@@ -1290,7 +1290,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	tracehook_finish_clone(p, clone_flags, trace);
 
 	if (thread_group_leader(p)) {
-		if (clone_flags & CLONE_NEWPID)
+		if (is_child_reaper(pid))
 			p->nsproxy->pid_ns->child_reaper = p;
 
 		p->signal->leader_pid = pid;
@@ -1513,38 +1513,24 @@ void __init proc_caches_init(void)
 }
 
 /*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
 */
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
 {
+	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+		return -EINVAL;
 	/*
-	 * If unsharing a thread from a thread group, must also
-	 * unshare vm.
-	 */
-	if (*flags_ptr & CLONE_THREAD)
-		*flags_ptr |= CLONE_VM;
-
-	/*
-	 * If unsharing vm, must also unshare signal handlers.
-	 */
-	if (*flags_ptr & CLONE_VM)
-		*flags_ptr |= CLONE_SIGHAND;
-
-	/*
-	 * If unsharing namespace, must also unshare filesystem information.
+	 * Not implemented, but pretend it works if there is nothing to
+	 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+	 * needs to unshare vm.
 	 */
-	if (*flags_ptr & CLONE_NEWNS)
-		*flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
-	if (unshare_flags & CLONE_THREAD)
-		return -EINVAL;
+	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+		/* FIXME: get_task_mm() increments ->mm_users */
+		if (atomic_read(&current->mm->mm_users) > 1)
+			return -EINVAL;
+	}
 
 	return 0;
 }
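The dup_task_struct() change above places the new task's control structures and stack on the node returned by tsk_fork_get_node(). The same placement idea is available from user space through libnuma — a minimal sketch, assuming libnuma and its headers are installed:

    /* build with: cc numa_demo.c -lnuma */
    #include <numa.h>
    #include <stdio.h>

    int main(void)
    {
    	if (numa_available() < 0) {
    		fprintf(stderr, "no NUMA support on this system\n");
    		return 1;
    	}
    	/* allocate 64 KiB on node 0 -- in spirit, what
    	 * alloc_pages_node(node, ...) does for the kthread stack above */
    	void *stack = numa_alloc_onnode(64 * 1024, 0);
    	if (!stack)
    		return 1;
    	numa_free(stack, 64 * 1024);
    	return 0;
    }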
@@ -1571,34 +1557,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
-	struct sighand_struct *sigh = current->sighand;
-
-	if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
-		return -EINVAL;
-	else
-		return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
-	struct mm_struct *mm = current->mm;
-
-	if ((unshare_flags & CLONE_VM) &&
-	    (mm && atomic_read(&mm->mm_users) > 1)) {
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/*
 * Unshare file descriptor table if it is being shared
 */
 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1626,45 +1584,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
 */
 SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
-	int err = 0;
 	struct fs_struct *fs, *new_fs = NULL;
-	struct sighand_struct *new_sigh = NULL;
-	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
 	struct nsproxy *new_nsproxy = NULL;
 	int do_sysvsem = 0;
+	int err;
 
-	check_unshare_flags(&unshare_flags);
-
-	/* Return -EINVAL for all unsupported flags */
-	err = -EINVAL;
-	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
-				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+	err = check_unshare_flags(unshare_flags);
+	if (err)
 		goto bad_unshare_out;
 
 	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (unshare_flags & CLONE_NEWNS)
+		unshare_flags |= CLONE_FS;
+	/*
 	 * CLONE_NEWIPC must also detach from the undolist: after switching
 	 * to a new ipc namespace, the semaphore arrays from the old
 	 * namespace are unreachable.
 	 */
 	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
 		do_sysvsem = 1;
 
-	if ((err = unshare_thread(unshare_flags)))
-		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
-		goto bad_unshare_cleanup_thread;
-	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
-		goto bad_unshare_cleanup_fs;
-	if ((err = unshare_vm(unshare_flags, &new_mm)))
-		goto bad_unshare_cleanup_sigh;
+		goto bad_unshare_out;
 	if ((err = unshare_fd(unshare_flags, &new_fd)))
-		goto bad_unshare_cleanup_vm;
+		goto bad_unshare_cleanup_fs;
 	if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
 			new_fs)))
 		goto bad_unshare_cleanup_fd;
 
-	if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+	if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
 		if (do_sysvsem) {
 			/*
 			 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1690,19 +1640,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 			spin_unlock(&fs->lock);
 		}
 
-		if (new_mm) {
-			mm = current->mm;
-			active_mm = current->active_mm;
-			current->mm = new_mm;
-			current->active_mm = new_mm;
-			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-				atomic_dec(&mm->oom_disable_count);
-				atomic_inc(&new_mm->oom_disable_count);
-			}
-			activate_mm(active_mm, new_mm);
-			new_mm = mm;
-		}
-
 		if (new_fd) {
 			fd = current->files;
 			current->files = new_fd;
@@ -1719,20 +1656,10 @@ bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);
 
-bad_unshare_cleanup_vm:
-	if (new_mm)
-		mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
-	if (new_sigh)
-		if (atomic_dec_and_test(&new_sigh->count))
-			kmem_cache_free(sighand_cachep, new_sigh);
-
 bad_unshare_cleanup_fs:
 	if (new_fs)
 		free_fs_struct(new_fs);
 
-bad_unshare_cleanup_thread:
 bad_unshare_out:
 	return err;
 }
diff --git a/kernel/futex.c b/kernel/futex.c
index bda415715382..6570c459f31c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 			goto err_unlock;
 		ret = -EPERM;
 		pcred = __task_cred(p);
+		/* If victim is in different user_ns, then uids are not
+		   comparable, so we must have CAP_SYS_PTRACE */
+		if (cred->user->user_ns != pcred->user->user_ns) {
+			if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+				goto err_unlock;
+			goto ok;
+		}
+		/* If victim is in same user_ns, then uids are comparable */
 		if (cred->euid != pcred->euid &&
 		    cred->euid != pcred->uid &&
-		    !capable(CAP_SYS_PTRACE))
+		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
 			goto err_unlock;
+ok:
 		head = p->robust_list;
 		rcu_read_unlock();
 	}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 			goto err_unlock;
 		ret = -EPERM;
 		pcred = __task_cred(p);
+		/* If victim is in different user_ns, then uids are not
+		   comparable, so we must have CAP_SYS_PTRACE */
+		if (cred->user->user_ns != pcred->user->user_ns) {
+			if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+				goto err_unlock;
+			goto ok;
+		}
+		/* If victim is in same user_ns, then uids are comparable */
 		if (cred->euid != pcred->euid &&
 		    cred->euid != pcred->uid &&
-		    !capable(CAP_SYS_PTRACE))
+		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
 			goto err_unlock;
+ok:
 		head = p->compat_robust_list;
 		rcu_read_unlock();
 	}
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
 	struct group_info *group_info;
 	int retval;
 
-	if (!capable(CAP_SETGID))
+	if (!nsown_capable(CAP_SETGID))
 		return -EPERM;
 	if ((unsigned)gidsetsize > NGROUPS_MAX)
 		return -EINVAL;
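The reworked unshare(2) path validates first and widens flags second: unknown bits are rejected before CLONE_NEWNS implies CLONE_FS. A compact user-space sketch of that ordering (flag values are hypothetical stand-ins, not CLONE_* constants):

    #include <assert.h>
    #include <errno.h>

    #define F_FS    0x1UL	/* illustrative stand-ins only */
    #define F_NEWNS 0x2UL
    #define F_FILES 0x4UL
    #define F_ALL   (F_FS | F_NEWNS | F_FILES)

    static int do_unshare(unsigned long flags)
    {
    	if (flags & ~F_ALL)	/* reject unknown bits before anything else */
    		return -EINVAL;
    	if (flags & F_NEWNS)	/* implied flag added only after validation, */
    		flags |= F_FS;	/* mirroring CLONE_NEWNS => CLONE_FS above */
    	/* ... actually unshare ... */
    	return 0;
    }

    int main(void)
    {
    	assert(do_unshare(F_NEWNS) == 0);
    	assert(do_unshare(0x80UL) == -EINVAL);
    	return 0;
    }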
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index b9d0fd1d21c7..a56aa58b9cb0 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -477,13 +477,11 @@ static int s_show(struct seq_file *m, void *p)
 		 */
 		type = iter->exported ? toupper(iter->type) :
 					tolower(iter->type);
-		seq_printf(m, "%0*lx %c %s\t[%s]\n",
-			   (int)(2 * sizeof(void *)),
-			   iter->value, type, iter->name, iter->module_name);
+		seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+			   type, iter->name, iter->module_name);
 	} else
-		seq_printf(m, "%0*lx %c %s\n",
-			   (int)(2 * sizeof(void *)),
-			   iter->value, iter->type, iter->name);
+		seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+			   iter->type, iter->name);
 	return 0;
 }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..684ab3f7dd72 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
 	/* Information passed to kthread() from kthreadd. */
 	int (*threadfn)(void *data);
 	void *data;
+	int node;
 
 	/* Result passed back to kthread_create() from kthreadd. */
 	struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
 	do_exit(ret);
 }
 
+/* called from do_fork() to get node information for about to be created task */
+int tsk_fork_get_node(struct task_struct *tsk)
+{
+#ifdef CONFIG_NUMA
+	if (tsk == kthreadd_task)
+		return tsk->pref_node_fork;
+#endif
+	return numa_node_id();
+}
+
 static void create_kthread(struct kthread_create_info *create)
 {
 	int pid;
 
+#ifdef CONFIG_NUMA
+	current->pref_node_fork = create->node;
+#endif
 	/* We want our own signal handler (we take no signals by default). */
 	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
 	if (pid < 0) {
@@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create)
 }
 
 /**
- * kthread_create - create a kthread.
+ * kthread_create_on_node - create a kthread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
+ * @node: memory node number.
 * @namefmt: printf-style name for the thread.
 *
 * Description: This helper function creates and names a kernel
 * thread.  The thread will be stopped: use wake_up_process() to start
 * it.  See also kthread_run().
 *
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give -1.
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either call do_exit() directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
@@ -129,15 +146,17 @@
 *
 * Returns a task_struct or ERR_PTR(-ENOMEM).
 */
-struct task_struct *kthread_create(int (*threadfn)(void *data),
-				   void *data,
-				   const char namefmt[],
-				   ...)
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+					   void *data,
+					   int node,
+					   const char namefmt[],
+					   ...)
 {
 	struct kthread_create_info create;
 
 	create.threadfn = threadfn;
 	create.data = data;
+	create.node = node;
 	init_completion(&create.done);
 
 	spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	}
 	return create.result;
 }
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_on_node);
 
 /**
 * kthread_bind - bind a just-created kthread to a cpu.
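Putting the new kthread API together, a minimal module-style sketch — the thread function and names are invented; only kthread_create_on_node() itself comes from this series:

    #include <linux/module.h>
    #include <linux/kthread.h>
    #include <linux/err.h>
    #include <linux/sched.h>
    #include <linux/topology.h>	/* cpu_to_node() */

    static struct task_struct *demo_task;

    static int demo_fn(void *data)
    {
    	while (!kthread_should_stop())
    		schedule();
    	return 0;
    }

    static int __init demo_init(void)
    {
    	/* ask for the stack on CPU 0's memory node, then bind there */
    	demo_task = kthread_create_on_node(demo_fn, NULL,
    					   cpu_to_node(0), "demo/%d", 0);
    	if (IS_ERR(demo_task))
    		return PTR_ERR(demo_task);
    	kthread_bind(demo_task, 0);
    	wake_up_process(demo_task);
    	return 0;
    }

    static void __exit demo_exit(void)
    {
    	kthread_stop(demo_task);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");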
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..1f9f7bc56ca1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 {
 	struct module_sect_attr *sattr =
 		container_of(mattr, struct module_sect_attr, mattr);
-	return sprintf(buf, "0x%lx\n", sattr->address);
+	return sprintf(buf, "0x%pK\n", (void *)sattr->address);
 }
 
 static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
 		   mod->state == MODULE_STATE_COMING ? "Loading":
 		   "Live");
 	/* Used by oprofile and other similar tools. */
-	seq_printf(m, " 0x%p", mod->module_core);
+	seq_printf(m, " 0x%pK", mod->module_core);
 
 	/* Taints info */
 	if (mod->taints)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..a05d191ffdd9 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_ns;
 	}
 
-	new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+	new_nsp->uts_ns = copy_utsname(flags, tsk);
 	if (IS_ERR(new_nsp->uts_ns)) {
 		err = PTR_ERR(new_nsp->uts_ns);
 		goto out_uts;
 	}
 
-	new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+	new_nsp->ipc_ns = copy_ipcs(flags, tsk);
 	if (IS_ERR(new_nsp->ipc_ns)) {
 		err = PTR_ERR(new_nsp->ipc_ns);
 		goto out_ipc;
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
 
 core_param(panic, panic_timeout, int, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
+
+static int __init oops_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	if (!strcmp(s, "panic"))
+		panic_on_oops = 1;
+	return 0;
+}
+early_param("oops", oops_setup);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/acct.h>
 #include <linux/slab.h>
+#include <linux/proc_fs.h>
 
 #define BITS_PER_PAGE	(PAGE_SIZE*8)
 
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 {
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
-	int i;
+	int i, err = -ENOMEM;
 
 	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
 	if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 	for (i = 1; i < PIDMAP_ENTRIES; i++)
 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
 
+	err = pid_ns_prepare_proc(ns);
+	if (err)
+		goto out_put_parent_pid_ns;
+
 	return ns;
 
+out_put_parent_pid_ns:
+	put_pid_ns(parent_pid_ns);
 out_free_map:
 	kfree(ns->pidmap[0].page);
 out_free:
 	kmem_cache_free(pid_ns_cachep, ns);
 out:
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(err);
 }
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/printk.c b/kernel/printk.c
index 33284adb2189..da8ca817eae3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
 #define __LOG_BUF_LEN	(1 << CONFIG_LOG_BUF_SHIFT)
 
 /* printk's without a loglevel use this.. */
-#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
+#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
 
 /* We show everything that is MORE important than this.. */
 #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +113,11 @@ static unsigned con_start;	/* Index into log_buf: next char to be sent to consoles */
 static unsigned log_end;	/* Index into log_buf: most-recently-written-char + 1 */
 
 /*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
 * Array of consoles built from command line options (console=)
 */
 struct console_cmdline
@@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
 	struct console *con;
 
 	for_each_console(con) {
+		if (exclusive_console && con != exclusive_console)
+			continue;
 		if ((con->flags & CON_ENABLED) && con->write &&
 				(cpu_online(smp_processor_id()) ||
 				(con->flags & CON_ANYTIME)))
@@ -1230,6 +1237,11 @@ void console_unlock(void)
 		local_irq_restore(flags);
 	}
 	console_locked = 0;
+
+	/* Release the exclusive_console once it is used */
+	if (unlikely(exclusive_console))
+		exclusive_console = NULL;
+
 	up(&console_sem);
 	spin_unlock_irqrestore(&logbuf_lock, flags);
 	if (wake_klogd)
@@ -1316,6 +1328,18 @@ void console_start(struct console *console)
 }
 EXPORT_SYMBOL(console_start);
 
+static int __read_mostly keep_bootcon;
+
+static int __init keep_bootcon_setup(char *str)
+{
+	keep_bootcon = 1;
+	printk(KERN_INFO "debug: skip boot console de-registration.\n");
+
+	return 0;
+}
+
+early_param("keep_bootcon", keep_bootcon_setup);
+
 /*
 * The console driver calls this routine during kernel initialization
 * to register the console printing procedure with printk() and to
@@ -1452,6 +1476,12 @@ void register_console(struct console *newcon)
 		spin_lock_irqsave(&logbuf_lock, flags);
 		con_start = log_start;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
+		/*
+		 * We're about to replay the log buffer.  Only do this to the
+		 * just-registered console to avoid excessive message spam to
+		 * the already-registered consoles.
+		 */
+		exclusive_console = newcon;
 	}
 	console_unlock();
 	console_sysfs_notify();
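In practice the two printk changes combine on the command line: the boot console can be kept registered alongside the real one, while the buffered log is replayed only to the newly registered console. An illustrative (not exhaustive) x86 boot line:

    earlyprintk=serial,ttyS0,115200 console=ttyS0,115200 keep_bootcon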
@@ -1463,7 +1493,9 @@ void register_console(struct console *newcon)
 	 * users know there might be something in the kernel's log buffer that
 	 * went to the bootconsole (that they do not see on the real console)
 	 */
-	if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+	if (bcon &&
+	    ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
+	    !keep_bootcon) {
 		/* we need to iterate through twice, to make sure we print
 		 * everything out, before we unregister the console(s)
 		 */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e2302e40b360..0fc1eed28d27 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 		return 0;
 	rcu_read_lock();
 	tcred = __task_cred(task);
-	if ((cred->uid != tcred->euid ||
-	     cred->uid != tcred->suid ||
-	     cred->uid != tcred->uid ||
-	     cred->gid != tcred->egid ||
-	     cred->gid != tcred->sgid ||
-	     cred->gid != tcred->gid) &&
-	    !capable(CAP_SYS_PTRACE)) {
-		rcu_read_unlock();
-		return -EPERM;
-	}
+	if (cred->user->user_ns == tcred->user->user_ns &&
+	    (cred->uid == tcred->euid &&
+	     cred->uid == tcred->suid &&
+	     cred->uid == tcred->uid &&
+	     cred->gid == tcred->egid &&
+	     cred->gid == tcred->sgid &&
+	     cred->gid == tcred->gid))
+		goto ok;
+	if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto ok;
+	rcu_read_unlock();
+	return -EPERM;
+ok:
 	rcu_read_unlock();
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable && !capable(CAP_SYS_PTRACE))
+	if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
 		return -EPERM;
 
 	return security_ptrace_access_check(task, mode);
@@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task)
 		goto unlock_tasklist;
 
 	task->ptrace = PT_PTRACED;
-	if (capable(CAP_SYS_PTRACE))
+	if (task_ns_capable(task, CAP_SYS_PTRACE))
 		task->ptrace |= PT_PTRACE_CAP;
 
 	__ptrace_link(task, current);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
 			pos, buf, s - buf);
 }
 
+#if BITS_PER_LONG == 32
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+	unsigned long flags;
+	u64 ret;
+
+	spin_lock_irqsave(&counter->lock, flags);
+	ret = *res_counter_member(counter, member);
+	spin_unlock_irqrestore(&counter->lock, flags);
+
+	return ret;
+}
+#else
 u64 res_counter_read_u64(struct res_counter *counter, int member)
 {
 	return *res_counter_member(counter, member);
 }
+#endif
 
 int res_counter_memparse_write_strategy(const char *buf,
 					unsigned long long *res)
diff --git a/kernel/sched.c b/kernel/sched.c
index a172494a9a63..480adeb63f8f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4892,8 +4892,11 @@ static bool check_same_owner(struct task_struct *p)
 
 	rcu_read_lock();
 	pcred = __task_cred(p);
-	match = (cred->euid == pcred->euid ||
-		 cred->euid == pcred->uid);
+	if (cred->user->user_ns == pcred->user->user_ns)
+		match = (cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid);
+	else
+		match = false;
 	rcu_read_unlock();
 	return match;
 }
@@ -5221,7 +5224,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p);
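The res_counter change above exists because a 64-bit load is not single-copy atomic on 32-bit: it compiles to two loads, so a concurrent writer can be observed halfway (a "torn read"). A user-space sketch of the same guard, with a pthread mutex standing in for the counter spinlock; like the kernel code, the 64-bit branch relies on the platform's atomicity of aligned loads:

    #include <pthread.h>
    #include <stdint.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t counter;

    uint64_t counter_read(void)
    {
    #if UINTPTR_MAX == 0xffffffffu	/* 32-bit: take the lock to read */
    	uint64_t v;
    	pthread_mutex_lock(&lock);
    	v = counter;
    	pthread_mutex_unlock(&lock);
    	return v;
    #else				/* 64-bit: one aligned load */
    	return counter;
    #endif
    }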
diff --git a/kernel/signal.c b/kernel/signal.c
index 31751868de88..324eff5468ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -636,13 +636,33 @@ static inline bool si_fromuser(const struct siginfo *info)
 }
 
 /*
+ * called with RCU read lock from check_kill_permission()
+ */
+static int kill_ok_by_cred(struct task_struct *t)
+{
+	const struct cred *cred = current_cred();
+	const struct cred *tcred = __task_cred(t);
+
+	if (cred->user->user_ns == tcred->user->user_ns &&
+	    (cred->euid == tcred->suid ||
+	     cred->euid == tcred->uid ||
+	     cred->uid == tcred->suid ||
+	     cred->uid == tcred->uid))
+		return 1;
+
+	if (ns_capable(tcred->user->user_ns, CAP_KILL))
+		return 1;
+
+	return 0;
+}
+
+/*
 * Bad permissions for sending the signal
 * - the caller must hold the RCU read lock
 */
 static int check_kill_permission(int sig, struct siginfo *info,
 				 struct task_struct *t)
 {
-	const struct cred *cred, *tcred;
 	struct pid *sid;
 	int error;
 
@@ -656,14 +676,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (error)
 		return error;
 
-	cred = current_cred();
-	tcred = __task_cred(t);
 	if (!same_thread_group(current, t) &&
-	    (cred->euid ^ tcred->suid) &&
-	    (cred->euid ^ tcred->uid) &&
-	    (cred->uid ^ tcred->suid) &&
-	    (cred->uid ^ tcred->uid) &&
-	    !capable(CAP_KILL)) {
+	    !kill_ok_by_cred(t)) {
 		switch (sig) {
 		case SIGCONT:
 			sid = task_session(t);
diff --git a/kernel/smp.c b/kernel/smp.c
index 7cbd0f293df4..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void)
 }
 #endif /* USE_GENERIC_SMP_HELPERS */
 
+/* Setup configured maximum number of CPUs to activate */
+unsigned int setup_max_cpus = NR_CPUS;
+EXPORT_SYMBOL(setup_max_cpus);
+
+
+/*
+ * Setup routine for controlling SMP activation
+ *
+ * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
+ * activation entirely (the MPS table probe still happens, though).
+ *
+ * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
+ * greater than 0, limits the maximum number of CPUs activated in
+ * SMP mode to <NUM>.
+ */
+
+void __weak arch_disable_smp_support(void) { }
+
+static int __init nosmp(char *str)
+{
+	setup_max_cpus = 0;
+	arch_disable_smp_support();
+
+	return 0;
+}
+
+early_param("nosmp", nosmp);
+
+/* this is hard limit */
+static int __init nrcpus(char *str)
+{
+	int nr_cpus;
+
+	get_option(&str, &nr_cpus);
+	if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+		nr_cpu_ids = nr_cpus;
+
+	return 0;
+}
+
+early_param("nr_cpus", nrcpus);
+
+static int __init maxcpus(char *str)
+{
+	get_option(&str, &setup_max_cpus);
+	if (setup_max_cpus == 0)
+		arch_disable_smp_support();
+
+	return 0;
+}
+
+early_param("maxcpus", maxcpus);
+
+/* Setup number of possible processor ids */
+int nr_cpu_ids __read_mostly = NR_CPUS;
+EXPORT_SYMBOL(nr_cpu_ids);
+
+/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
+void __init setup_nr_cpu_ids(void)
+{
+	nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
+}
+
+/* Called by boot processor to activate the rest. */
+void __init smp_init(void)
+{
+	unsigned int cpu;
+
+	/* FIXME: This should be done in userspace --RR */
+	for_each_present_cpu(cpu) {
+		if (num_online_cpus() >= setup_max_cpus)
+			break;
+		if (!cpu_online(cpu))
+			cpu_up(cpu);
+	}
+
+	/* Any cleanup work */
+	printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
+	smp_cpus_done(setup_max_cpus);
+}
+
 /*
 * Call a function on all processors.  May be used during early boot while
 * early_boot_irqs_disabled is set.  Use local_irq_save/restore() instead
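setup_nr_cpu_ids() reduces to "index of the highest set bit in cpu_possible_mask, plus one". A self-contained sketch of the same computation over a single word instead of the kernel bitmap:

    #include <stdio.h>

    static int nr_ids(unsigned long possible_mask)
    {
    	int last = -1;

    	for (int i = 0; i < (int)(8 * sizeof(possible_mask)); i++)
    		if (possible_mask & (1UL << i))
    			last = i;	/* poor man's find_last_bit() */
    	return last + 1;
    }

    int main(void)
    {
    	printf("%d\n", nr_ids(0x13UL));	/* CPUs 0,1,4 possible -> 5 ids */
    	return 0;
    }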
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 56e5dec837f0..735d87095172 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+		p = kthread_create_on_node(run_ksoftirqd,
+					   hcpu,
+					   cpu_to_node(hotcpu),
+					   "ksoftirqd/%d", hotcpu);
 		if (IS_ERR(p)) {
 			printk("ksoftirqd for %i failed\n", hotcpu);
 			return notifier_from_errno(PTR_ERR(p));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
 	case CPU_UP_PREPARE:
 		BUG_ON(stopper->thread || stopper->enabled ||
 		       !list_empty(&stopper->works));
-		p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
-				   cpu);
+		p = kthread_create_on_node(cpu_stopper_thread,
+					   stopper,
+					   cpu_to_node(cpu),
+					   "migration/%d", cpu);
 		if (IS_ERR(p))
 			return notifier_from_errno(PTR_ERR(p));
 		get_task_struct(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 1ad48b3b9068..af468edf096a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -120,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
 void (*pm_power_off_prepare)(void);
 
 /*
+ * Returns true if current's euid is same as p's uid or euid,
+ * or has CAP_SYS_NICE to p's user_ns.
+ *
+ * Called with rcu_read_lock, creds are safe
+ */
+static bool set_one_prio_perm(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
+
+	if (pcred->user->user_ns == cred->user->user_ns &&
+	    (pcred->uid == cred->euid ||
+	     pcred->euid == cred->euid))
+		return true;
+	if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
+		return true;
+	return false;
+}
+
+/*
 * set the priority of a task
 * - the caller must hold the RCU read lock
 */
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
-	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
 	int no_nice;
 
-	if (pcred->uid != cred->euid &&
-	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
+	if (!set_one_prio_perm(p)) {
 		error = -EPERM;
 		goto out;
 	}
@@ -506,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 	if (rgid != (gid_t) -1) {
 		if (old->gid == rgid ||
 		    old->egid == rgid ||
-		    capable(CAP_SETGID))
+		    nsown_capable(CAP_SETGID))
 			new->gid = rgid;
 		else
 			goto error;
@@ -515,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 		if (old->gid == egid ||
 		    old->egid == egid ||
 		    old->sgid == egid ||
-		    capable(CAP_SETGID))
+		    nsown_capable(CAP_SETGID))
 			new->egid = egid;
 		else
 			goto error;
@@ -550,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (capable(CAP_SETGID))
+	if (nsown_capable(CAP_SETGID))
 		new->gid = new->egid = new->sgid = new->fsgid = gid;
 	else if (gid == old->gid || gid == old->sgid)
 		new->egid = new->fsgid = gid;
@@ -617,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 		new->uid = ruid;
 		if (old->uid != ruid &&
 		    old->euid != ruid &&
-		    !capable(CAP_SETUID))
+		    !nsown_capable(CAP_SETUID))
 			goto error;
 	}
 
@@ -626,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 		if (old->uid != euid &&
 		    old->euid != euid &&
 		    old->suid != euid &&
-		    !capable(CAP_SETUID))
+		    !nsown_capable(CAP_SETUID))
 			goto error;
 	}
 
@@ -674,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (capable(CAP_SETUID)) {
+	if (nsown_capable(CAP_SETUID)) {
 		new->suid = new->uid = uid;
 		if (uid != old->uid) {
 			retval = set_user(new);
@@ -716,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (!capable(CAP_SETUID)) {
+	if (!nsown_capable(CAP_SETUID)) {
 		if (ruid != (uid_t) -1 && ruid != old->uid &&
 		    ruid != old->euid && ruid != old->suid)
 			goto error;
@@ -780,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (!capable(CAP_SETGID)) {
+	if (!nsown_capable(CAP_SETGID)) {
 		if (rgid != (gid_t) -1 && rgid != old->gid &&
 		    rgid != old->egid && rgid != old->sgid)
 			goto error;
@@ -840,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 
 	if (uid == old->uid || uid == old->euid ||
 	    uid == old->suid || uid == old->fsuid ||
-	    capable(CAP_SETUID)) {
+	    nsown_capable(CAP_SETUID)) {
 		if (uid != old_fsuid) {
 			new->fsuid = uid;
 			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -873,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 
 	if (gid == old->gid || gid == old->egid ||
 	    gid == old->sgid || gid == old->fsgid ||
-	    capable(CAP_SETGID)) {
+	    nsown_capable(CAP_SETGID)) {
 		if (gid != old_fsgid) {
 			new->fsgid = gid;
 			goto change_okay;
@@ -1181,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
+
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
 	down_write(&uts_sem);
@@ -1230,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
@@ -1345,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		rlim = tsk->signal->rlim + resource;
 		task_lock(tsk->group_leader);
 		if (new_rlim) {
+			/* Keep the capable check against init_user_ns until
+			   cgroups can contain all limits */
 			if (new_rlim->rlim_max > rlim->rlim_max &&
 					!capable(CAP_SYS_RESOURCE))
 				retval = -EPERM;
@@ -1388,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
 {
 	const struct cred *cred = current_cred(), *tcred;
 
-	tcred = __task_cred(task);
-	if (current != task &&
-	    (cred->uid != tcred->euid ||
-	     cred->uid != tcred->suid ||
-	     cred->uid != tcred->uid ||
-	     cred->gid != tcred->egid ||
-	     cred->gid != tcred->sgid ||
-	     cred->gid != tcred->gid) &&
-	     !capable(CAP_SYS_RESOURCE)) {
-		return -EPERM;
-	}
+	if (current == task)
+		return 0;
 
-	return 0;
+	tcred = __task_cred(task);
+	if (cred->user->user_ns == tcred->user->user_ns &&
+	    (cred->uid == tcred->euid &&
+	     cred->uid == tcred->suid &&
+	     cred->uid == tcred->uid &&
+	     cred->gid == tcred->egid &&
+	     cred->gid == tcred->sgid &&
+	     cred->gid == tcred->gid))
+		return 0;
+	if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+		return 0;
+
+	return -EPERM;
 }
 
 SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 40245d697602..c0bb32414b17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,7 @@ static int neg_one = -1;
 static int zero;
 static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 #ifdef CONFIG_PRINTK
@@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
 			       void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses it's own private copy */
 static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -706,7 +712,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &kptr_restrict,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_dmesg_restrict,
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
@@ -971,14 +977,18 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_overcommit_memory,
 		.maxlen		= sizeof(sysctl_overcommit_memory),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "panic_on_oom",
 		.data		= &sysctl_panic_on_oom,
 		.maxlen		= sizeof(sysctl_panic_on_oom),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "oom_kill_allocating_task",
@@ -1006,7 +1016,8 @@ static struct ctl_table vm_table[] = {
 		.data		= &page_cluster,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
 	},
 	{
 		.procname	= "dirty_background_ratio",
@@ -1054,7 +1065,8 @@ static struct ctl_table vm_table[] = {
 		.data		= &dirty_expire_interval,
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
 	},
 	{
 		.procname	= "nr_pdflush_threads",
@@ -1130,6 +1142,8 @@ static struct ctl_table vm_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= drop_caches_sysctl_handler,
+		.extra1		= &one,
+		.extra2		= &three,
 	},
 #ifdef CONFIG_COMPACTION
 	{
@@ -2385,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
 	return err;
 }
 
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
 struct do_proc_dointvec_minmax_conv_param {
 	int *min;
 	int *max;
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 		const char *fail = NULL;
 
 		if (table->parent) {
-			if (table->procname && !table->parent->procname)
+			if (!table->parent->procname)
 				set_fail(&fail, table, "Parent without procname");
 		}
-		if (!table->procname)
-			set_fail(&fail, table, "No procname");
 		if (table->child) {
 			if (table->data)
 				set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 				set_fail(&fail, table, "No maxlen");
 		}
 #ifdef CONFIG_PROC_SYSCTL
-		if (table->procname && !table->proc_handler)
+		if (!table->proc_handler)
 			set_fail(&fail, table, "No proc_handler");
 #endif
-#if 0
-		if (!table->procname && table->proc_handler)
-			set_fail(&fail, table, "proc_handler without procname");
-#endif
 		sysctl_check_leaf(namespaces, table, &fail);
 	}
 	if (table->mode > 0777)
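The effect of routing kptr_restrict (and dmesg_restrict) through proc_dmesg_restrict() is that writes need CAP_SYS_ADMIN even when the 0644 file mode would otherwise allow them. An illustrative shell session (values assumed):

    # sysctl kernel.kptr_restrict
    kernel.kptr_restrict = 1
    # sysctl -w kernel.kptr_restrict=0    (write now requires CAP_SYS_ADMIN)
    kernel.kptr_restrict = 0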
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
 		goto err_cgroup_ops;
 
 	family_registered = 1;
-	printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
+	pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
 	return 0;
 err_cgroup_ops:
 	genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
 	struct group_info *group_info;
 	int retval;
 
-	if (!capable(CAP_SETGID))
+	if (!nsown_capable(CAP_SETGID))
 		return -EPERM;
 	if ((unsigned)gidsetsize > NGROUPS_MAX)
 		return -EINVAL;
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
 #include <linux/module.h>
 #include <linux/user_namespace.h>
 
+/*
+ * userns count is 1 for root user, 1 for init_uts_ns,
+ * and 1 for... ?
+ */
 struct user_namespace init_user_ns = {
 	.kref = {
-		.refcount	= ATOMIC_INIT(2),
+		.refcount	= ATOMIC_INIT(3),
 	},
 	.creator = &root_user,
 };
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
 */
 static DEFINE_SPINLOCK(uidhash_lock);
 
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(2),
 	.processes	= ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..44646179eaba 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/user_namespace.h>
 
 static struct uts_namespace *create_uts_ns(void)
 {
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
 * @old_ns: namespace to clone
 * Return NULL on error (failure to kmalloc), new ns otherwise
 */
-static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+					  struct uts_namespace *old_ns)
 {
 	struct uts_namespace *ns;
 
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
 
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
 	up_read(&uts_sem);
 	return ns;
 }
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
 * utsname of this process won't be seen by parent, and vice
 * versa.
 */
-struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
+struct uts_namespace *copy_utsname(unsigned long flags,
+				   struct task_struct *tsk)
 {
+	struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
 	struct uts_namespace *new_ns;
 
 	BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
 	if (!(flags & CLONE_NEWUTS))
 		return old_ns;
 
-	new_ns = clone_uts_ns(old_ns);
+	new_ns = clone_uts_ns(tsk, old_ns);
 
 	put_uts_ns(old_ns);
 	return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
 	struct uts_namespace *ns;
 
 	ns = container_of(kref, struct uts_namespace, kref);
+	put_user_ns(ns->user_ns);
 	kfree(ns);
 }
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c57..140dce750450 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic;
+static int hardlockup_panic =
+			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
 
 static int __init hardlockup_panic_setup(char *str)
 {
 	if (!strncmp(str, "panic", 5))
 		hardlockup_panic = 1;
+	else if (!strncmp(str, "nopanic", 7))
+		hardlockup_panic = 0;
 	else if (!strncmp(str, "0", 1))
 		watchdog_enabled = 0;
 	return 1;
@@ -415,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu)
 static int watchdog_enable(int cpu)
 {
 	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
-	int err;
+	int err = 0;
 
 	/* enable the perf event */
 	err = watchdog_nmi_enable(cpu);
-	if (err)
-		return err;
+
+	/* Regardless of err above, fall through and start softlockup */
 
 	/* create the watchdog thread */
 	if (!p) {
 		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
 		if (IS_ERR(p)) {
 			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-			return PTR_ERR(p);
+			if (!err)
+				/* if hardlockup hasn't already set this */
+				err = PTR_ERR(p);
+			goto out;
 		}
 		kthread_bind(p, cpu);
 		per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +441,8 @@ static int watchdog_enable(int cpu)
 		wake_up_process(p);
 	}
 
-	return 0;
+out:
+	return err;
 }
 
 static void watchdog_disable(int cpu)
@@ -547,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
#endif /* CONFIG_HOTPLUG_CPU */
 	}
-	return notifier_from_errno(err);
+
+	/*
+	 * hardlockup and softlockup are not important enough
+	 * to block cpu bring up.  Just always succeed and
+	 * rely on printk output to flag problems.
+	 */
+	return NOTIFY_OK;
 }
 
 static struct notifier_block __cpuinitdata cpu_nfb = {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5ca7ce9ce754..04ef830690ec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 	worker->id = id;
 
 	if (!on_unbound_cpu)
-		worker->task = kthread_create(worker_thread, worker,
-					      "kworker/%u:%d", gcwq->cpu, id);
+		worker->task = kthread_create_on_node(worker_thread,
+						      worker,
+						      cpu_to_node(gcwq->cpu),
+						      "kworker/%u:%d", gcwq->cpu, id);
 	else
 		worker->task = kthread_create(worker_thread, worker,
 					      "kworker/u:%d", id);
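The watchdog notifier change encodes a policy: per-CPU watchdog setup may fail, but it must never veto CPU bring-up. A kernel-style sketch of the pattern — do_percpu_setup() is a hypothetical helper, not a real API:

    static int demo_cpu_callback(struct notifier_block *nb,
    			     unsigned long action, void *hcpu)
    {
    	int err = do_percpu_setup((long)hcpu);	/* hypothetical helper */

    	if (err)
    		printk(KERN_WARNING "setup for CPU %ld failed: %d\n",
    		       (long)hcpu, err);
    	return NOTIFY_OK;	/* never notifier_from_errno(err) here */
    }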