From adbe427b92d18cf3f801e82e9cd664fbe31cea3c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 15 Apr 2015 16:13:00 -0700 Subject: memcg: zap mem_cgroup_lookup() mem_cgroup_lookup() is a wrapper around mem_cgroup_from_id(), which checks that id != 0 before issuing the function call. Today, there is no point in this additional check apart from optimization, because there is no css with id <= 0, so that css_from_id, called by mem_cgroup_from_id, will return NULL for any id <= 0. Since mem_cgroup_from_id is only called from mem_cgroup_lookup, let us zap mem_cgroup_lookup, substituting calls to it with mem_cgroup_from_id and moving the check if id > 0 to css_from_id. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a220fdb66568..59aa339a245c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5451,7 +5451,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return idr_find(&ss->css_idr, id); + return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } #ifdef CONFIG_CGROUP_DEBUG -- cgit v1.2.3 From 5bbe3547aa3ba5242366a322a28996872301b703 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Wed, 15 Apr 2015 16:13:20 -0700 Subject: mm: allow compaction of unevictable pages Currently, pages which are marked as unevictable are protected from compaction, but not from other types of migration. The POSIX real time extension explicitly states that mlock() will prevent a major page fault, but the spirit of this is that mlock() should give a process the ability to control sources of latency, including minor page faults. However, the mlock manpage only explicitly says that a locked page will not be written to swap and this can cause some confusion. The compaction code today does not give a developer who wants to avoid swap but wants to have large contiguous areas available any method to achieve this state. This patch introduces a sysctl for controlling compaction behavior with respect to the unevictable lru. Users who demand no page faults after a page is present can set compact_unevictable_allowed to 0 and users who need the large contiguous areas can enable compaction on locked memory by leaving the default value of 1. To illustrate this problem I wrote a quick test program that mmaps a large number of 1MB files filled with random data. These maps are created locked and read only. Then every other mmap is unmapped and I attempt to allocate huge pages to the static huge page pool. When the compact_unevictable_allowed sysctl is 0, I cannot allocate hugepages after fragmenting memory. When the value is set to 1, allocations succeed. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: David Rientjes Acked-by: Rik van Riel Cc: Vlastimil Babka Cc: Thomas Gleixner Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c0eabd41886..42b7fc2860c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1335,6 +1335,15 @@ static struct ctl_table vm_table[] = { .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_COMPACTION */ { -- cgit v1.2.3 From 2813893f8b197a14f1e1ddb04d99bce46817c84a Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Wed, 15 Apr 2015 16:16:41 -0700 Subject: kernel: conditionally support non-root users, groups and capabilities There are a lot of embedded systems that run most or all of their functionality in init, running as root:root. For these systems, supporting multiple users is not necessary. This patch adds a new symbol, CONFIG_MULTIUSER, that makes support for non-root users, non-root groups, and capabilities optional. It is enabled under CONFIG_EXPERT menu. When this symbol is not defined, UID and GID are zero in any possible case and processes always have all capabilities. The following syscalls are compiled out: setuid, setregid, setgid, setreuid, setresuid, getresuid, setresgid, getresgid, setgroups, getgroups, setfsuid, setfsgid, capget, capset. Also, groups.c is compiled out completely. In kernel/capability.c, capable function was moved in order to avoid adding two ifdef blocks. This change saves about 25 KB on a defconfig build. The most minimal kernels have total text sizes in the high hundreds of kB rather than low MB. (The 25k goes down a bit with allnoconfig, but not that much. The kernel was booted in Qemu. All the common functionalities work. Adding users/groups is not possible, failing with -ENOSYS. Bloat-o-meter output: add/remove: 7/87 grow/shrink: 19/397 up/down: 1675/-26325 (-24650) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Iulia Manda Reviewed-by: Josh Triplett Acked-by: Geert Uytterhoeven Tested-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 4 +++- kernel/capability.c | 35 +++++++++++++++++++---------------- kernel/cred.c | 3 +++ kernel/groups.c | 3 --- kernel/sys.c | 2 ++ kernel/sys_ni.c | 14 ++++++++++++++ 6 files changed, 41 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1408b3353a3c..0f8f8b0bc1bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o smpboot.o + async.o range.o smpboot.o + +obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/capability.c b/kernel/capability.c index 989f5bfc57dc..45432b54d5c6 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str) } __setup("no_file_caps", file_caps_disable); +#ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * @@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); + +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); +#endif /* CONFIG_MULTIUSER */ + /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check @@ -411,22 +430,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, } EXPORT_SYMBOL(file_ns_capable); -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question diff --git a/kernel/cred.c b/kernel/cred.c index e0573a43c7df..ec1c07667ec1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -29,6 +29,9 @@ static struct kmem_cache *cred_jar; +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + /* * The initial credentials for the initial task */ diff --git a/kernel/groups.c b/kernel/groups.c index 664411f171b5..74d431d25251 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -9,9 +9,6 @@ #include #include -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - struct group_info *groups_alloc(int gidsetsize) { struct group_info *group_info; diff --git a/kernel/sys.c b/kernel/sys.c index a03d9cd23ed7..3be344902316 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -325,6 +325,7 @@ out_unlock: * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ +#ifdef CONFIG_MULTIUSER SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { struct user_namespace *ns = current_user_ns(); @@ -815,6 +816,7 @@ change_okay: commit_creds(new); return old_fsgid; } +#endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5adcb0ae3a58..7995ef5868d8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -159,6 +159,20 @@ cond_syscall(sys_uselib); cond_syscall(sys_fadvise64); cond_syscall(sys_fadvise64_64); cond_syscall(sys_madvise); +cond_syscall(sys_setuid); +cond_syscall(sys_setregid); +cond_syscall(sys_setgid); +cond_syscall(sys_setreuid); +cond_syscall(sys_setresuid); +cond_syscall(sys_getresuid); +cond_syscall(sys_setresgid); +cond_syscall(sys_getresgid); +cond_syscall(sys_setgroups); +cond_syscall(sys_getgroups); +cond_syscall(sys_setfsuid); +cond_syscall(sys_setfsgid); +cond_syscall(sys_capget); +cond_syscall(sys_capset); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3 From 96831c0a6738f88f89e7012f4df0a747514af0a0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 15 Apr 2015 16:16:44 -0700 Subject: kernel/resource.c: remove deprecated __check_region() and friends All users of __check_region(), check_region(), and check_mem_region() are gone. We got rid of the last user in v4.0-rc1. Remove them. bloat-o-meter on x86_64 shows: add/remove: 0/3 grow/shrink: 0/0 up/down: 0/-102 (-102) function old new delta __kstrtab___check_region 15 - -15 __ksymtab___check_region 16 - -16 __check_region 71 - -71 Signed-off-by: Jakub Sitnicki Cc: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 19f2357dfda3..90552aab5f2d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res) * * request_region creates a new busy region. * - * check_region returns non-zero if the area is already busy. - * * release_region releases a matching busy region. */ @@ -1097,36 +1095,6 @@ struct resource * __request_region(struct resource *parent, } EXPORT_SYMBOL(__request_region); -/** - * __check_region - check if a resource region is busy or free - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * Returns 0 if the region is free at the moment it is checked, - * returns %-EBUSY if the region is busy. - * - * NOTE: - * This function is deprecated because its use is racy. - * Even if it returns 0, a subsequent call to request_region() - * may fail because another driver etc. just allocated the region. - * Do NOT use it. It will be removed from the kernel. - */ -int __check_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource * res; - - res = __request_region(parent, start, n, "check-region", 0); - if (!res) - return -EBUSY; - - release_resource(res); - free_resource(res); - return 0; -} -EXPORT_SYMBOL(__check_region); - /** * __release_region - release a previously reserved resource region * @parent: parent resource descriptor -- cgit v1.2.3 From 972fae6993cbbb934345011664dc703c0891dda3 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Wed, 15 Apr 2015 16:16:47 -0700 Subject: kernel/hung_task.c: change hung_task.c to use for_each_process_thread() In check_hung_uninterruptible_tasks() avoid the use of deprecated while_each_thread(). The "max_count" logic will prevent a livelock - see commit 0c740d0a ("introduce for_each_thread() to replace the buggy while_each_thread()"). Having said this let's use for_each_process_thread(). Signed-off-by: Aaron Tomlin Acked-by: Oleg Nesterov Cc: David Rientjes Cc: Dave Wysochanski Cc: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06db12434d72..e0f90c2b57aa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) return; rcu_read_lock(); - do_each_thread(g, t) { + for_each_process_thread(g, t) { if (!max_count--) goto unlock; if (!--batch_count) { @@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); - } while_each_thread(g, t); + } unlock: rcu_read_unlock(); } -- cgit v1.2.3 From 7a54f46b301cfab8a0d7365aa186545f8b98f22e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 15 Apr 2015 16:16:53 -0700 Subject: kernel/reboot.c: add orderly_reboot for graceful reboot The kernel has orderly_poweroff which allows the kernel to initiate a graceful shutdown of userspace, by running /sbin/poweroff. This adds orderly_reboot that will cause userspace to shut itself down by calling /sbin/reboot. This will be used for shutdown initiated by a system controller on platforms that do not use ACPI. orderly_reboot() should be used when the system wants to allow userspace to gracefully shut itself down. For cases where the system may imminently catch on fire, the existing emergency_restart() provides an immediate reboot without involving userspace. Signed-off-by: Joel Stanley Cc: Fabian Frederick Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Rusty Russell Cc: Jeremy Kerr Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 5925f5ae8dff..d20c85d9f8c0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -387,8 +387,9 @@ void ctrl_alt_del(void) } char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +static const char reboot_cmd[] = "/sbin/reboot"; -static int __orderly_poweroff(bool force) +static int run_cmd(const char *cmd) { char **argv; static char *envp[] = { @@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force) NULL }; int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + argv = argv_split(GFP_KERNEL, cmd, NULL); if (argv) { ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); @@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force) ret = -ENOMEM; } + return ret; +} + +static int __orderly_reboot(void) +{ + int ret; + + ret = run_cmd(reboot_cmd); + + if (ret) { + pr_warn("Failed to start orderly reboot: forcing the issue\n"); + emergency_sync(); + kernel_restart(NULL); + } + + return ret; +} + +static int __orderly_poweroff(bool force) +{ + int ret; + + ret = run_cmd(poweroff_cmd); + if (ret && force) { pr_warn("Failed to start orderly shutdown: forcing the issue\n"); + /* * I guess this should try to kick off some daemon to sync and * poweroff asap. Or not even bother syncing if we're doing an @@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func); * This may be called from any context to trigger a system shutdown. * If the orderly shutdown fails, it will force an immediate shutdown. */ -int orderly_poweroff(bool force) +void orderly_poweroff(bool force) { if (force) /* do not override the pending "true" */ poweroff_force = true; schedule_work(&poweroff_work); - return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); +static void reboot_work_func(struct work_struct *work) +{ + __orderly_reboot(); +} + +static DECLARE_WORK(reboot_work, reboot_work_func); + +/** + * orderly_reboot - Trigger an orderly system reboot + * + * This may be called from any context to trigger a system reboot. + * If the orderly reboot fails, it will force an immediate reboot. + */ +void orderly_reboot(void) +{ + schedule_work(&reboot_work); +} +EXPORT_SYMBOL_GPL(orderly_reboot); + static int __init reboot_setup(char *str) { for (;;) { -- cgit v1.2.3 From 94ff212d096d96afea29e41585b0d8ce41087c0f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:20 -0700 Subject: cgroup: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 59aa339a245c..469dd547770c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4196,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) static int cgroup_pidlist_show(struct seq_file *s, void *v) { - return seq_printf(s, "%d\n", *(int *)v); + seq_printf(s, "%d\n", *(int *)v); + + return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, -- cgit v1.2.3 From 962e3707d9fb16bcf66ec5e5ebcea5248b9c2ab3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:22 -0700 Subject: tracing: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Miscellanea: o Remove unused return value from trace_lookup_stack Signed-off-by: Joe Perches Acked-by: Steven Rostedt Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_stack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c3e4fcfddd45..3f34496244e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p) local_irq_enable(); } -static int trace_lookup_stack(struct seq_file *m, long i) +static void trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; - return seq_printf(m, "%pS\n", (void *)addr); + seq_printf(m, "%pS\n", (void *)addr); } static void print_disabled(struct seq_file *m) -- cgit v1.2.3