diff options
Diffstat (limited to 'kernel')
166 files changed, 11304 insertions, 5942 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index eca595e2fd52..f70396e5a24b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -2,16 +2,15 @@ # Makefile for the linux kernel. # -obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ +obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o range.o -obj-y += groups.o + notifier.o ksysfs.o cred.o \ + async.o range.o groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg -CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif +obj-y += sched/ + obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o @@ -99,8 +99,8 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ -obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o +obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_PERF_EVENTS) += events/ @@ -109,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) -# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is -# needed for x86 only. Why this used to be enabled for all architectures is beyond -# me. I suspect most platforms don't need this, but until we know that for sure -# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k -# to get a correct value for the wait-channel (WCHAN in ps). --davidm -CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer -endif - $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. diff --git a/kernel/acct.c b/kernel/acct.c index fa7eb3de2ddc..02e6167a53b0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct, * the cache line to have the data after getting the lock. */ struct bsd_acct_struct { - volatile int active; - volatile int needcheck; + int active; + unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct timer_list timer; struct list_head list; }; @@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock); static LIST_HEAD(acct_list); /* - * Called whenever the timer says to check the free space. - */ -static void acct_timeout(unsigned long x) -{ - struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; - acct->needcheck = 1; -} - -/* * Check the amount of free space and suspend/resume accordingly. */ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) @@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) struct kstatfs sbuf; int res; int act; - sector_t resume; - sector_t suspend; + u64 resume; + u64 suspend; spin_lock(&acct_lock); res = acct->active; - if (!file || !acct->needcheck) + if (!file || time_is_before_jiffies(acct->needcheck)) goto out; spin_unlock(&acct_lock); @@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; - sector_div(suspend, 100); - sector_div(resume, 100); + do_div(suspend, 100); + do_div(resume, 100); if (sbuf.f_bavail <= suspend) act = -1; @@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) } } - del_timer(&acct->timer); - acct->needcheck = 0; - acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct->timer); + acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; res = acct->active; out: spin_unlock(&acct_lock); @@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (acct->file) { old_acct = acct->file; old_ns = acct->ns; - del_timer(&acct->timer); acct->active = 0; - acct->needcheck = 0; acct->file = NULL; acct->ns = NULL; list_del(&acct->list); @@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (file) { acct->file = file; acct->ns = ns; - acct->needcheck = 0; + acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; acct->active = 1; list_add(&acct->list, &acct_list); - /* It's been deleted if it was used before so this is safe */ - setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); - acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct->timer); } if (old_acct) { mnt_unpin(old_acct->f_path.mnt); @@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { + if (acct->file && acct->file->f_path.dentry->d_sb == sb) { acct_file_reopen(acct, NULL, NULL); goto restart; } @@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns) if (acct == NULL) return; - del_timer_sync(&acct->timer); spin_lock(&acct_lock); if (acct->file != NULL) acct_file_reopen(acct, NULL, NULL); @@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset((caddr_t)&ac, 0, sizeof(acct_t)); + memset(&ac, 0, sizeof(acct_t)); ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); @@ -613,8 +593,8 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); - pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_utime += current->utime; + pacct->ac_stime += current->stime; pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/async.c b/kernel/async.c index d5fe7af0de2e..bd0c168a3bbe 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel. #include <linux/async.h> #include <linux/atomic.h> #include <linux/ktime.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/wait.h> #include <linux/sched.h> #include <linux/slab.h> @@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; -extern int initcall_debug; - /* * MUST be called with the lock held! @@ -120,7 +118,7 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t calltime, delta, rettime; + ktime_t uninitialized_var(calltime), delta, rettime; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); @@ -269,7 +267,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); void async_synchronize_cookie_domain(async_cookie_t cookie, struct list_head *running) { - ktime_t starttime, delta, endtime; + ktime_t uninitialized_var(starttime), delta, endtime; if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); diff --git a/kernel/audit.c b/kernel/audit.c index f3ba55fa0b70..57e3f5107937 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -45,7 +45,7 @@ #include <asm/types.h> #include <linux/atomic.h> #include <linux/mm.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> @@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) - goto out; + goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } - va_end(args2); if (len > 0) skb_put(skb, len); +out_va_end: + va_end(args2); out: return; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ce4b054acee5..e7fe2b0d29b3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -48,7 +48,7 @@ #include <linux/fs.h> #include <linux/namei.h> #include <linux/mm.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/socket.h> @@ -210,12 +210,12 @@ struct audit_context { struct { uid_t uid; gid_t gid; - mode_t mode; + umode_t mode; u32 osid; int has_perm; uid_t perm_uid; gid_t perm_gid; - mode_t perm_mode; + umode_t perm_mode; unsigned long qbytes; } ipc; struct { @@ -234,7 +234,7 @@ struct audit_context { } mq_sendrecv; struct { int oflag; - mode_t mode; + umode_t mode; struct mq_attr attr; } mq_open; struct { @@ -308,7 +308,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) static int audit_match_filetype(struct audit_context *ctx, int which) { unsigned index = which & ~S_IFMT; - mode_t mode = which & S_IFMT; + umode_t mode = which & S_IFMT; if (unlikely(!ctx)) return 0; @@ -1249,7 +1249,7 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_IPC: { u32 osid = context->ipc.osid; - audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", + audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", context->ipc.uid, context->ipc.gid, context->ipc.mode); if (osid) { char *ctx = NULL; @@ -1267,7 +1267,7 @@ static void show_special(struct audit_context *context, int *call_panic) ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#o", + "qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, @@ -1278,7 +1278,7 @@ static void show_special(struct audit_context *context, int *call_panic) break; } case AUDIT_MQ_OPEN: { audit_log_format(ab, - "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", context->mq_open.oflag, context->mq_open.mode, context->mq_open.attr.mq_flags, @@ -1502,7 +1502,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (n->ino != (unsigned long)-1) { audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#o" + " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), @@ -2160,7 +2160,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) * @attr: queue attributes * */ -void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) +void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { struct audit_context *context = current->audit_context; @@ -2260,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) * * Called only after audit_ipc_obj(). */ -void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { struct audit_context *context = current->audit_context; diff --git a/kernel/capability.c b/kernel/capability.c index 74fb3b603045..0fcf1c14a297 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -10,7 +10,7 @@ #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d2b6ceea95d..a5d3b5325f77 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,7 +63,24 @@ #include <linux/atomic.h> +/* + * cgroup_mutex is the master lock. Any modification to cgroup or its + * hierarchy must be performed while holding it. + * + * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify + * cgroupfs_root of any cgroup hierarchy - subsys list, flags, + * release_agent_path and so on. Modifying requires both cgroup_mutex and + * cgroup_root_mutex. Readers can acquire either of the two. This is to + * break the following locking order cycle. + * + * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem + * B. namespace_sem -> cgroup_mutex + * + * B happens only through cgroup_show_options() and using cgroup_root_mutex + * breaks it. + */ static DEFINE_MUTEX(cgroup_mutex); +static DEFINE_MUTEX(cgroup_root_mutex); /* * Generate an array of cgroup subsystem pointers. At boot time, this is @@ -265,7 +282,7 @@ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); -static DEFINE_SPINLOCK(release_list_lock); +static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); @@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); * -> cgroup_mkdir. */ -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); @@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, struct cgroup *child); -static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) +static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; */ -DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); +static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) { @@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); + BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; @@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, return 0; } -static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) { - struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) @@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_root_mutex); return 0; } @@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* * If the 'all' option was specified select all the subsystems, - * otherwise 'all, 'none' and a subsystem name options were not - * specified, let's default to 'all' + * otherwise if 'none', 'name=' and a subsystem name options + * were not specified, let's default to 'all' */ - if (all_ss || (!all_ss && !one_ss && !opts->none)) { + if (all_ss || (!one_ss && !opts->none && !opts->name)) { for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss == NULL) @@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) out_unlock: kfree(opts.release_agent); kfree(opts.name); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; @@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *new_root; + struct inode *inode; /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); @@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* We used the new root structure, so this is a new hierarchy */ struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; - struct inode *inode; struct cgroupfs_root *existing_root; const struct cred *cred; int i; @@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); - if (strlen(root->name)) { - /* Check for name clashes with existing mounts */ - for_each_active_root(existing_root) { - if (!strcmp(existing_root->name, root->name)) { - ret = -EBUSY; - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } - } - } + /* Check for name clashes with existing mounts */ + ret = -EBUSY; + if (strlen(root->name)) + for_each_active_root(existing_root) + if (!strcmp(existing_root->name, root->name)) + goto unlock_drop; /* * We're accessing css_set_count without locking @@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * have some link structures left over */ ret = allocate_cg_links(css_set_count, &tmp_cg_links); - if (ret) { - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } + if (ret) + goto unlock_drop; ret = rebind_subsystems(root, root->subsys_bits); if (ret == -EBUSY) { - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); free_cg_links(&tmp_cg_links); - goto drop_new_super; + goto unlock_drop; } /* * There must be no failure case after here, since rebinding @@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, cred = override_creds(&init_cred); cgroup_populate_dir(root_cgrp); revert_creds(cred); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); } else { @@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, kfree(opts.name); return dget(sb->s_root); + unlock_drop: + mutex_unlock(&cgroup_root_mutex); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); drop_modules: @@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(!list_empty(&cgrp->sibling)); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ ret = rebind_subsystems(root, 0); @@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) { root_count--; } + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); kill_litter_super(sb); @@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) EXPORT_SYMBOL_GPL(cgroup_path); /* + * Control Group taskset + */ +struct task_and_cgroup { + struct task_struct *task; + struct cgroup *cgrp; +}; + +struct cgroup_taskset { + struct task_and_cgroup single; + struct flex_array *tc_array; + int tc_array_len; + int idx; + struct cgroup *cur_cgrp; +}; + +/** + * cgroup_taskset_first - reset taskset and return the first task + * @tset: taskset of interest + * + * @tset iteration is initialized and the first task is returned. + */ +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +{ + if (tset->tc_array) { + tset->idx = 0; + return cgroup_taskset_next(tset); + } else { + tset->cur_cgrp = tset->single.cgrp; + return tset->single.task; + } +} +EXPORT_SYMBOL_GPL(cgroup_taskset_first); + +/** + * cgroup_taskset_next - iterate to the next task in taskset + * @tset: taskset of interest + * + * Return the next task in @tset. Iteration must have been initialized + * with cgroup_taskset_first(). + */ +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +{ + struct task_and_cgroup *tc; + + if (!tset->tc_array || tset->idx >= tset->tc_array_len) + return NULL; + + tc = flex_array_get(tset->tc_array, tset->idx++); + tset->cur_cgrp = tc->cgrp; + return tc->task; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_next); + +/** + * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * @tset: taskset of interest + * + * Return the cgroup for the current (last returned) task of @tset. This + * function must be preceded by either cgroup_taskset_first() or + * cgroup_taskset_next(). + */ +struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +{ + return tset->cur_cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); + +/** + * cgroup_taskset_size - return the number of tasks in taskset + * @tset: taskset of interest + */ +int cgroup_taskset_size(struct cgroup_taskset *tset) +{ + return tset->tc_array ? tset->tc_array_len : 1; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_size); + + +/* * cgroup_task_migrate - move a task from one cgroup to another. * * 'guarantee' is set if the caller promises that a new css_set for the task * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Otherwise, it can only fail with -ESRCH. + * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. */ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct task_struct *tsk, bool guarantee) @@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct css_set *newcg; /* - * get old css_set. we need to take task_lock and refcount it, because - * an exiting task can change its css_set to init_css_set and drop its - * old one without taking cgroup_mutex. + * We are synchronized through threadgroup_lock() against PF_EXITING + * setting such that we can't race against cgroup_exit() changing the + * css_set to init_css_set and dropping the old one. */ - task_lock(tsk); + WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - get_css_set(oldcg); - task_unlock(tsk); /* locate or allocate a new css_set for this task. */ if (guarantee) { @@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, might_sleep(); /* find_css_set will give us newcg already referenced. */ newcg = find_css_set(oldcg, cgrp); - if (!newcg) { - put_css_set(oldcg); + if (!newcg) return -ENOMEM; - } } - put_css_set(oldcg); - /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - return -ESRCH; - } rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * @cgrp: the cgroup the task is attaching to * @tsk: the task to be attached * - * Call holding cgroup_mutex. May take task_lock of - * the task 'tsk' during call. + * Call with cgroup_mutex and threadgroup locked. May take task_lock of + * @tsk during call. */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { @@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; + struct cgroup_taskset tset = { }; + + /* @tsk either already exited or can't exit until the end */ + if (tsk->flags & PF_EXITING) + return -ESRCH; /* Nothing to do if the task is already in that cgroup */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) return 0; + tset.single.task = tsk; + tset.single.cgrp = oldcgrp; + for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } - if (ss->can_attach_task) { - retval = ss->can_attach_task(cgrp, tsk); - if (retval) { - failed_ss = ss; - goto out; - } - } } retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); @@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; for_each_subsys(root, ss) { - if (ss->pre_attach) - ss->pre_attach(cgrp); - if (ss->attach_task) - ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, &tset); } synchronize_rcu(); @@ -1884,7 +1967,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk); + ss->cancel_attach(ss, cgrp, &tset); } } return retval; @@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp, read_lock(&css_set_lock); newcg = find_existing_css_set(cg, cgrp, template); - if (newcg) - get_css_set(newcg); read_unlock(&css_set_lock); /* doesn't exist at all? */ if (!newcg) return false; /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) { - if (cg_entry->cg == newcg) { - put_css_set(newcg); + list_for_each_entry(cg_entry, newcg_list, links) + if (cg_entry->cg == newcg) return true; - } - } /* not found */ - put_css_set(newcg); return false; } @@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, * @cgrp: the cgroup to attach to * @leader: the threadgroup leader task_struct of the group to be attached * - * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will - * take task_lock of each thread in leader's threadgroup individually in turn. + * Call holding cgroup_mutex and the group_rwsem of the leader. Will take + * task_lock of each thread in leader's threadgroup individually in turn. */ -int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; - bool cancel_failed_ss = false; /* guaranteed to be initialized later, but the compiler needs this */ - struct cgroup *oldcgrp = NULL; struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; + struct task_and_cgroup *tc; struct flex_array *group; + struct cgroup_taskset tset = { }; /* * we need to make sure we have css_sets for all the tasks we're * going to move -before- we actually start moving them, so that in @@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 0: in order to do expensive, possibly blocking operations for * every thread, we cannot iterate the thread group list, since it needs * rcu or tasklist locked. instead, build an array of all threads in the - * group - threadgroup_fork_lock prevents new threads from appearing, - * and if threads exit, this will just be an over-estimate. + * group - group_rwsem prevents new threads from appearing, and if + * threads exit, this will just be an over-estimate. */ group_size = get_nr_threads(leader); /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(struct task_struct *), group_size, - GFP_KERNEL); + group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ @@ -2027,7 +2103,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) goto out_free_group_list; /* prevent changes to the threadgroup list while we take a snapshot. */ - rcu_read_lock(); + read_lock(&tasklist_lock); if (!thread_group_leader(leader)) { /* * a race with de_thread from another thread's exec() may strip @@ -2036,53 +2112,57 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * throw this task away and try again (from cgroup_procs_write); * this is "double-double-toil-and-trouble-check locking". */ - rcu_read_unlock(); + read_unlock(&tasklist_lock); retval = -EAGAIN; goto out_free_group_list; } - /* take a reference on each task in the group to go in the array. */ + tsk = leader; i = 0; do { + struct task_and_cgroup ent; + + /* @tsk either already exited or can't exit until the end */ + if (tsk->flags & PF_EXITING) + continue; + /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - get_task_struct(tsk); /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. */ - retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + ent.task = tsk; + ent.cgrp = task_cgroup_from_root(tsk, root); + /* nothing to do if this task is already in the cgroup */ + if (ent.cgrp == cgrp) + continue; + retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; - rcu_read_unlock(); + tset.tc_array = group; + tset.tc_array_len = group_size; + read_unlock(&tasklist_lock); + + /* methods shouldn't be called if no task is actually migrating */ + retval = 0; + if (!group_size) + goto out_free_group_list; /* * step 1: check that we can legitimately attach to the cgroup. */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, leader); + retval = ss->can_attach(ss, cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; } } - /* a callback to be run on every thread in the threadgroup. */ - if (ss->can_attach_task) { - /* run on each task in the threadgroup. */ - for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - retval = ss->can_attach_task(cgrp, tsk); - if (retval) { - failed_ss = ss; - cancel_failed_ss = true; - goto out_cancel_attach; - } - } - } } /* @@ -2091,69 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - /* nothing to do if this task is already in the cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - continue; - /* get old css_set pointer */ - task_lock(tsk); - if (tsk->flags & PF_EXITING) { - /* ignore this task if it's going away */ - task_unlock(tsk); - continue; - } - oldcg = tsk->cgroups; - get_css_set(oldcg); - task_unlock(tsk); - /* see if the new one for us is already in the list? */ - if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { - /* was already there, nothing to do. */ - put_css_set(oldcg); - } else { - /* we don't already have it. get new one. */ + tc = flex_array_get(group, i); + oldcg = tc->task->cgroups; + + /* if we don't already have it in the list get a new one */ + if (!css_set_check_fetched(cgrp, tc->task, oldcg, + &newcg_list)) { retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - put_css_set(oldcg); if (retval) goto out_list_teardown; } } /* - * step 3: now that we're guaranteed success wrt the css_sets, proceed - * to move all tasks to the new cgroup, calling ss->attach_task for each - * one along the way. there are no failure cases after here, so this is - * the commit point. + * step 3: now that we're guaranteed success wrt the css_sets, + * proceed to move all tasks to the new cgroup. There are no + * failure cases after here, so this is the commit point. */ - for_each_subsys(root, ss) { - if (ss->pre_attach) - ss->pre_attach(cgrp); - } for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - /* leave current thread as it is if it's already there */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - continue; - /* attach each task to each subsystem */ - for_each_subsys(root, ss) { - if (ss->attach_task) - ss->attach_task(cgrp, tsk); - } - /* if the thread is PF_EXITING, it can just get skipped. */ - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); - BUG_ON(retval != 0 && retval != -ESRCH); + tc = flex_array_get(group, i); + retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); + BUG_ON(retval); } /* nothing is sensitive to fork() after this point. */ /* - * step 4: do expensive, non-thread-specific subsystem callbacks. - * TODO: if ever a subsystem needs to know the oldcgrp for each task - * being moved, this call will need to be reworked to communicate that. + * step 4: do subsystem attach callbacks. */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, leader); + ss->attach(ss, cgrp, &tset); } /* @@ -2173,20 +2220,12 @@ out_cancel_attach: /* same deal as in cgroup_attach_task */ if (retval) { for_each_subsys(root, ss) { - if (ss == failed_ss) { - if (cancel_failed_ss && ss->cancel_attach) - ss->cancel_attach(ss, cgrp, leader); + if (ss == failed_ss) break; - } if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, leader); + ss->cancel_attach(ss, cgrp, &tset); } } - /* clean up the array of referenced threads in the group. */ - for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - put_task_struct(tsk); - } out_free_group_list: flex_array_free(group); return retval; @@ -2194,8 +2233,8 @@ out_free_group_list: /* * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will take - * cgroup_mutex; may take task_lock of task. + * function to attach either it or all tasks in its threadgroup. Will lock + * cgroup_mutex and threadgroup; may take task_lock of task. */ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { @@ -2222,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) * detect it later. */ tsk = tsk->group_leader; - } else if (tsk->flags & PF_EXITING) { - /* optimization for the single-task-only case */ - rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; } - /* * even if we're attaching all tasks in the thread group, we * only need to check permissions on one of them. @@ -2251,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) get_task_struct(tsk); } - if (threadgroup) { - threadgroup_fork_write_lock(tsk); + threadgroup_lock(tsk); + + if (threadgroup) ret = cgroup_attach_proc(cgrp, tsk); - threadgroup_fork_write_unlock(tsk); - } else { + else ret = cgroup_attach_task(cgrp, tsk); - } + + threadgroup_unlock(tsk); + put_task_struct(tsk); cgroup_unlock(); return ret; @@ -2308,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + mutex_lock(&cgroup_root_mutex); strcpy(cgrp->root->release_agent_path, buffer); + mutex_unlock(&cgroup_root_mutex); cgroup_unlock(); return 0; } @@ -2587,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_create_file(struct dentry *dentry, mode_t mode, +static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { struct inode *inode; @@ -2628,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - mode_t mode) + umode_t mode) { struct dentry *parent; int error = 0; @@ -2655,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, * returns S_IRUGO if it has only a read handler * returns S_IWUSR if it has only a write hander */ -static mode_t cgroup_file_mode(const struct cftype *cft) +static umode_t cgroup_file_mode(const struct cftype *cft) { - mode_t mode = 0; + umode_t mode = 0; if (cft->mode) return cft->mode; @@ -2680,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp, struct dentry *dir = cgrp->dentry; struct dentry *dentry; int error; - mode_t mode; + umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { @@ -2791,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void) } void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) + __acquires(css_set_lock) { /* * The first time anyone tries to iterate across a cgroup, @@ -2830,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, } void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) + __releases(css_set_lock) { read_unlock(&css_set_lock); } @@ -3754,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) * Must be called with the mutex on the parent inode held */ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - mode_t mode) + umode_t mode) { struct cgroup *cgrp; struct cgroupfs_root *root = parent->root; @@ -3848,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return err; } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct cgroup *c_parent = dentry->d_parent->d_fsdata; @@ -4014,11 +4053,11 @@ again: finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ @@ -4493,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = { * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. + * it was not made under the protection of RCU, cgroup_mutex or + * threadgroup_change_begin(), so it might no longer be a valid + * cgroup pointer. cgroup_attach_task() might have already changed + * current->cgroups, allowing the previously referenced cgroup + * group to be removed and freed. + * + * Outside the pointer validity we also need to process the css_set + * inheritance between threadgoup_change_begin() and + * threadgoup_change_end(), this way there is no leak in any process + * wide migration performed by cgroup_attach_proc() that could otherwise + * miss a thread because it is too early or too late in the fork stage. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { - task_lock(current); + /* + * We don't need to task_lock() current because current->cgroups + * can't be changed concurrently here. The parent obviously hasn't + * exited and called cgroup_exit(), and we are synchronized against + * cgroup migration through threadgroup_change_begin(). + */ child->cgroups = current->cgroups; get_css_set(child->cgroups); - task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } @@ -4548,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); - task_lock(child); - if (list_empty(&child->cg_list)) + if (list_empty(&child->cg_list)) { + /* + * It's safe to use child->cgroups without task_lock() + * here because we are protected through + * threadgroup_change_begin() against concurrent + * css_set change in cgroup_task_migrate(). Also + * the task can't exit at that point until + * wake_up_new_task() is called, so we are protected + * against cgroup_exit() setting child->cgroup to + * init_css_set. + */ list_add(&child->cg_list, &child->cgroups->tasks); - task_unlock(child); + } write_unlock(&css_set_lock); } } @@ -4671,13 +4730,13 @@ static void check_for_release(struct cgroup *cgrp) * already queued for a userspace notification, queue * it now */ int need_schedule_work = 0; - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); if (need_schedule_work) schedule_work(&release_agent_work); } @@ -4729,7 +4788,7 @@ static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); mutex_lock(&cgroup_mutex); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; @@ -4738,7 +4797,7 @@ static void cgroup_release_agent(struct work_struct *work) struct cgroup, release_list); list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!pathbuf) goto continue_free; @@ -4768,9 +4827,9 @@ static void cgroup_release_agent(struct work_struct *work) continue_free: kfree(pathbuf); kfree(agentbuf); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } @@ -4880,9 +4939,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4908,10 +4967,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4926,9 +4985,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - spin_lock(&ss->id_lock); + write_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - spin_unlock(&ss->id_lock); + write_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4940,7 +4999,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - spin_lock_init(&ss->id_lock); + rwlock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5035,9 +5094,9 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - spin_lock(&ss->id_lock); + read_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - spin_unlock(&ss->id_lock); + read_unlock(&ss->id_lock); if (!tmp) break; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e691818d7e45..fc0646b78a64 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -14,7 +14,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fs.h> @@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -static inline int __cgroup_freezing_or_frozen(struct task_struct *task) +bool cgroup_freezing(struct task_struct *task) { - enum freezer_state state = task_freezer(task)->state; - return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); -} + enum freezer_state state; + bool ret; -int cgroup_freezing_or_frozen(struct task_struct *task) -{ - int result; - task_lock(task); - result = __cgroup_freezing_or_frozen(task); - task_unlock(task); - return result; + rcu_read_lock(); + state = task_freezer(task)->state; + ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; + rcu_read_unlock(); + + return ret; } /* @@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys; * freezer_can_attach(): * cgroup_mutex (held by caller of can_attach) * - * cgroup_freezing_or_frozen(): - * task->alloc_lock (to get task's cgroup) - * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): * freezer->lock * sighand->siglock (if the cgroup is freezing) @@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys; * write_lock css_set_lock (cgroup iterator start) * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) + * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, @@ -150,7 +145,18 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, static void freezer_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup) { - kfree(cgroup_freezer(cgroup)); + struct freezer *freezer = cgroup_freezer(cgroup); + + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); + kfree(freezer); +} + +/* task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); } /* @@ -160,13 +166,17 @@ static void freezer_destroy(struct cgroup_subsys *ss, */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct cgroup_taskset *tset) { struct freezer *freezer; + struct task_struct *task; /* * Anything frozen can't move or be moved to/from. */ + cgroup_taskset_for_each(task, new_cgroup, tset) + if (cgroup_freezing(task)) + return -EBUSY; freezer = cgroup_freezer(new_cgroup); if (freezer->state != CGROUP_THAWED) @@ -175,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - rcu_read_lock(); - if (__cgroup_freezing_or_frozen(tsk)) { - rcu_read_unlock(); - return -EBUSY; - } - rcu_read_unlock(); - return 0; -} - static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; @@ -213,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) /* Locking avoids race with FREEZING -> THAWED transitions. */ if (freezer->state == CGROUP_FREEZING) - freeze_task(task, true); + freeze_task(task); spin_unlock_irq(&freezer->lock); } @@ -231,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (frozen(task)) + if (freezing(task) && is_task_frozen_enough(task)) nfrozen++; } @@ -279,12 +278,11 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; unsigned int num_cant_freeze_now = 0; - freezer->state = CGROUP_FREEZING; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task, true)) + if (!freeze_task(task)) continue; - if (frozen(task)) + if (is_task_frozen_enough(task)) continue; if (!freezing(task) && !freezer_should_skip(task)) num_cant_freeze_now++; @@ -300,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - thaw_process(task); - } + while ((task = cgroup_iter_next(cgroup, &it))) + __thaw_task(task); cgroup_iter_end(cgroup, &it); - - freezer->state = CGROUP_THAWED; } static int freezer_change_state(struct cgroup *cgroup, @@ -319,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); update_if_frozen(cgroup, freezer); - if (goal_state == freezer->state) - goto out; switch (goal_state) { case CGROUP_THAWED: + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); + freezer->state = CGROUP_THAWED; unfreeze_cgroup(cgroup, freezer); break; case CGROUP_FROZEN: + if (freezer->state == CGROUP_THAWED) + atomic_inc(&system_freezing_cnt); + freezer->state = CGROUP_FREEZING; retval = try_to_freeze_cgroup(cgroup, freezer); break; default: BUG(); } -out: + spin_unlock_irq(&freezer->lock); return retval; @@ -381,10 +380,5 @@ struct cgroup_subsys freezer_subsys = { .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, - .can_attach_task = freezer_can_attach_task, - .pre_attach = NULL, - .attach_task = NULL, - .attach = NULL, .fork = freezer_fork, - .exit = NULL, }; diff --git a/kernel/compat.c b/kernel/compat.c index e2435ee9993a..f346cedfe24d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include <linux/unistd.h> #include <linux/security.h> #include <linux/timex.h> +#include <linux/export.h> #include <linux/migrate.h> #include <linux/posix-timers.h> #include <linux/times.h> diff --git a/kernel/cpu.c b/kernel/cpu.c index 12b7458f23b1..2060c6e57027 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,11 +10,12 @@ #include <linux/sched.h> #include <linux/unistd.h> #include <linux/cpu.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> #include <linux/gfp.h> +#include <linux/suspend.h> #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -177,8 +178,7 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (!cputime_eq(p->utime, cputime_zero) || - !cputime_eq(p->stime, cputime_zero))) + (p->utime || p->stime)) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " "(state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, @@ -379,6 +379,7 @@ out: cpu_maps_update_done(); return err; } +EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; @@ -469,13 +470,86 @@ out: cpu_maps_update_done(); } -static int alloc_frozen_cpus(void) +static int __init alloc_frozen_cpus(void) { if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) return -ENOMEM; return 0; } core_initcall(alloc_frozen_cpus); + +/* + * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU + * hotplug when tasks are about to be frozen. Also, don't allow the freezer + * to continue until any currently running CPU hotplug operation gets + * completed. + * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the + * 'cpu_add_remove_lock'. And this same lock is also taken by the regular + * CPU hotplug path and released only after it is complete. Thus, we + * (and hence the freezer) will block here until any currently running CPU + * hotplug operation gets completed. + */ +void cpu_hotplug_disable_before_freeze(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + + +/* + * When tasks have been thawed, re-enable regular CPU hotplug (which had been + * disabled while beginning to freeze tasks). + */ +void cpu_hotplug_enable_after_thaw(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + +/* + * When callbacks for CPU hotplug notifications are being executed, we must + * ensure that the state of the system with respect to the tasks being frozen + * or not, as reported by the notification, remains unchanged *throughout the + * duration* of the execution of the callbacks. + * Hence we need to prevent the freezer from racing with regular CPU hotplug. + * + * This synchronization is implemented by mutually excluding regular CPU + * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ + * Hibernate notifications. + */ +static int +cpu_hotplug_pm_callback(struct notifier_block *nb, + unsigned long action, void *ptr) +{ + switch (action) { + + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + cpu_hotplug_disable_before_freeze(); + break; + + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + cpu_hotplug_enable_after_thaw(); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + + +static int __init cpu_hotplug_pm_sync_init(void) +{ + pm_notifier(cpu_hotplug_pm_callback, 0); + return 0; +} +core_initcall(cpu_hotplug_pm_sync_init); + #endif /* CONFIG_PM_SLEEP_SMP */ /** diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c new file mode 100644 index 000000000000..249152e15308 --- /dev/null +++ b/kernel/cpu_pm.c @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2011 Google, Inc. + * + * Author: + * Colin Cross <ccross@android.com> + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/cpu_pm.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/spinlock.h> +#include <linux/syscore_ops.h> + +static DEFINE_RWLOCK(cpu_pm_notifier_lock); +static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); + +static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) +{ + int ret; + + ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, + nr_to_call, nr_calls); + + return notifier_to_errno(ret); +} + +/** + * cpu_pm_register_notifier - register a driver with cpu_pm + * @nb: notifier block to register + * + * Add a driver to a list of drivers that are notified about + * CPU and CPU cluster low power entry and exit. + * + * This function may sleep, and has the same return conditions as + * raw_notifier_chain_register. + */ +int cpu_pm_register_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + write_lock_irqsave(&cpu_pm_notifier_lock, flags); + ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb); + write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); + +/** + * cpu_pm_unregister_notifier - unregister a driver with cpu_pm + * @nb: notifier block to be unregistered + * + * Remove a driver from the CPU PM notifier list. + * + * This function may sleep, and has the same return conditions as + * raw_notifier_chain_unregister. + */ +int cpu_pm_unregister_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + write_lock_irqsave(&cpu_pm_notifier_lock, flags); + ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); + write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); + +/** + * cpm_pm_enter - CPU low power entry notifier + * + * Notifies listeners that a single CPU is entering a low power state that may + * cause some blocks in the same power domain as the cpu to reset. + * + * Must be called on the affected CPU with interrupts disabled. Platform is + * responsible for ensuring that cpu_pm_enter is not called twice on the same + * CPU before cpu_pm_exit is called. Notified drivers can include VFP + * co-processor, interrupt controller and it's PM extensions, local CPU + * timers context save/restore which shouldn't be interrupted. Hence it + * must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_pm_enter(void) +{ + int nr_calls; + int ret = 0; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); + if (ret) + /* + * Inform listeners (nr_calls - 1) about failure of CPU PM + * PM entry who are notified earlier to prepare for it. + */ + cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_enter); + +/** + * cpm_pm_exit - CPU low power exit notifier + * + * Notifies listeners that a single CPU is exiting a low power state that may + * have caused some blocks in the same power domain as the cpu to reset. + * + * Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_pm_exit(void) +{ + int ret; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_pm_exit); + +/** + * cpm_cluster_pm_enter - CPU cluster low power entry notifier + * + * Notifies listeners that all cpus in a power domain are entering a low power + * state that may cause some blocks in the same power domain to reset. + * + * Must be called after cpu_pm_enter has been called on all cpus in the power + * domain, and before cpu_pm_exit has been called on any cpu in the power + * domain. Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_cluster_pm_enter(void) +{ + int nr_calls; + int ret = 0; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); + if (ret) + /* + * Inform listeners (nr_calls - 1) about failure of CPU cluster + * PM entry who are notified earlier to prepare for it. + */ + cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); + +/** + * cpm_cluster_pm_exit - CPU cluster low power exit notifier + * + * Notifies listeners that all cpus in a power domain are exiting form a + * low power state that may have caused some blocks in the same power domain + * to reset. + * + * Must be called after cpu_pm_exit has been called on all cpus in the power + * domain, and before cpu_pm_exit has been called on any cpu in the power + * domain. Notified drivers can include VFP co-processor, interrupt controller + * and it's PM extensions, local CPU timers context save/restore which + * shouldn't be interrupted. Hence it must be called with interrupts disabled. + * + * Return conditions are same as __raw_notifier_call_chain. + */ +int cpu_cluster_pm_exit(void) +{ + int ret; + + read_lock(&cpu_pm_notifier_lock); + ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); + read_unlock(&cpu_pm_notifier_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); + +#ifdef CONFIG_PM +static int cpu_pm_suspend(void) +{ + int ret; + + ret = cpu_pm_enter(); + if (ret) + return ret; + + ret = cpu_cluster_pm_enter(); + return ret; +} + +static void cpu_pm_resume(void) +{ + cpu_cluster_pm_exit(); + cpu_pm_exit(); +} + +static struct syscore_ops cpu_pm_syscore_ops = { + .suspend = cpu_pm_suspend, + .resume = cpu_pm_resume, +}; + +static int cpu_pm_init(void) +{ + register_syscore_ops(&cpu_pm_syscore_ops); + return 0; +} +core_initcall(cpu_pm_init); +#endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10131fdaff70..a09ac2b9a661 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -37,7 +37,7 @@ #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/pagemap.h> @@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +#ifdef CONFIG_NUMA +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return task->mempolicy; +} +#else +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return false; +} +#endif + + /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, @@ -949,6 +962,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { + bool need_loop; + repeat: /* * Allow tasks that have access to memory reserves because they have @@ -960,10 +975,17 @@ repeat: return; task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing + * get_mems_allowed(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. + */ + need_loop = task_has_mempolicy(tsk) || + !nodes_intersects(*newmems, tsk->mems_allowed); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* * ensure checking ->mems_allowed_change_disable after setting all new * allowed nodes. @@ -982,7 +1004,7 @@ repeat: * Allocation of memory is very fast, we needn't sleep when waiting * for the read-side. */ - while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); @@ -1367,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk) -{ - struct cpuset *cs = cgroup_cs(cont); - - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; - - /* - * Kthreads bound to specific cpus cannot be moved to a new cpuset; we - * cannot change their cpu affinity and isolating such threads by their - * set of allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may - * be changed. - */ - if (tsk->flags & PF_THREAD_BOUND) - return -EINVAL; - - return 0; -} - -static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) -{ - return security_task_setscheduler(task); -} - /* * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in pre_attach, and they must - * persist among pre_attach, attach_task, and attach. + * dynamically allocating them is not allowed in can_attach, and they must + * persist until attach. */ static cpumask_var_t cpus_attach; static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; -/* Set-up work for before attaching each task. */ -static void cpuset_pre_attach(struct cgroup *cont) +/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = cgroup_cs(cgrp); + struct task_struct *task; + int ret; + + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + return -ENOSPC; + + cgroup_taskset_for_each(task, cgrp, tset) { + /* + * Kthreads bound to specific cpus cannot be moved to a new + * cpuset; we cannot change their cpu affinity and + * isolating such threads by their set of allowed nodes is + * unnecessary. Thus, cpusets are not applicable for such + * threads. This prevents checking for success of + * set_cpus_allowed_ptr() on all attached tasks before + * cpus_allowed may be changed. + */ + if (task->flags & PF_THREAD_BOUND) + return -EINVAL; + if ((ret = security_task_setscheduler(task))) + return ret; + } + /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); -} -/* Per-thread attachment work. */ -static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) -{ - int err; - struct cpuset *cs = cgroup_cs(cont); - - /* - * can_attach beforehand should guarantee that this doesn't fail. - * TODO: have a better way to handle failure here - */ - err = set_cpus_allowed_ptr(tsk, cpus_attach); - WARN_ON_ONCE(err); - - cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, tsk); + return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { struct mm_struct *mm; - struct cpuset *cs = cgroup_cs(cont); - struct cpuset *oldcs = cgroup_cs(oldcont); + struct task_struct *task; + struct task_struct *leader = cgroup_taskset_first(tset); + struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *oldcs = cgroup_cs(oldcgrp); + + cgroup_taskset_for_each(task, cgrp, tset) { + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset_update_task_spread_flag(cs, task); + } /* * Change mm, possibly for multiple threads in a threadgroup. This is @@ -1447,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, */ cpuset_attach_nodemask_from = oldcs->mems_allowed; cpuset_attach_nodemask_to = cs->mems_allowed; - mm = get_task_mm(tsk); + mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) @@ -1903,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, - .can_attach_task = cpuset_can_attach_task, - .pre_attach = cpuset_pre_attach, - .attach_task = cpuset_attach_task, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 5f85690285d4..c766ee54c0b1 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -2,7 +2,7 @@ #include <linux/crash_dump.h> #include <linux/init.h> #include <linux/errno.h> -#include <linux/module.h> +#include <linux/export.h> /* * If we have booted due to a crash, max_pfn will be a very low value. We need @@ -20,8 +20,15 @@ unsigned long saved_max_pfn; unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; /* + * stores the size of elf header of crash image + */ +unsigned long long elfcorehdr_size; + +/* * elfcorehdr= specifies the location of elf core header stored by the crashed * kernel. This option will be passed by kexec loader to the capture kernel. + * + * Syntax: elfcorehdr=[size[KMG]@]offset[KMG] */ static int __init setup_elfcorehdr(char *arg) { @@ -29,6 +36,10 @@ static int __init setup_elfcorehdr(char *arg) if (!arg) return -EINVAL; elfcorehdr_addr = memparse(arg, &end); + if (*end == '@') { + elfcorehdr_size = elfcorehdr_addr; + elfcorehdr_addr = memparse(end + 1, &end); + } return end > arg ? 0 : -EINVAL; } early_param("elfcorehdr", setup_elfcorehdr); diff --git a/kernel/cred.c b/kernel/cred.c index 8ef31f53c44c..5791612a4045 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> @@ -644,6 +644,9 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif const struct cred *old; struct cred *new; @@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + kmem_cache_free(cred_jar, new); + return NULL; + } +#endif + kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - atomic_inc(&init_tgcred.usage); - new->tgcred = &init_tgcred; + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = NULL; + new->tgcred = tgcred; new->request_key_auth = NULL; new->thread_keyring = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 34872482315e..c22d8c28ad84 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len) /* Pack in hex chars */ for (i = 0; i < wcount; i++) - bufptr = pack_hex_byte(bufptr, s[i]); + bufptr = hex_byte_pack(bufptr, s[i]); *bufptr = '\0'; /* Move up */ @@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count) if (err) return NULL; while (count > 0) { - buf = pack_hex_byte(buf, *tmp); + buf = hex_byte_pack(buf, *tmp); tmp++; count--; } @@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id) limit = id + (BUF_THREAD_ID_SIZE / 2); while (id < limit) { if (!lzero || *id != 0) { - pkt = pack_hex_byte(pkt, *id); + pkt = hex_byte_pack(pkt, *id); lzero = 0; } id++; } if (lzero) - pkt = pack_hex_byte(pkt, 0); + pkt = hex_byte_pack(pkt, 0); return pkt; } @@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) dbg_remove_all_break(); remcom_out_buffer[0] = 'S'; - pack_hex_byte(&remcom_out_buffer[1], ks->signo); + hex_byte_pack(&remcom_out_buffer[1], ks->signo); } static void gdb_get_regs_helper(struct kgdb_state *ks) @@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks) /* Reply to host that an exception has occurred */ ptr = remcom_out_buffer; *ptr++ = 'T'; - ptr = pack_hex_byte(ptr, ks->signo); + ptr = hex_byte_pack(ptr, ks->signo); ptr += strlen(strcpy(ptr, "thread:")); int_to_threadref(thref, shadow_pid(current->pid)); ptr = pack_threadid(ptr, thref); diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index d9ca9aa481ec..8b68ce78ff17 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -11,6 +11,7 @@ #include <linux/kgdb.h> #include <linux/kdb.h> #include <linux/kdebug.h> +#include <linux/export.h> #include "kdb_private.h" #include "../debug_core.h" diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 63786e71a3cd..e2ae7349437f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4d ", module_refcount(mod)); + kdb_printf("%4ld ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 5532dd37aa86..7d6fb40d2188 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p) (p->exit_state & EXIT_ZOMBIE) ? 'Z' : (p->exit_state & EXIT_DEAD) ? 'E' : (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; - if (p->pid == 0) { + if (is_idle_task(p)) { /* Idle task. Is it really idle, apart from the kdb * interrupt? */ if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { diff --git a/kernel/dma.c b/kernel/dma.c index f903189c5304..68a2306522c8 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -9,7 +9,7 @@ * [It also happened to remove the sizeof(char *) == sizeof(int) * assumption introduced because of those /proc/dma patches. -- Hennus] */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/spinlock.h> diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 89e5e8aa4c36..22d901f9caf4 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y := core.o ring_buffer.o +obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c new file mode 100644 index 000000000000..057e24b665cf --- /dev/null +++ b/kernel/events/callchain.c @@ -0,0 +1,191 @@ +/* + * Performance events callchain code, extracted from core.c: + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * For licensing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/slab.h> +#include "internal.h" + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +static struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index 0f857782d06f..a8f4ac001a00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING */ @@ -25,6 +25,7 @@ #include <linux/reboot.h> #include <linux/vmstat.h> #include <linux/device.h> +#include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/rculist.h> @@ -127,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key perf_sched_events __read_mostly; +struct jump_label_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -184,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); + void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -1126,6 +1130,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; ctx->nr_active--; + if (event->attr.freq && event->attr.sample_freq) + ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -1321,6 +1327,7 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, @@ -1402,6 +1409,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; + if (event->attr.freq && event->attr.sample_freq) + ctx->nr_freq++; if (event->attr.exclusive) cpuctx->exclusive = 1; @@ -1658,8 +1667,7 @@ retry: * Note: this works for group members as well as group leaders * since the non-leader members' sibling_lists will be empty. */ -static void __perf_event_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) +static void __perf_event_mark_enabled(struct perf_event *event) { struct perf_event *sub; u64 tstamp = perf_event_time(event); @@ -1697,7 +1705,7 @@ static int __perf_event_enable(void *info) */ perf_cgroup_set_timestamp(current, ctx); - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); if (!event_filter_match(event)) { if (is_cgroup_event(event)) @@ -1778,7 +1786,7 @@ void perf_event_enable(struct perf_event *event) retry: if (!ctx->is_active) { - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); goto out; } @@ -1805,6 +1813,7 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_enable); int perf_event_refresh(struct perf_event *event, int refresh) { @@ -2170,9 +2179,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + if (ctx->nr_events) + cpuctx->task_ctx = ctx; - cpuctx->task_ctx = ctx; + perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); @@ -2322,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) u64 interrupts, now; s64 delta; + if (!ctx->nr_freq) + return; + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2377,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0, remove = 1, freq = 0; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; + if (cpuctx->ctx.nr_freq) + freq = 1; } ctx = cpuctx->task_ctx; @@ -2390,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; + if (ctx->nr_freq) + freq = 1; } + if (!rotate && !freq) + goto done; + perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - if (!rotate) - goto done; + if (freq) { + perf_ctx_adjust_freq(&cpuctx->ctx, interval); + if (ctx) + perf_ctx_adjust_freq(ctx, interval); + } - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (rotate) { + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_event_sched_in(cpuctx, ctx, current); + } - perf_event_sched_in(cpuctx, ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: if (remove) list_del_init(&cpuctx->rotation_list); - - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } void perf_event_task_tick(void) @@ -2443,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); return 1; } @@ -2475,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + list_for_each_entry(event, &ctx->event_list, event_entry) { ret = event_enable_on_exec(event, ctx); if (ret) enabled = 1; @@ -2569,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event) } /* - * Callchain support - */ - -struct callchain_cpus_entries { - struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; -}; - -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); -static atomic_t nr_callchain_events; -static DEFINE_MUTEX(callchain_mutex); -struct callchain_cpus_entries *callchain_cpus_entries; - - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static void release_callchain_buffers_rcu(struct rcu_head *head) -{ - struct callchain_cpus_entries *entries; - int cpu; - - entries = container_of(head, struct callchain_cpus_entries, rcu_head); - - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - - kfree(entries); -} - -static void release_callchain_buffers(void) -{ - struct callchain_cpus_entries *entries; - - entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); - call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); -} - -static int alloc_callchain_buffers(void) -{ - int cpu; - int size; - struct callchain_cpus_entries *entries; - - /* - * We can't use the percpu allocation API for data that can be - * accessed from NMI. Use a temporary manual per cpu allocation - * until that gets sorted out. - */ - size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); - - entries = kzalloc(size, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; - - for_each_possible_cpu(cpu) { - entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, - cpu_to_node(cpu)); - if (!entries->cpu_entries[cpu]) - goto fail; - } - - rcu_assign_pointer(callchain_cpus_entries, entries); - - return 0; - -fail: - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - kfree(entries); - - return -ENOMEM; -} - -static int get_callchain_buffers(void) -{ - int err = 0; - int count; - - mutex_lock(&callchain_mutex); - - count = atomic_inc_return(&nr_callchain_events); - if (WARN_ON_ONCE(count < 1)) { - err = -EINVAL; - goto exit; - } - - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); -exit: - mutex_unlock(&callchain_mutex); - - return err; -} - -static void put_callchain_buffers(void) -{ - if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { - release_callchain_buffers(); - mutex_unlock(&callchain_mutex); - } -} - -static int get_recursion_context(int *recursion) -{ - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (recursion[rctx]) - return -1; - - recursion[rctx]++; - barrier(); - - return rctx; -} - -static inline void put_recursion_context(int *recursion, int rctx) -{ - barrier(); - recursion[rctx]--; -} - -static struct perf_callchain_entry *get_callchain_entry(int *rctx) -{ - int cpu; - struct callchain_cpus_entries *entries; - - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); - if (*rctx == -1) - return NULL; - - entries = rcu_dereference(callchain_cpus_entries); - if (!entries) - return NULL; - - cpu = smp_processor_id(); - - return &entries->cpu_entries[cpu][*rctx]; -} - -static void -put_callchain_entry(int rctx) -{ - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); -} - -static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - int rctx; - struct perf_callchain_entry *entry; - - - entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - - if (!entry) - goto exit_put; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - -exit_put: - put_callchain_entry(rctx); - - return entry; -} - -/* * Initialize the perf_event context in a task_struct: */ static void __perf_event_init_context(struct perf_event_context *ctx) @@ -2941,7 +2748,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_sched_events); + jump_label_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2952,7 +2759,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); + jump_label_dec_deferred(&perf_sched_events); } } @@ -3189,12 +2996,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) struct ring_buffer *rb; unsigned int events = POLL_HUP; + /* + * Race between perf_event_set_output() and perf_poll(): perf_poll() + * grabs the rb reference but perf_event_set_output() overrides it. + * Here is the timeline for two threads T1, T2: + * t0: T1, rb = rcu_dereference(event->rb) + * t1: T2, old_rb = event->rb + * t2: T2, event->rb = new rb + * t3: T2, ring_buffer_detach(old_rb) + * t4: T1, ring_buffer_attach(rb1) + * t5: T1, poll_wait(event->waitq) + * + * To avoid this problem, we grab mmap_mutex in perf_poll() + * thereby ensuring that the assignment of the new ring buffer + * and the detachment of the old buffer appear atomic to perf_poll() + */ + mutex_lock(&event->mmap_mutex); + rcu_read_lock(); rb = rcu_dereference(event->rb); - if (rb) + if (rb) { + ring_buffer_attach(event, rb); events = atomic_xchg(&rb->poll, 0); + } rcu_read_unlock(); + mutex_unlock(&event->mmap_mutex); + poll_wait(file, &event->waitq, wait); return events; @@ -3495,6 +3323,53 @@ unlock: return ret; } +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb) +{ + unsigned long flags; + + if (!list_empty(&event->rb_entry)) + return; + + spin_lock_irqsave(&rb->event_lock, flags); + if (!list_empty(&event->rb_entry)) + goto unlock; + + list_add(&event->rb_entry, &rb->event_list); +unlock: + spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_detach(struct perf_event *event, + struct ring_buffer *rb) +{ + unsigned long flags; + + if (list_empty(&event->rb_entry)) + return; + + spin_lock_irqsave(&rb->event_lock, flags); + list_del_init(&event->rb_entry); + wake_up_all(&event->waitq); + spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_wakeup(struct perf_event *event) +{ + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) + wake_up_all(&event->waitq); + +unlock: + rcu_read_unlock(); +} + static void rb_free_rcu(struct rcu_head *rcu_head) { struct ring_buffer *rb; @@ -3520,9 +3395,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) static void ring_buffer_put(struct ring_buffer *rb) { + struct perf_event *event, *n; + unsigned long flags; + if (!atomic_dec_and_test(&rb->refcount)) return; + spin_lock_irqsave(&rb->event_lock, flags); + list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { + list_del_init(&event->rb_entry); + wake_up_all(&event->waitq); + } + spin_unlock_irqrestore(&rb->event_lock, flags); + call_rcu(&rb->rcu_head, rb_free_rcu); } @@ -3543,8 +3428,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct ring_buffer *rb = event->rb; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->mmap_locked; + vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); mutex_unlock(&event->mmap_mutex); ring_buffer_put(rb); @@ -3624,7 +3510,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; + locked = vma->vm_mm->pinned_vm + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -3650,7 +3536,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; event->mmap_user = get_current_user(); - vma->vm_mm->locked_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += event->mmap_locked; unlock: if (!ret) @@ -3699,7 +3585,7 @@ static const struct file_operations perf_fops = { void perf_event_wakeup(struct perf_event *event) { - wake_up_all(&event->waitq); + ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(&event->fasync, SIGIO, event->pending_kill); @@ -4736,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct hw_perf_event *hwc = &event->hw; int throttle = 0; - data->period = event->hw.last_period; if (!overflow) overflow = perf_swevent_set_period(event); @@ -4770,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!is_sampling_event(event)) return; + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { + data->period = nr; + return perf_swevent_overflow(event, 1, data, regs); + } else + data->period = event->hw.last_period; + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); @@ -5282,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && current->pid == 0)) + if (!(event->attr.exclude_idle && is_idle_task(current))) if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } @@ -5758,6 +5649,7 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + event->pmu = pmu; ret = pmu->event_init(event); if (ret) pmu = ERR_PTR(ret); @@ -5765,6 +5657,7 @@ struct pmu *perf_init_event(struct perf_event *event) } list_for_each_entry_rcu(pmu, &pmus, entry) { + event->pmu = pmu; ret = pmu->event_init(event); if (!ret) goto unlock; @@ -5819,6 +5712,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->rb_entry); + init_waitqueue_head(&event->waitq); init_irq_work(&event->pending, perf_pending_event); @@ -5891,11 +5786,9 @@ done: return ERR_PTR(err); } - event->pmu = pmu; - if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events); + jump_label_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6027,6 +5920,8 @@ set: old_rb = event->rb; rcu_assign_pointer(event->rb, rb); + if (old_rb) + ring_buffer_detach(event, old_rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); @@ -6131,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); + jump_label_inc(&perf_sched_events.key); } /* @@ -6977,6 +6872,9 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); + + /* do not patch jump label more than once per second */ + jump_label_rate_limit(&perf_sched_events, HZ); } static int __init perf_event_sysfs_init(void) @@ -7043,10 +6941,13 @@ static int __perf_cgroup_move(void *info) return 0; } -static void -perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - task_function_call(task, __perf_cgroup_move, task); + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) + task_function_call(task, __perf_cgroup_move, task); } static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, @@ -7060,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, if (!(task->flags & PF_EXITING)) return; - perf_cgroup_attach_task(cgrp, task); + task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_subsys = { @@ -7069,6 +6970,6 @@ struct cgroup_subsys perf_subsys = { .create = perf_cgroup_create, .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, - .attach_task = perf_cgroup_attach_task, + .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd8116c..b0b107f90afc 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -1,6 +1,10 @@ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H +#include <linux/hardirq.h> + +/* Buffer handling */ + #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { @@ -22,6 +26,9 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + /* poll crap */ + spinlock_t event_lock; + struct list_head event_list; struct perf_event_mmap_page *user_page; void *data_pages[0]; @@ -64,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb) } #endif -static unsigned long perf_data_size(struct ring_buffer *rb) +static inline unsigned long perf_data_size(struct ring_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } @@ -93,4 +100,37 @@ __output_copy(struct perf_output_handle *handle, } while (len); } +/* Callchain handling */ +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern int get_callchain_buffers(void); +extern void put_callchain_buffers(void); + +static inline int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + #endif /* _KERNEL_EVENTS_INTERNAL_H */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a2a29205cc0f..6ddaba43fb7a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING */ @@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->writable = 1; atomic_set(&rb->refcount, 1); + + INIT_LIST_HEAD(&rb->event_list); + spin_lock_init(&rb->event_lock); } #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/exit.c b/kernel/exit.c index 2913b3509d42..c44738267be7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,6 +51,7 @@ #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> +#include <linux/writeback.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -121,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->utime += tsk->utime; + sig->stime += tsk->stime; + sig->gtime += tsk->gtime; sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -679,10 +680,6 @@ static void exit_mm(struct task_struct * tsk) tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - /* We don't want this task to be frozen prematurely */ - clear_freeze_flag(tsk); - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); @@ -890,7 +887,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -1039,9 +1036,12 @@ NORET_TYPE void do_exit(long code) validate_creds_for_do_exit(tsk); preempt_disable(); + if (tsk->nr_dirtied) + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; + tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; @@ -1257,19 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; - psig->cutime = - cputime_add(psig->cutime, - cputime_add(tgutime, - sig->cutime)); - psig->cstime = - cputime_add(psig->cstime, - cputime_add(tgstime, - sig->cstime)); - psig->cgtime = - cputime_add(psig->cgtime, - cputime_add(p->gtime, - cputime_add(sig->gtime, - sig->cgtime))); + psig->cutime += tgutime + sig->cutime; + psig->cstime += tgstime + sig->cstime; + psig->cgtime += p->gtime + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += @@ -1542,8 +1532,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* dead body doesn't have much to contribute */ - if (p->exit_state == EXIT_DEAD) + if (unlikely(p->exit_state == EXIT_DEAD)) { + /* + * But do not ignore this task until the tracer does + * wait_task_zombie()->do_notify_parent(). + */ + if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + wo->notask_error = 0; return 0; + } /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { diff --git a/kernel/fork.c b/kernel/fork.c index 8e6b6f4fb272..443f5125f11e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include <trace/events/sched.h> +#define CREATE_TRACE_POINTS +#include <trace/events/task.h> + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -162,7 +165,6 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { - prop_local_destroy_single(&tsk->dirties); account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); @@ -274,10 +276,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack = ti; - err = prop_local_init_single(&tsk->dirties); - if (err) - goto out; - setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -501,7 +499,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); - atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -816,8 +813,6 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; - if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -980,7 +975,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sched_autogroup_fork(sig); #ifdef CONFIG_CGROUPS - init_rwsem(&sig->threadgroup_fork_lock); + init_rwsem(&sig->group_rwsem); #endif sig->oom_adj = current->signal->oom_adj; @@ -1000,7 +995,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; - clear_freeze_flag(p); } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) @@ -1031,8 +1025,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) */ static void posix_cpu_timers_init(struct task_struct *tsk) { - tsk->cputime_expires.prof_exp = cputime_zero; - tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.prof_exp = 0; + tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; INIT_LIST_HEAD(&tsk->cpu_timers[0]); INIT_LIST_HEAD(&tsk->cpu_timers[1]); @@ -1140,14 +1134,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, init_sigpending(&p->pending); - p->utime = cputime_zero; - p->stime = cputime_zero; - p->gtime = cputime_zero; - p->utimescaled = cputime_zero; - p->stimescaled = cputime_zero; + p->utime = p->stime = p->gtime = 0; + p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = cputime_zero; - p->prev_stime = cputime_zero; + p->prev_utime = p->prev_stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); @@ -1166,7 +1156,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_lock(current); + threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1302,6 +1292,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->pdeath_signal = 0; p->exit_state = 0; + p->nr_dirtied = 0; + p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + p->dirty_paused_when = 0; + /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. @@ -1378,8 +1372,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_unlock(current); + threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: @@ -1391,13 +1388,8 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) { - task_lock(p); - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&p->mm->oom_disable_count); - task_unlock(p); + if (p->mm) mmput(p->mm); - } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1418,7 +1410,7 @@ bad_fork_cleanup_policy: bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_unlock(current); + threadgroup_change_end(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); diff --git a/kernel/freezer.c b/kernel/freezer.c index 7b01de98bb6a..9815b8d1eed5 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -6,104 +6,117 @@ #include <linux/interrupt.h> #include <linux/suspend.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscalls.h> #include <linux/freezer.h> +#include <linux/kthread.h> -/* - * freezing is complete, mark current process as frozen +/* total number of freezing conditions in effect */ +atomic_t system_freezing_cnt = ATOMIC_INIT(0); +EXPORT_SYMBOL(system_freezing_cnt); + +/* indicate whether PM freezing is in effect, protected by pm_mutex */ +bool pm_freezing; +bool pm_nosig_freezing; + +/* protects freezing and frozen transitions */ +static DEFINE_SPINLOCK(freezer_lock); + +/** + * freezing_slow_path - slow path for testing whether a task needs to be frozen + * @p: task to be tested + * + * This function is called by freezing() if system_freezing_cnt isn't zero + * and tests whether @p needs to enter and stay in frozen state. Can be + * called under any context. The freezers are responsible for ensuring the + * target tasks see the updated state. */ -static inline void frozen_process(void) +bool freezing_slow_path(struct task_struct *p) { - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - smp_wmb(); - } - clear_freeze_flag(current); + if (p->flags & PF_NOFREEZE) + return false; + + if (pm_nosig_freezing || cgroup_freezing(p)) + return true; + + if (pm_freezing && !(p->flags & PF_KTHREAD)) + return true; + + return false; } +EXPORT_SYMBOL(freezing_slow_path); /* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) +bool __refrigerator(bool check_kthr_stop) { /* Hmm, should we be allowed to suspend when there are realtime processes around? */ - long save; + bool was_frozen = false; + long save = current->state; - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); - return; - } - save = current->state; pr_debug("%s entered refrigerator\n", current->comm); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - /* prevent accounting of that task to load */ - current->flags |= PF_FREEZING; - for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) + + spin_lock_irq(&freezer_lock); + current->flags |= PF_FROZEN; + if (!freezing(current) || + (check_kthr_stop && kthread_should_stop())) + current->flags &= ~PF_FROZEN; + spin_unlock_irq(&freezer_lock); + + if (!(current->flags & PF_FROZEN)) break; + was_frozen = true; schedule(); } - /* Remove the accounting blocker */ - current->flags &= ~PF_FREEZING; - pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); + + /* + * Restore saved task state before returning. The mb'd version + * needs to be used; otherwise, it might silently break + * synchronization which depends on ordered task state change. + */ + set_current_state(save); + + return was_frozen; } -EXPORT_SYMBOL(refrigerator); +EXPORT_SYMBOL(__refrigerator); static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + if (lock_task_sighand(p, &flags)) { + signal_wake_up(p, 0); + unlock_task_sighand(p, &flags); + } } /** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * freeze_task - send a freeze request to given task + * @p: task to send the request to + * + * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE + * flag and either sending a fake signal to it or waking it up, depending + * on whether it has %PF_FREEZER_NOSIG set. * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. + * RETURNS: + * %false, if @p is not freezing or already frozen; %true, otherwise */ -bool freeze_task(struct task_struct *p, bool sig_only) +bool freeze_task(struct task_struct *p) { - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. - */ - if (!freezing(p)) { - smp_rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; + unsigned long flags; + + spin_lock_irqsave(&freezer_lock, flags); + if (!freezing(p) || frozen(p)) { + spin_unlock_irqrestore(&freezer_lock, flags); + return false; } - if (should_send_signal(p)) { + if (!(p->flags & PF_KTHREAD)) { fake_signal_wake_up(p); /* * fake_signal_wake_up() goes through p's scheduler @@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only) * TASK_RUNNING transition can't race with task state * testing in try_to_freeze_tasks(). */ - } else if (sig_only) { - return false; } else { wake_up_state(p, TASK_INTERRUPTIBLE); } + spin_unlock_irqrestore(&freezer_lock, flags); return true; } -void cancel_freezing(struct task_struct *p) +void __thaw_task(struct task_struct *p) { unsigned long flags; - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - -static int __thaw_process(struct task_struct *p) -{ - if (frozen(p)) { - p->flags &= ~PF_FROZEN; - return 1; - } - clear_freeze_flag(p); - return 0; + /* + * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to + * be visible to @p as waking up implies wmb. Waking up inside + * freezer_lock also prevents wakeups from leaking outside + * refrigerator. + */ + spin_lock_irqsave(&freezer_lock, flags); + if (frozen(p)) + wake_up_process(p); + spin_unlock_irqrestore(&freezer_lock, flags); } -/* - * Wake up a frozen process +/** + * set_freezable - make %current freezable * - * task_lock() is needed to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. + * Mark %current freezable and enter refrigerator if necessary. */ -int thaw_process(struct task_struct *p) +bool set_freezable(void) { - task_lock(p); - if (__thaw_process(p) == 1) { - task_unlock(p); - wake_up_process(p); - return 1; - } - task_unlock(p); - return 0; + might_sleep(); + + /* + * Modify flags while holding freezer_lock. This ensures the + * freezer notices that we aren't frozen yet or the freezing + * condition is visible to try_to_freeze() below. + */ + spin_lock_irq(&freezer_lock); + current->flags &= ~PF_NOFREEZE; + spin_unlock_irq(&freezer_lock); + + return try_to_freeze(); } -EXPORT_SYMBOL(thaw_process); +EXPORT_SYMBOL(set_freezable); diff --git a/kernel/futex.c b/kernel/futex.c index 11cbe052b2e8..1614be20173d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -55,7 +55,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/magic.h> #include <linux/pid.h> #include <linux/nsproxy.h> @@ -314,17 +314,29 @@ again: #endif lock_page(page_head); + + /* + * If page_head->mapping is NULL, then it cannot be a PageAnon + * page; but it might be the ZERO_PAGE or in the gate area or + * in a special mapping (all cases which we are happy to fail); + * or it may have been a good file page when get_user_pages_fast + * found it, but truncated or holepunched or subjected to + * invalidate_complete_page2 before we got the page lock (also + * cases which we are happy to fail). And we hold a reference, + * so refcount care in invalidate_complete_page's remove_mapping + * prevents drop_caches from setting mapping to NULL beneath us. + * + * The case we do have to guard against is when memory pressure made + * shmem_writepage move it from filecache to swapcache beneath us: + * an unlikely race, but we do need to retry for page_head->mapping. + */ if (!page_head->mapping) { + int shmem_swizzled = PageSwapCache(page_head); unlock_page(page_head); put_page(page_head); - /* - * ZERO_PAGE pages don't have a mapping. Avoid a busy loop - * trying to find one. RW mapping would have COW'd (and thus - * have a mapping) so this page is RO and won't ever change. - */ - if ((page_head == ZERO_PAGE(address))) - return -EFAULT; - goto again; + if (shmem_swizzled) + goto again; + return -EFAULT; } /* @@ -854,7 +866,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) { struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; - u32 curval, newval; + u32 uninitialized_var(curval), newval; if (!pi_state) return -EINVAL; @@ -916,7 +928,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) static int unlock_futex_pi(u32 __user *uaddr, u32 uval) { - u32 oldval; + u32 uninitialized_var(oldval); /* * There is no waiter, so we unlock the futex. The owner died @@ -1576,7 +1588,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; struct task_struct *oldowner = pi_state->owner; - u32 uval, curval, newval; + u32 uval, uninitialized_var(curval), newval; int ret; /* Owner died? */ @@ -1793,7 +1805,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * * Returns: * 0 - uaddr contains val and hb has been locked - * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) @@ -2481,7 +2493,7 @@ err_unlock: */ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval, mval; + u32 uval, uninitialized_var(nval), mval; retry: if (get_user(uval, uaddr)) diff --git a/kernel/groups.c b/kernel/groups.c index 1cc476d52dd3..99b53d1eb7ea 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -2,7 +2,7 @@ * Supplementary group IDs */ #include <linux/cred.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/syscalls.h> diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a9205e32a059..ae34bf51682b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -32,7 +32,7 @@ */ #include <linux/cpu.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> @@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { + struct timerqueue_node *next_timer; if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; - if (&timer->node == timerqueue_getnext(&base->active)) { + next_timer = timerqueue_getnext(&base->active); + timerqueue_del(&base->active, &timer->node); + if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ if (reprogram && hrtimer_hres_active()) { @@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - timerqueue_del(&base->active, &timer->node); if (!timerqueue_getnext(&base->active)) base->cpu_base->active_bases &= ~(1 << base->index); out: diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ea640120ab86..2e48ec0c2e91 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -13,7 +13,7 @@ #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/lockdep.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sysctl.h> /* @@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) /* * Ensure the task is not frozen. - * Also, when a freshly created task is scheduled once, changes - * its state to TASK_UNINTERRUPTIBLE without having ever been - * switched out once, it musn't be checked. + * Also, skip vfork and any other user process that freezer should skip. */ - if (unlikely(t->flags & PF_FROZEN || !switch_count)) + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + + /* + * When a freshly created task is scheduled once, changes its state to + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it + * musn't be checked. + */ + if (unlikely(!switch_count)) return; if (switch_count != t->last_switch_count) { diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc5114b4c16c..f7c543a801d9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -26,7 +26,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; @@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip); int irq_set_irq_type(unsigned int irq, unsigned int type) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); int ret = 0; if (!desc) @@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type); int irq_set_handler_data(unsigned int irq, void *data) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; @@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data); int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; @@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) int irq_set_chip_data(unsigned int irq, void *data) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; @@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc) } } +void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) +{ + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); + cpumask_set_cpu(cpu, desc->percpu_enabled); +} + +void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) +{ + if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); + cpumask_clear_cpu(cpu, desc->percpu_enabled); +} + static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) @@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } +/** + * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Per CPU interrupts on SMP machines without locking requirements. Same as + * handle_percpu_irq() above but with the following extras: + * + * action->percpu_dev_id is a pointer to percpu variables which + * contain the real device id for the cpu on which this handler is + * called + */ +void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + void *dev_id = __this_cpu_ptr(action->percpu_dev_id); + irqreturn_t res; + + kstat_incr_irqs_this_cpu(irq, desc); + + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, dev_id); + trace_irq_handler_exit(irq, action, res); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} + void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return; @@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index e38544dddb18..c89295a8f668 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -6,6 +6,7 @@ #include <linux/io.h> #include <linux/irq.h> #include <linux/slab.h> +#include <linux/export.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/syscore_ops.h> @@ -211,6 +212,7 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, } return gc; } +EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); /* * Separate lockdep class for interrupt chip which can nest irq_desc @@ -258,6 +260,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, } gc->irq_cnt = i - gc->irq_base; } +EXPORT_SYMBOL_GPL(irq_setup_generic_chip); /** * irq_setup_alt_chip - Switch to alternative chip @@ -281,6 +284,7 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type) } return -EINVAL; } +EXPORT_SYMBOL_GPL(irq_setup_alt_chip); /** * irq_remove_generic_chip - Remove a chip @@ -311,6 +315,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, irq_modify_status(i, clr, set); } } +EXPORT_SYMBOL_GPL(irq_remove_generic_chip); #ifdef CONFIG_PM static int irq_gc_suspend(void) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6546431447d7..b7952316016a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,7 +15,7 @@ #define istate core_internal_state__do_not_mess_with_it -extern int noirqdebug; +extern bool noirqdebug; /* * Bits used by threaded handlers: @@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); +extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); +extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); @@ -114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +#define _IRQ_DESC_CHECK (1 << 0) +#define _IRQ_DESC_PERCPU (1 << 1) + +#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) +#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) + struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check); void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); static inline struct irq_desc * -irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) { - return __irq_get_desc_lock(irq, flags, true); + return __irq_get_desc_lock(irq, flags, true, check); } static inline void @@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) } static inline struct irq_desc * -irq_get_desc_lock(unsigned int irq, unsigned long *flags) +irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) { - return __irq_get_desc_lock(irq, flags, false); + return __irq_get_desc_lock(irq, flags, false, check); } static inline void diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 039b889ea053..d86e254b95eb 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -9,7 +9,7 @@ */ #include <linux/irq.h> #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/radix-tree.h> @@ -424,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset) } struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { + if (check & _IRQ_DESC_CHECK) { + if ((check & _IRQ_DESC_PERCPU) && + !irq_settings_is_per_cpu_devid(desc)) + return NULL; + + if (!(check & _IRQ_DESC_PERCPU) && + irq_settings_is_per_cpu_devid(desc)) + return NULL; + } + if (bus) chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, *flags); @@ -443,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } +int irq_set_percpu_devid(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return -EINVAL; + + if (desc->percpu_enabled) + return -EINVAL; + + desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); + + if (!desc->percpu_enabled) + return -ENOMEM; + + irq_set_percpu_devid_flags(irq); + return 0; +} + /** * dynamic_irq_cleanup - cleanup a dynamically allocated irq * @irq: irq number to initialize diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b57a3776de44..1f9e26526b69 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -20,15 +20,15 @@ static DEFINE_MUTEX(irq_domain_mutex); void irq_domain_add(struct irq_domain *domain) { struct irq_data *d; - int hwirq; + int hwirq, irq; /* * This assumes that the irq_domain owner has already allocated * the irq_descs. This block will be removed when support for dynamic * allocation of irq_descs is added to irq_domain. */ - for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { - d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); + irq_domain_for_each_irq(domain, hwirq, irq) { + d = irq_get_irq_data(irq); if (!d) { WARN(1, "error: assigning domain to non existant irq_desc"); return; @@ -54,15 +54,15 @@ void irq_domain_add(struct irq_domain *domain) void irq_domain_del(struct irq_domain *domain) { struct irq_data *d; - int hwirq; + int hwirq, irq; mutex_lock(&irq_domain_mutex); list_del(&domain->list); mutex_unlock(&irq_domain_mutex); /* Clear the irq_domain assignments */ - for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { - d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); + irq_domain_for_each_irq(domain, hwirq, irq) { + d = irq_get_irq_data(irq); d->domain = NULL; } } @@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, return -EINVAL; if (intsize < 1) return -EINVAL; + if (d->nr_irq && ((intspec[0] < d->hwirq_base) || + (intspec[0] >= d->hwirq_base + d->nr_irq))) + return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; @@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, return 0; } -struct irq_domain_ops irq_domain_simple_ops = { - .dt_translate = irq_domain_simple_dt_translate, -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - /** * irq_domain_create_simple() - Set up a 'simple' translation range */ @@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match, } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif /* CONFIG_OF_IRQ */ + +struct irq_domain_ops irq_domain_simple_ops = { +#ifdef CONFIG_OF_IRQ + .dt_translate = irq_domain_simple_dt_translate, +#endif /* CONFIG_OF_IRQ */ +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9b956fa20308..a9a9dbe49fea 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; @@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) static int __disable_irq_nosync(unsigned int irq) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; @@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) void enable_irq(unsigned int irq) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return; @@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; + if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) + return 0; + if (desc->irq_data.chip->irq_set_wake) ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); @@ -488,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) int irq_set_irq_wake(unsigned int irq, unsigned int on) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); int ret = 0; if (!desc) @@ -529,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake); int can_request_irq(unsigned int irq, unsigned long irqflags) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); int canrequest = 0; if (!desc) @@ -620,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) static int irq_wait_for_interrupt(struct irqaction *action) { + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { @@ -629,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action) return 0; } schedule(); + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return -1; } @@ -1118,6 +1124,8 @@ int setup_irq(unsigned int irq, struct irqaction *act) int retval; struct irq_desc *desc = irq_to_desc(irq); + if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return -EINVAL; chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); @@ -1126,7 +1134,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) } EXPORT_SYMBOL_GPL(setup_irq); - /* +/* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ @@ -1224,7 +1232,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) */ void remove_irq(unsigned int irq, struct irqaction *act) { - __free_irq(irq, act->dev_id); + struct irq_desc *desc = irq_to_desc(irq); + + if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) + __free_irq(irq, act->dev_id); } EXPORT_SYMBOL_GPL(remove_irq); @@ -1246,7 +1257,7 @@ void free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - if (!desc) + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; #ifdef CONFIG_SMP @@ -1281,7 +1292,7 @@ EXPORT_SYMBOL(free_irq); * and to set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device - * then you need to supply @handler and @thread_fn. @handler ist + * then you need to supply @handler and @thread_fn. @handler is * still called in hard interrupt context and has to check * whether the interrupt originates from the device. If yes it * needs to disable the interrupt on the device and return @@ -1324,7 +1335,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (!irq_settings_can_request(desc)) + if (!irq_settings_can_request(desc) || + WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; if (!handler) { @@ -1409,3 +1421,194 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, return !ret ? IRQC_IS_HARDIRQ : ret; } EXPORT_SYMBOL_GPL(request_any_context_irq); + +void enable_percpu_irq(unsigned int irq, unsigned int type) +{ + unsigned int cpu = smp_processor_id(); + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + + if (!desc) + return; + + type &= IRQ_TYPE_SENSE_MASK; + if (type != IRQ_TYPE_NONE) { + int ret; + + ret = __irq_set_trigger(desc, irq, type); + + if (ret) { + WARN(1, "failed to set type for IRQ%d\n", irq); + goto out; + } + } + + irq_percpu_enable(desc, cpu); +out: + irq_put_desc_unlock(desc, flags); +} + +void disable_percpu_irq(unsigned int irq) +{ + unsigned int cpu = smp_processor_id(); + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + + if (!desc) + return; + + irq_percpu_disable(desc, cpu); + irq_put_desc_unlock(desc, flags); +} + +/* + * Internal function to unregister a percpu irqaction. + */ +static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + unsigned long flags; + + WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); + + if (!desc) + return NULL; + + raw_spin_lock_irqsave(&desc->lock, flags); + + action = desc->action; + if (!action || action->percpu_dev_id != dev_id) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + goto bad; + } + + if (!cpumask_empty(desc->percpu_enabled)) { + WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", + irq, cpumask_first(desc->percpu_enabled)); + goto bad; + } + + /* Found it - now remove it from the list of entries: */ + desc->action = NULL; + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + unregister_handler_proc(irq, action); + + module_put(desc->owner); + return action; + +bad: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return NULL; +} + +/** + * remove_percpu_irq - free a per-cpu interrupt + * @irq: Interrupt line to free + * @act: irqaction for the interrupt + * + * Used to remove interrupts statically setup by the early boot process. + */ +void remove_percpu_irq(unsigned int irq, struct irqaction *act) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc && irq_settings_is_per_cpu_devid(desc)) + __free_percpu_irq(irq, act->percpu_dev_id); +} + +/** + * free_percpu_irq - free an interrupt allocated with request_percpu_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove a percpu interrupt handler. The handler is removed, but + * the interrupt line is not disabled. This must be done on each + * CPU before calling this function. The function does not return + * until any executing interrupts for this IRQ have completed. + * + * This function must not be called from interrupt context. + */ +void free_percpu_irq(unsigned int irq, void __percpu *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !irq_settings_is_per_cpu_devid(desc)) + return; + + chip_bus_lock(desc); + kfree(__free_percpu_irq(irq, dev_id)); + chip_bus_sync_unlock(desc); +} + +/** + * setup_percpu_irq - setup a per-cpu interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt + * + * Used to statically setup per-cpu interrupts in the early boot process. + */ +int setup_percpu_irq(unsigned int irq, struct irqaction *act) +{ + struct irq_desc *desc = irq_to_desc(irq); + int retval; + + if (!desc || !irq_settings_is_per_cpu_devid(desc)) + return -EINVAL; + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; +} + +/** + * request_percpu_irq - allocate a percpu interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @devname: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources, but doesn't + * automatically enable the interrupt. It has to be done on each + * CPU using enable_percpu_irq(). + * + * Dev_id must be globally unique. It is a per-cpu variable, and + * the handler gets called with the interrupted CPU's instance of + * that variable. + */ +int request_percpu_irq(unsigned int irq, irq_handler_t handler, + const char *devname, void __percpu *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + int retval; + + if (!dev_id) + return -EINVAL; + + desc = irq_to_desc(irq); + if (!desc || !irq_settings_can_request(desc) || + !irq_settings_is_per_cpu_devid(desc)) + return -EINVAL; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; + action->name = devname; + action->percpu_dev_id = dev_id; + + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, action); + chip_bus_sync_unlock(desc); + + if (retval) + kfree(action); + + return retval; +} diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f76fc00c9877..15e53b1766a6 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -9,6 +9,7 @@ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/syscore_ops.h> #include "internals.h" @@ -39,25 +40,58 @@ void suspend_device_irqs(void) } EXPORT_SYMBOL_GPL(suspend_device_irqs); -/** - * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() - * - * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQS_SUSPENDED flag set. - */ -void resume_device_irqs(void) +static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; + bool is_early = desc->action && + desc->action->flags & IRQF_EARLY_RESUME; + + if (is_early != want_early) + continue; raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); } } + +/** + * irq_pm_syscore_ops - enable interrupt lines early + * + * Enable all interrupt lines with %IRQF_EARLY_RESUME set. + */ +static void irq_pm_syscore_resume(void) +{ + resume_irqs(true); +} + +static struct syscore_ops irq_pm_syscore_ops = { + .resume = irq_pm_syscore_resume, +}; + +static int __init irq_pm_init_ops(void) +{ + register_syscore_ops(&irq_pm_syscore_ops); + return 0; +} + +device_initcall(irq_pm_init_ops); + +/** + * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() + * + * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously + * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag + * set as well as those with %IRQF_FORCE_RESUME. + */ +void resume_device_irqs(void) +{ + resume_irqs(false); +} EXPORT_SYMBOL_GPL(resume_device_irqs); /** diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index f1667833d444..1162f1030f18 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -13,6 +13,7 @@ enum { _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, + _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -24,6 +25,7 @@ enum { #define IRQ_NOTHREAD GOT_YOU_MORON #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON +#define IRQ_PER_CPU_DEVID GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) return desc->status_use_accessors & _IRQ_PER_CPU; } +static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_PER_CPU_DEVID; +} + static inline void irq_settings_set_per_cpu(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_PER_CPU; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index aa57d5da18c1..611cd6003c45 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) */ action = desc->action; if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER) || !action->next) + (action->flags & __IRQF_TIMER) || + (action->handler(irq, action->dev_id) == IRQ_HANDLED) || + !action->next) goto out; /* Already running on another processor */ @@ -115,7 +117,7 @@ static int misrouted_irq(int irq) struct irq_desc *desc; int i, ok = 0; - if (atomic_inc_return(&irq_poll_active) == 1) + if (atomic_inc_return(&irq_poll_active) != 1) goto out; irq_poll_cpu = smp_processor_id(); @@ -323,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->irqs_unhandled = 0; } -int noirqdebug __read_mostly; +bool noirqdebug __read_mostly; int noirqdebug_setup(char *str) { diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c58fa7da8aef..c3c46c72046e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -6,9 +6,11 @@ */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/irq_work.h> +#include <linux/percpu.h> #include <linux/hardirq.h> +#include <asm/processor.h> /* * An entry can be in one of four states: @@ -17,54 +19,34 @@ * claimed NULL, 3 -> {pending} : claimed to be enqueued * pending next, 3 -> {busy} : queued, pending callback * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed - * - * We use the lower two bits of the next pointer to keep PENDING and BUSY - * flags. */ #define IRQ_WORK_PENDING 1UL #define IRQ_WORK_BUSY 2UL #define IRQ_WORK_FLAGS 3UL -static inline bool irq_work_is_set(struct irq_work *entry, int flags) -{ - return (unsigned long)entry->next & flags; -} - -static inline struct irq_work *irq_work_next(struct irq_work *entry) -{ - unsigned long next = (unsigned long)entry->next; - next &= ~IRQ_WORK_FLAGS; - return (struct irq_work *)next; -} - -static inline struct irq_work *next_flags(struct irq_work *entry, int flags) -{ - unsigned long next = (unsigned long)entry; - next |= flags; - return (struct irq_work *)next; -} - -static DEFINE_PER_CPU(struct irq_work *, irq_work_list); +static DEFINE_PER_CPU(struct llist_head, irq_work_list); /* * Claim the entry so that no one else will poke at it. */ -static bool irq_work_claim(struct irq_work *entry) +static bool irq_work_claim(struct irq_work *work) { - struct irq_work *next, *nflags; + unsigned long flags, nflags; - do { - next = entry->next; - if ((unsigned long)next & IRQ_WORK_PENDING) + for (;;) { + flags = work->flags; + if (flags & IRQ_WORK_PENDING) return false; - nflags = next_flags(next, IRQ_WORK_FLAGS); - } while (cmpxchg(&entry->next, next, nflags) != next); + nflags = flags | IRQ_WORK_FLAGS; + if (cmpxchg(&work->flags, flags, nflags) == flags) + break; + cpu_relax(); + } return true; } - void __weak arch_irq_work_raise(void) { /* @@ -75,20 +57,15 @@ void __weak arch_irq_work_raise(void) /* * Queue the entry and raise the IPI if needed. */ -static void __irq_work_queue(struct irq_work *entry) +static void __irq_work_queue(struct irq_work *work) { - struct irq_work *next; + bool empty; preempt_disable(); - do { - next = __this_cpu_read(irq_work_list); - /* Can assign non-atomic because we keep the flags set. */ - entry->next = next_flags(next, IRQ_WORK_FLAGS); - } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); - + empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); /* The list was empty, raise self-interrupt to start processing. */ - if (!irq_work_next(entry)) + if (empty) arch_irq_work_raise(); preempt_enable(); @@ -100,16 +77,16 @@ static void __irq_work_queue(struct irq_work *entry) * * Can be re-enqueued while the callback is still in progress. */ -bool irq_work_queue(struct irq_work *entry) +bool irq_work_queue(struct irq_work *work) { - if (!irq_work_claim(entry)) { + if (!irq_work_claim(work)) { /* * Already enqueued, can't do! */ return false; } - __irq_work_queue(entry); + __irq_work_queue(work); return true; } EXPORT_SYMBOL_GPL(irq_work_queue); @@ -120,34 +97,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue); */ void irq_work_run(void) { - struct irq_work *list; + struct irq_work *work; + struct llist_head *this_list; + struct llist_node *llnode; - if (this_cpu_read(irq_work_list) == NULL) + this_list = &__get_cpu_var(irq_work_list); + if (llist_empty(this_list)) return; BUG_ON(!in_irq()); BUG_ON(!irqs_disabled()); - list = this_cpu_xchg(irq_work_list, NULL); - - while (list != NULL) { - struct irq_work *entry = list; + llnode = llist_del_all(this_list); + while (llnode != NULL) { + work = llist_entry(llnode, struct irq_work, llnode); - list = irq_work_next(list); + llnode = llist_next(llnode); /* - * Clear the PENDING bit, after this point the @entry + * Clear the PENDING bit, after this point the @work * can be re-used. */ - entry->next = next_flags(NULL, IRQ_WORK_BUSY); - entry->func(entry); + work->flags = IRQ_WORK_BUSY; + work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. */ - (void)cmpxchg(&entry->next, - next_flags(NULL, IRQ_WORK_BUSY), - NULL); + (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); } } EXPORT_SYMBOL_GPL(irq_work_run); @@ -156,11 +133,11 @@ EXPORT_SYMBOL_GPL(irq_work_run); * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. */ -void irq_work_sync(struct irq_work *entry) +void irq_work_sync(struct irq_work *work) { WARN_ON_ONCE(irqs_disabled()); - while (irq_work_is_set(entry, IRQ_WORK_BUSY)) + while (work->flags & IRQ_WORK_BUSY) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/itimer.c b/kernel/itimer.c index d802883153da..22000c3db0dd 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, cval = it->expires; cinterval = it->incr; - if (!cputime_eq(cval, cputime_zero)) { + if (cval) { struct task_cputime cputime; cputime_t t; thread_group_cputimer(tsk, &cputime); if (clock_id == CPUCLOCK_PROF) - t = cputime_add(cputime.utime, cputime.stime); + t = cputime.utime + cputime.stime; else /* CPUCLOCK_VIRT */ t = cputime.utime; - if (cputime_le(cval, t)) + if (cval < t) /* about to fire */ cval = cputime_one_jiffy; else - cval = cputime_sub(cval, t); + cval = cval - t; } spin_unlock_irq(&tsk->sighand->siglock); @@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, cval = it->expires; cinterval = it->incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, cputime_one_jiffy); + if (cval || nval) { + if (nval > 0) + nval += cputime_one_jiffy; set_process_cpu_timer(tsk, clock_id, &nval, &cval); } it->expires = nval; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a8ce45097f3d..01d3b70fc98a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -66,19 +66,53 @@ void jump_label_inc(struct jump_label_key *key) return; jump_label_lock(); - if (atomic_add_return(1, &key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) jump_label_update(key, JUMP_LABEL_ENABLE); + atomic_inc(&key->enabled); jump_label_unlock(); } +EXPORT_SYMBOL_GPL(jump_label_inc); -void jump_label_dec(struct jump_label_key *key) +static void __jump_label_dec(struct jump_label_key *key, + unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) return; - jump_label_update(key, JUMP_LABEL_DISABLE); + if (rate_limit) { + atomic_inc(&key->enabled); + schedule_delayed_work(work, rate_limit); + } else + jump_label_update(key, JUMP_LABEL_DISABLE); + jump_label_unlock(); } +EXPORT_SYMBOL_GPL(jump_label_dec); + +static void jump_label_update_timeout(struct work_struct *work) +{ + struct jump_label_key_deferred *key = + container_of(work, struct jump_label_key_deferred, work.work); + __jump_label_dec(&key->key, 0, NULL); +} + +void jump_label_dec(struct jump_label_key *key) +{ + __jump_label_dec(key, 0, NULL); +} + +void jump_label_dec_deferred(struct jump_label_key_deferred *key) +{ + __jump_label_dec(&key->key, key->timeout, &key->work); +} + + +void jump_label_rate_limit(struct jump_label_key_deferred *key, + unsigned long rl) +{ + key->timeout = rl; + INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); +} static int addr_conflict(struct jump_entry *entry, void *start, void *end) { @@ -104,6 +138,18 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, return 0; } +/* + * Update code which is definitely not currently executing. + * Architectures which need heavyweight synchronization to modify + * running code can override this to make the non-live update case + * cheaper. + */ +void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) +{ + arch_jump_label_transform(entry, type); +} + static void __jump_label_update(struct jump_label_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) @@ -121,14 +167,7 @@ static void __jump_label_update(struct jump_label_key *key, } } -/* - * Not all archs need this. - */ -void __weak arch_jump_label_text_poke_early(jump_label_t addr) -{ -} - -static __init int jump_label_init(void) +void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; @@ -139,22 +178,22 @@ static __init int jump_label_init(void) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - arch_jump_label_text_poke_early(iter->code); - if (iter->key == (jump_label_t)(unsigned long)key) + struct jump_label_key *iterk; + + iterk = (struct jump_label_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? + JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + if (iterk == key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; - atomic_set(&key->enabled, 0); + key = iterk; key->entries = iter; #ifdef CONFIG_MODULES key->next = NULL; #endif } jump_label_unlock(); - - return 0; } -early_initcall(jump_label_init); #ifdef CONFIG_MODULES @@ -211,8 +250,13 @@ void jump_label_apply_nops(struct module *mod) if (iter_start == iter_stop) return; - for (iter = iter_start; iter < iter_stop; iter++) - arch_jump_label_text_poke_early(iter->code); + for (iter = iter_start; iter < iter_stop; iter++) { + struct jump_label_key *iterk; + + iterk = (struct jump_label_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? + JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + } } static int jump_label_add_module(struct module *mod) @@ -252,8 +296,7 @@ static int jump_label_add_module(struct module *mod) key->next = jlm; if (jump_label_enabled(key)) - __jump_label_update(key, iter, iter_stop, - JUMP_LABEL_ENABLE); + __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } return 0; diff --git a/kernel/kexec.c b/kernel/kexec.c index 296fbc84d659..7b0886786701 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@ #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> -#include <linux/kmsg_dump.h> #include <linux/syscore_ops.h> #include <asm/page.h> @@ -498,7 +497,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, while (hole_end <= crashk_res.end) { unsigned long i; - if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; if (hole_end > crashk_res.end) break; @@ -999,6 +998,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, kimage_free(xchg(&kexec_crash_image, NULL)); result = kimage_crash_alloc(&image, entry, nr_segments, segments); + crash_map_reserved_pages(); } if (result) goto out; @@ -1015,6 +1015,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, goto out; } kimage_terminate(image); + if (flags & KEXEC_ON_CRASH) + crash_unmap_reserved_pages(); } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); @@ -1026,6 +1028,18 @@ out: return result; } +/* + * Add and remove page tables for crashkernel memory + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +void __weak crash_map_reserved_pages(void) +{} + +void __weak crash_unmap_reserved_pages(void) +{} + #ifdef CONFIG_COMPAT asmlinkage long compat_sys_kexec_load(unsigned long entry, unsigned long nr_segments, @@ -1079,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) if (kexec_crash_image) { struct pt_regs fixed_regs; - kmsg_dump(KMSG_DUMP_KEXEC); - crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); @@ -1117,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + unsigned long old_size; + struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1126,23 +1140,37 @@ int crash_shrink_memory(unsigned long new_size) } start = crashk_res.start; end = crashk_res.end; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } - if (new_size >= end - start + 1) { - ret = -EINVAL; - if (new_size == end - start + 1) - ret = 0; + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; goto unlock; } - start = roundup(start, PAGE_SIZE); - end = roundup(start + new_size, PAGE_SIZE); + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); + end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); + crash_map_reserved_pages(); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + crashk_res.end = end - 1; + insert_resource(&iomem_resource, ram_res); + crash_unmap_reserved_pages(); + unlock: mutex_unlock(&kexec_mutex); return ret; @@ -1380,24 +1408,23 @@ int __init parse_crashkernel(char *cmdline, } - -void crash_save_vmcoreinfo(void) +static void update_vmcoreinfo_note(void) { - u32 *buf; + u32 *buf = vmcoreinfo_note; if (!vmcoreinfo_size) return; - - vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); - - buf = (u32 *)vmcoreinfo_note; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, vmcoreinfo_size); - final_note(buf); } +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); + update_vmcoreinfo_note(); +} + void vmcoreinfo_append_str(const char *fmt, ...) { va_list args; @@ -1483,6 +1510,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_swapcache); arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); return 0; } @@ -1506,7 +1534,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { - mutex_lock(&pm_mutex); + lock_system_sleep(); pm_prepare_console(); error = freeze_processes(); if (error) { @@ -1559,7 +1587,7 @@ int kernel_kexec(void) thaw_processes(); Restore_console: pm_restore_console(); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } #endif diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 01a0700e873f..c744b88c44e2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -20,7 +20,7 @@ */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/log2.h> diff --git a/kernel/kmod.c b/kernel/kmod.c index ddc7644c1305..a0a88543934e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,6 +36,7 @@ #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> +#include <linux/rwsem.h> #include <asm/uaccess.h> #include <trace/events/module.h> @@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq; static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES @@ -114,10 +116,12 @@ int __request_module(bool wait, const char *fmt, ...) atomic_inc(&kmod_concurrent); if (atomic_read(&kmod_concurrent) > max_modprobes) { /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg++ < 5) + if (kmod_loop_msg < 5) { printk(KERN_ERR "request_module: runaway loop modprobe %s\n", module_name); + kmod_loop_msg++; + } atomic_dec(&kmod_concurrent); return -ENOMEM; } @@ -273,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work) * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write. */ static int usermodehelper_disabled = 1; @@ -280,17 +285,29 @@ static int usermodehelper_disabled = 1; static atomic_t running_helpers = ATOMIC_INIT(0); /* - * Wait queue head used by usermodehelper_pm_callback() to wait for all running + * Wait queue head used by usermodehelper_disable() to wait for all running * helpers to finish. */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_pm_callback() fails + * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) +void read_lock_usermodehelper(void) +{ + down_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_lock_usermodehelper); + +void read_unlock_usermodehelper(void) +{ + up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); + /** * usermodehelper_disable - prevent new helpers from being started */ @@ -298,8 +315,10 @@ int usermodehelper_disable(void) { long retval; + down_write(&umhelper_sem); usermodehelper_disabled = 1; - smp_mb(); + up_write(&umhelper_sem); + /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to @@ -312,7 +331,9 @@ int usermodehelper_disable(void) if (retval) return 0; + down_write(&umhelper_sem); usermodehelper_disabled = 0; + up_write(&umhelper_sem); return -EAGAIN; } @@ -321,7 +342,9 @@ int usermodehelper_disable(void) */ void usermodehelper_enable(void) { + down_write(&umhelper_sem); usermodehelper_disabled = 0; + up_write(&umhelper_sem); } /** diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b30fd54eb985..95dd7212e610 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -36,7 +36,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/stddef.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/moduleloader.h> #include <linux/kallsyms.h> #include <linux/freezer.h> @@ -78,10 +78,10 @@ static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned_in_smp; + raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; -static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); } @@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - spin_lock(&rp->lock); + raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); - spin_unlock(&rp->lock); + raw_spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); @@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, __acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spin_lock_irqsave(hlist_lock, *flags); } static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) __acquires(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_lock_irqsave(hlist_lock, *flags); } void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, @@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } static void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) __releases(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); ri->rp = rp; ri->task = current; @@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, kretprobe_table_unlock(hash, &flags); } else { rp->nmissed++; - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); } return 0; } @@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->maxactive = num_possible_cpus(); #endif } - spin_lock_init(&rp->lock); + raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -1959,7 +1959,7 @@ static int __init init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - spin_lock_init(&(kretprobe_table_locks[i].lock)); + raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* @@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3b053c04dd86..4e316e1acf58 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -11,10 +11,11 @@ #include <linux/kobject.h> #include <linux/string.h> #include <linux/sysfs.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/kexec.h> #include <linux/profile.h> +#include <linux/stat.h> #include <linux/sched.h> #include <linux/capability.h> diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ba7cccb4994..3d3de633702e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -12,7 +12,7 @@ #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> @@ -59,6 +59,31 @@ int kthread_should_stop(void) EXPORT_SYMBOL(kthread_should_stop); /** + * kthread_freezable_should_stop - should this freezable kthread return now? + * @was_frozen: optional out parameter, indicates whether %current was frozen + * + * kthread_should_stop() for freezable kthreads, which will enter + * refrigerator if necessary. This function is safe from kthread_stop() / + * freezer deadlock and freezable kthreads should use this function instead + * of calling try_to_freeze() directly. + */ +bool kthread_freezable_should_stop(bool *was_frozen) +{ + bool frozen = false; + + might_sleep(); + + if (unlikely(freezing(current))) + frozen = __refrigerator(true); + + if (was_frozen) + *was_frozen = frozen; + + return kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); + +/** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * @@ -257,7 +282,7 @@ int kthreadd(void *unused) set_cpus_allowed_ptr(tsk, cpu_all_mask); set_mems_allowed(node_states[N_HIGH_MEMORY]); - current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; + current->flags |= PF_NOFREEZE; for (;;) { set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 376066e10413..a462b317f9a0 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -53,12 +53,12 @@ #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/list.h> #include <linux/stacktrace.h> -static DEFINE_SPINLOCK(latency_lock); +static DEFINE_RAW_SPINLOCK(latency_lock); #define MAXLR 128 static struct latency_record latency_record[MAXLR]; @@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p) if (!latencytop_enabled) return; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void clear_global_latency_tracing(void) { unsigned long flags; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&latency_record, 0, sizeof(latency_record)); - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void __sched @@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.max = usecs; store_stacktrace(tsk, &lat); - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); account_global_scheduler_latency(tsk, &lat); @@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static int lstats_show(struct seq_file *m, void *v) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 91d67ce3a8d5..8889f7dd7c46 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -44,6 +44,7 @@ #include <linux/stringify.h> #include <linux/bitops.h> #include <linux/gfp.h> +#include <linux/kmemcheck.h> #include <asm/sections.h> @@ -96,8 +97,13 @@ static int graph_lock(void) static inline int graph_unlock(void) { - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) + if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { + /* + * The lockdep graph lock isn't locked while we expect it to + * be, we're confused now, bye! + */ return DEBUG_LOCKS_WARN_ON(1); + } current->lockdep_recursion--; arch_spin_unlock(&lockdep_lock); @@ -134,6 +140,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) { if (!hlock->class_idx) { + /* + * Someone passed in garbage, we give up. + */ DEBUG_LOCKS_WARN_ON(1); return NULL; } @@ -422,6 +431,7 @@ unsigned int max_lockdep_depth; * about it later on, in lockdep_info(). */ static int lockdep_init_error; +static const char *lock_init_error; static unsigned long lockdep_init_trace_data[20]; static struct stack_trace lockdep_init_trace = { .max_entries = ARRAY_SIZE(lockdep_init_trace_data), @@ -490,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static int __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; name = class->name; - if (!name) - name = __get_key_name(class->key, str); - - return printk("%s", name); -} - -static void print_lock_name(struct lock_class *class) -{ - char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; - const char *name; - - get_usage_chars(class, usage); - - name = class->name; if (!name) { name = __get_key_name(class->key, str); - printk(" (%s", name); + printk("%s", name); } else { - printk(" (%s", name); + printk("%s", name); if (class->name_version > 1) printk("#%d", class->name_version); if (class->subclass) printk("/%d", class->subclass); } +} + +static void print_lock_name(struct lock_class *class) +{ + char usage[LOCK_USAGE_CHARS]; + + get_usage_chars(class, usage); + + printk(" ("); + __print_lock_name(class); printk("){%s}", usage); } @@ -559,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr) } } -static void print_kernel_version(void) +static void print_kernel_ident(void) { - printk("%s %.*s\n", init_utsname()->release, + printk("%s %.*s %s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, + print_tainted()); } static int very_verbose(struct lock_class *class) @@ -647,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (unlikely(!lockdep_initialized)) { lockdep_init(); lockdep_init_error = 1; + lock_init_error = lock->name; save_stack_trace(&lockdep_init_trace); } #endif @@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) */ list_for_each_entry(class, hash_head, hash_entry) { if (class->key == key) { + /* + * Huh! same key, different name? Did someone trample + * on some memory? We're most confused. + */ WARN_ON_ONCE(class->name != lock->name); return class; } @@ -710,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) class = look_up_lock_class(lock, subclass); if (likely(class)) - return class; + goto out_set_class_cache; /* * Debug-check: all keys must be persistent! @@ -795,11 +807,16 @@ out_unlock_set: graph_unlock(); raw_local_irq_restore(flags); +out_set_class_cache: if (!subclass || force) lock->class_cache[0] = class; else if (subclass < NR_LOCKDEP_CACHING_CLASSES) lock->class_cache[subclass] = class; + /* + * Hash collision, did we smoke some? We found a class with a matching + * hash but the subclass -- which is hashed in -- didn't match. + */ if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -926,7 +943,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); + WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ lock->parent = parent; lock->class->dep_gen_id = lockdep_dependency_gen_id; } @@ -936,7 +953,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); + WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1129,10 +1146,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, if (debug_locks_silent) return 0; - printk("\n=======================================================\n"); - printk( "[ INFO: possible circular locking dependency detected ]\n"); - print_kernel_version(); - printk( "-------------------------------------------------------\n"); + printk("\n"); + printk("======================================================\n"); + printk("[ INFO: possible circular locking dependency detected ]\n"); + print_kernel_ident(); + printk("-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); @@ -1196,6 +1214,9 @@ static noinline int print_bfs_bug(int ret) if (!debug_locks_off_graph_unlock()) return 0; + /* + * Breadth-first-search failed, graph got corrupted? + */ WARN(1, "lockdep bfs error:%d\n", ret); return 0; @@ -1463,11 +1484,12 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n======================================================\n"); - printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + printk("\n"); + printk("======================================================\n"); + printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); - print_kernel_version(); - printk( "------------------------------------------------------\n"); + print_kernel_ident(); + printk("------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, @@ -1692,10 +1714,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=============================================\n"); - printk( "[ INFO: possible recursive locking detected ]\n"); - print_kernel_version(); - printk( "---------------------------------------------\n"); + printk("\n"); + printk("=============================================\n"); + printk("[ INFO: possible recursive locking detected ]\n"); + print_kernel_ident(); + printk("---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); @@ -1944,6 +1967,11 @@ out_bug: if (!debug_locks_off_graph_unlock()) return 0; + /* + * Clearly we all shouldn't be here, but since we made it we + * can reliable say we messed up our state. See the above two + * gotos for reasons why we could possibly end up here. + */ WARN_ON(1); return 0; @@ -1975,6 +2003,11 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct held_lock *hlock_curr, *hlock_next; int i, j; + /* + * We might need to take the graph lock, ensure we've got IRQs + * disabled to make this an IRQ-safe lock.. for recursion reasons + * lockdep won't complain about its own locking errors. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; /* @@ -2126,6 +2159,10 @@ static void check_chain_key(struct task_struct *curr) hlock = curr->held_locks + i; if (chain_key != hlock->prev_chain_key) { debug_locks_off(); + /* + * We got mighty confused, our chain keys don't match + * with what we expect, someone trample on our task state? + */ WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, @@ -2133,6 +2170,9 @@ static void check_chain_key(struct task_struct *curr) return; } id = hlock->class_idx - 1; + /* + * Whoops ran out of static storage again? + */ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return; @@ -2144,6 +2184,10 @@ static void check_chain_key(struct task_struct *curr) } if (chain_key != curr->curr_chain_key) { debug_locks_off(); + /* + * More smoking hash instead of calculating it, damn see these + * numbers float.. I bet that a pink elephant stepped on my memory. + */ WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, @@ -2177,10 +2221,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=================================\n"); - printk( "[ INFO: inconsistent lock state ]\n"); - print_kernel_version(); - printk( "---------------------------------\n"); + printk("\n"); + printk("=================================\n"); + printk("[ INFO: inconsistent lock state ]\n"); + print_kernel_ident(); + printk("---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); @@ -2241,10 +2286,11 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=========================================================\n"); - printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); - print_kernel_version(); - printk( "---------------------------------------------------------\n"); + printk("\n"); + printk("=========================================================\n"); + printk("[ INFO: possible irq lock inversion dependency detected ]\n"); + print_kernel_ident(); + printk("---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); @@ -2525,12 +2571,24 @@ void trace_hardirqs_on_caller(unsigned long ip) return; } + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; + /* + * See the fine text that goes along with this variable definition. + */ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; + /* + * Can't allow enabling interrupts while in an interrupt handler, + * that's general bad form and such. Recursion, limited stack etc.. + */ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; @@ -2558,6 +2616,10 @@ void trace_hardirqs_off_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * So we're supposed to get called after you mask local IRQs, but for + * some reason the hardware doesn't quite think you did a proper job. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2590,6 +2652,10 @@ void trace_softirqs_on(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * We fancy IRQs being disabled here, see softirq.c, avoids + * funny state and nesting things. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2626,6 +2692,9 @@ void trace_softirqs_off(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * We fancy IRQs being disabled here, see softirq.c + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2637,6 +2706,9 @@ void trace_softirqs_off(unsigned long ip) curr->softirq_disable_ip = ip; curr->softirq_disable_event = ++curr->irq_events; debug_atomic_inc(softirqs_off_events); + /* + * Whoops, we wanted softirqs off, so why aren't they? + */ DEBUG_LOCKS_WARN_ON(!softirq_count()); } else debug_atomic_inc(redundant_softirqs_off); @@ -2661,6 +2733,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (!(gfp_mask & __GFP_FS)) return; + /* + * Oi! Can't be having __GFP_FS allocations with IRQs disabled. + */ if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; @@ -2773,13 +2848,13 @@ static int separate_irq_context(struct task_struct *curr, return 0; } -#else +#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ static inline int mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - WARN_ON(1); + WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ return 1; } @@ -2799,7 +2874,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask) { } -#endif +#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ /* * Mark a lock with a usage bit, and validate the state transition: @@ -2874,12 +2949,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - memset(lock, 0, sizeof(*lock)); + int i; + + kmemcheck_mark_initialized(lock, sizeof(*lock)); + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); #endif + /* + * Can't be having no nameless bastards around this place! + */ if (DEBUG_LOCKS_WARN_ON(!name)) { lock->name = "NULL"; return; @@ -2887,6 +2970,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; + /* + * No key, no joy, we need to hash something. + */ if (DEBUG_LOCKS_WARN_ON(!key)) return; /* @@ -2894,6 +2980,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, */ if (!static_obj(key)) { printk("BUG: key %p not in .data!\n", key); + /* + * What it says above ^^^^^, I suggest you read it. + */ DEBUG_LOCKS_WARN_ON(1); return; } @@ -2932,6 +3021,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(!debug_locks)) return 0; + /* + * Lockdep should run with IRQs disabled, otherwise we could + * get an interrupt which would want to take locks, which would + * end up in lockdep and have you got a head-ache already? + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -2963,6 +3057,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * dependency checks are done) */ depth = curr->lockdep_depth; + /* + * Ran out of static storage for our per-task lock stack again have we? + */ if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; @@ -2981,6 +3078,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } hlock = curr->held_locks + depth; + /* + * Plain impossible, we just registered it and checked it weren't no + * NULL like.. I bet this mushroom I ate was good! + */ if (DEBUG_LOCKS_WARN_ON(!class)) return 0; hlock->class_idx = class_idx; @@ -3015,11 +3116,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ id = class - lock_classes; + /* + * Whoops, we did it again.. ran straight out of our static allocation. + */ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; if (!depth) { + /* + * How can we have a chain hash when we ain't got no keys?! + */ if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) return 0; chain_head = 1; @@ -3065,9 +3172,11 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n=====================================\n"); - printk( "[ BUG: bad unlock balance detected! ]\n"); - printk( "-------------------------------------\n"); + printk("\n"); + printk("=====================================\n"); + printk("[ BUG: bad unlock balance detected! ]\n"); + print_kernel_ident(); + printk("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3091,6 +3200,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, { if (unlikely(!debug_locks)) return 0; + /* + * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -3120,6 +3232,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) if (!class) return 0; + /* + * References, but not a lock we're actually ref-counting? + * State got messed up, follow the sites that change ->references + * and try to make sense of it. + */ if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; @@ -3142,6 +3259,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, int i; depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; @@ -3177,6 +3298,10 @@ found_it: return 0; } + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; return 1; @@ -3201,6 +3326,10 @@ lock_release_non_nested(struct task_struct *curr, * of held locks: */ depth = curr->lockdep_depth; + /* + * So we're all set to release this lock.. wait what lock? We don't + * own any locks, you've been drinking again? + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; @@ -3253,6 +3382,10 @@ found_it: return 0; } + /* + * We had N bottles of beer on the wall, we drank one, but now + * there's not N-1 bottles of beer left on the wall... + */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) return 0; return 1; @@ -3283,6 +3416,9 @@ static int lock_release_nested(struct task_struct *curr, return lock_release_non_nested(curr, lock, ip); curr->lockdep_depth--; + /* + * No more locks, but somehow we've got hash left over, who left it? + */ if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) return 0; @@ -3365,10 +3501,13 @@ static void check_flags(unsigned long flags) * check if not in hardirq contexts: */ if (!hardirq_count()) { - if (softirq_count()) + if (softirq_count()) { + /* like the above, but with softirqs */ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); - else + } else { + /* lick the above, does it taste good? */ DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } } if (!debug_locks) @@ -3478,9 +3617,11 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n=================================\n"); - printk( "[ BUG: bad contention detected! ]\n"); - printk( "---------------------------------\n"); + printk("\n"); + printk("=================================\n"); + printk("[ BUG: bad contention detected! ]\n"); + print_kernel_ident(); + printk("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3506,6 +3647,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) int i, contention_point, contending_point; depth = curr->lockdep_depth; + /* + * Whee, we contended on this lock, except it seems we're not + * actually trying to acquire anything much at all.. + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return; @@ -3555,6 +3700,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) int i, cpu; depth = curr->lockdep_depth; + /* + * Yay, we acquired ownership of this lock we didn't try to + * acquire, how the heck did that happen? + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return; @@ -3759,8 +3908,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) match |= class == lock->class_cache[j]; if (unlikely(match)) { - if (debug_locks_off_graph_unlock()) + if (debug_locks_off_graph_unlock()) { + /* + * We all just reset everything, how did it match? + */ WARN_ON(1); + } goto out_restore; } } @@ -3823,7 +3976,8 @@ void __init lockdep_info(void) #ifdef CONFIG_DEBUG_LOCKDEP if (lockdep_init_error) { - printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); + printk("WARNING: lockdep init error! lock-%s was acquired" + "before lockdep_init\n", lock_init_error); printk("Call stack leading to lockdep invocation was:\n"); print_stack_trace(&lockdep_init_trace, 0); } @@ -3839,9 +3993,11 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; - printk("\n=========================\n"); - printk( "[ BUG: held lock freed! ]\n"); - printk( "-------------------------\n"); + printk("\n"); + printk("=========================\n"); + printk("[ BUG: held lock freed! ]\n"); + print_kernel_ident(); + printk("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); @@ -3895,9 +4051,11 @@ static void print_held_locks_bug(struct task_struct *curr) if (debug_locks_silent) return; - printk("\n=====================================\n"); - printk( "[ BUG: lock held at task exit time! ]\n"); - printk( "-------------------------------------\n"); + printk("\n"); + printk("=====================================\n"); + printk("[ BUG: lock held at task exit time! ]\n"); + print_kernel_ident(); + printk("-------------------------------------\n"); printk("%s/%d is exiting with locks still held!\n", curr->comm, task_pid_nr(curr)); lockdep_print_held_locks(curr); @@ -3991,16 +4149,18 @@ void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; - printk("\n================================================\n"); - printk( "[ BUG: lock held when returning to user space! ]\n"); - printk( "------------------------------------------------\n"); + printk("\n"); + printk("================================================\n"); + printk("[ BUG: lock held when returning to user space! ]\n"); + print_kernel_ident(); + printk("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } } -void lockdep_rcu_dereference(const char *file, const int line) +void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; @@ -4009,15 +4169,38 @@ void lockdep_rcu_dereference(const char *file, const int line) return; #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ - printk("\n===================================================\n"); - printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); - printk( "---------------------------------------------------\n"); - printk("%s:%d invoked rcu_dereference_check() without protection!\n", - file, line); + printk("\n"); + printk("===============================\n"); + printk("[ INFO: suspicious RCU usage. ]\n"); + print_kernel_ident(); + printk("-------------------------------\n"); + printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); + + /* + * If a CPU is in the RCU-free window in idle (ie: in the section + * between rcu_idle_enter() and rcu_idle_exit(), then RCU + * considers that CPU to be in an "extended quiescent state", + * which means that RCU will be completely ignoring that CPU. + * Therefore, rcu_read_lock() and friends have absolutely no + * effect on a CPU running in that state. In other words, even if + * such an RCU-idle CPU has called rcu_read_lock(), RCU might well + * delete data structures out from under it. RCU really has no + * choice here: we need to keep an RCU-free window in idle where + * the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run + * in the idle task. + * + * So complain bitterly if someone does call rcu_read_lock(), + * rcu_read_lock_bh() and so on from extended quiescent states. + */ + if (rcu_is_cpu_idle()) + printk("RCU used illegally from extended quiescent state!\n"); + lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); } -EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); +EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 71edd2f60c02..91c32a0b612c 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -11,7 +11,7 @@ * Code for /proc/lockdep and /proc/lockdep_stats: * */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> diff --git a/kernel/module.c b/kernel/module.c index 04379f92f843..acf6ed3ebe81 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -16,7 +16,7 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/moduleloader.h> #include <linux/ftrace_event.h> #include <linux/init.h> @@ -62,12 +62,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/module.h> -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt , a...) -#endif - #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 #endif @@ -138,7 +132,6 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long *strmap; unsigned long symoffs, stroffs; struct _ddebug *debug; unsigned int num_debug; @@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name, return fsa.sym; } - DEBUGP("Failed to find symbol %s\n", name); + pr_debug("Failed to find symbol %s\n", name); return NULL; } EXPORT_SYMBOL_GPL(find_symbol); @@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b) list_for_each_entry(use, &b->source_list, source_list) { if (use->source == a) { - DEBUGP("%s uses %s!\n", a->name, b->name); + pr_debug("%s uses %s!\n", a->name, b->name); return 1; } } - DEBUGP("%s does not use %s!\n", a->name, b->name); + pr_debug("%s does not use %s!\n", a->name, b->name); return 0; } @@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b) { struct module_use *use; - DEBUGP("Allocating new usage for %s.\n", a->name); + pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { printk(KERN_WARNING "%s: out of memory loading\n", a->name); @@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod) mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; - DEBUGP("%s unusing %s\n", mod->name, i->name); + pr_debug("%s unusing %s\n", mod->name, i->name); module_put(i); list_del(&use->source_list); list_del(&use->target_list); @@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced) } } -unsigned int module_refcount(struct module *mod) +unsigned long module_refcount(struct module *mod) { - unsigned int incs = 0, decs = 0; + unsigned long incs = 0, decs = 0; int cpu; for_each_possible_cpu(cpu) @@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod) /* Since we might sleep for some time, release the mutex first */ mutex_unlock(&module_mutex); for (;;) { - DEBUGP("Looking at refcount...\n"); + pr_debug("Looking at refcount...\n"); set_current_state(TASK_UNINTERRUPTIBLE); if (module_refcount(mod) == 0) break; @@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, if (mod->state != MODULE_STATE_LIVE) { /* FIXME: if (force), slam module count and wake up waiter --RR */ - DEBUGP("%s already dying\n", mod->name); + pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; } @@ -849,12 +842,32 @@ out: return ret; } +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[l++] = 'P'; + if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[l++] = 'O'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[l++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[l++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + return l; +} + static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; int printed_something = 0; - seq_printf(m, " %u ", module_refcount(mod)); + seq_printf(m, " %lu ", module_refcount(mod)); /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. */ @@ -904,13 +917,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", module_refcount(mk->mod)); + return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); } -static struct module_attribute refcnt = { - .attr = { .name = "refcnt", .mode = 0444 }, - .show = show_refcnt, -}; +static struct module_attribute modinfo_refcnt = + __ATTR(refcnt, 0444, show_refcnt, NULL); void module_put(struct module *module) { @@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, return sprintf(buffer, "%s\n", state); } -static struct module_attribute initstate = { - .attr = { .name = "initstate", .mode = 0444 }, - .show = show_initstate, -}; +static struct module_attribute modinfo_initstate = + __ATTR(initstate, 0444, show_initstate, NULL); static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, @@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr, return count; } -struct module_attribute module_uevent = { - .attr = { .name = "uevent", .mode = 0200 }, - .store = store_uevent, -}; +struct module_attribute module_uevent = + __ATTR(uevent, 0200, NULL, store_uevent); + +static ssize_t show_coresize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->core_size); +} + +static struct module_attribute modinfo_coresize = + __ATTR(coresize, 0444, show_coresize, NULL); + +static ssize_t show_initsize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->init_size); +} + +static struct module_attribute modinfo_initsize = + __ATTR(initsize, 0444, show_initsize, NULL); + +static ssize_t show_taint(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + size_t l; + + l = module_flags_taint(mk->mod, buffer); + buffer[l++] = '\n'; + return l; +} + +static struct module_attribute modinfo_taint = + __ATTR(taint, 0444, show_taint, NULL); static struct module_attribute *modinfo_attrs[] = { + &module_uevent, &modinfo_version, &modinfo_srcversion, - &initstate, - &module_uevent, + &modinfo_initstate, + &modinfo_coresize, + &modinfo_initsize, + &modinfo_taint, #ifdef CONFIG_MODULE_UNLOAD - &refcnt, + &modinfo_refcnt, #endif NULL, }; @@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs, if (versions[i].crc == maybe_relocated(*crc, crc_owner)) return 1; - DEBUGP("Found checksum %lX vs module %lX\n", + pr_debug("Found checksum %lX vs module %lX\n", maybe_relocated(*crc, crc_owner), versions[i].crc); goto bad_version; } @@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. */ - DEBUGP("Common symbol: %s\n", name); + pr_debug("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_ABS: /* Don't need to do anything */ - DEBUGP("Absolute symbol: 0x%08lx\n", + pr_debug("Absolute symbol: 0x%08lx\n", (long)sym[i].st_value); break; @@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) info->sechdrs[i].sh_entsize = ~0UL; - DEBUGP("Core section allocation order:\n"); + pr_debug("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info) || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", name); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info) } } - DEBUGP("Init section allocation order:\n"); + pr_debug("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", sname); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ static void layout_symtab(struct module *mod, struct load_info *info) { Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst; + unsigned int i, nsrc, ndst, strtab_size; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, info->index.sym) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - for (ndst = i = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - unsigned int j = src->st_name; - while (!__test_and_set_bit(j, info->strmap) - && info->strtab[j]) - ++j; - ++ndst; + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { + strtab_size += strlen(&info->strtab[src->st_name]) + 1; + ndst++; } /* Append room for core symbols at end of core part. */ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_size += strtab_size; /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); - - /* Append room for core symbols' strings at end of core part. */ - info->stroffs = mod->core_size; - __set_bit(0, info->strmap); - mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } static void add_kallsyms(struct module *mod, const struct load_info *info) @@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); mod->core_symtab = dst = mod->module_core + info->symoffs; + mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; *dst = *src; + *s++ = 0; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; + dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(info->strmap, - dst[ndst].st_name); - ++ndst; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; } mod->core_num_syms = ndst; - - mod->core_strtab = s = mod->module_core + info->stroffs; - for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) - if (test_bit(i, info->strmap)) - *++s = mod->strtab[i]; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2487,6 +2526,9 @@ static int check_modinfo(struct module *mod, struct load_info *info) return -ENOEXEC; } + if (!get_modinfo(info, "intree")) + add_taint_module(mod, TAINT_OOT_MODULE); + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP); printk(KERN_WARNING "%s: module is from the staging directory," @@ -2618,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info) mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ - DEBUGP("final section addresses:\n"); + pr_debug("final section addresses:\n"); for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; @@ -2636,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. */ shdr->sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", - shdr->sh_addr, info->secstrings + shdr->sh_name); + pr_debug("\t0x%lx %s\n", + (long)shdr->sh_addr, info->secstrings + shdr->sh_name); } return 0; @@ -2739,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info) this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, info); - - info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) - * sizeof(long), GFP_KERNEL); - if (!info->strmap) { - err = -ENOMEM; - goto free_percpu; - } layout_symtab(mod, info); /* Allocate and move to the final place */ err = move_module(mod, info); if (err) - goto free_strmap; + goto free_percpu; /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); return mod; -free_strmap: - kfree(info->strmap); free_percpu: percpu_modfree(mod); out: @@ -2769,7 +2802,6 @@ out: /* mod is no longer valid after this! */ static void module_deallocate(struct module *mod, struct load_info *info) { - kfree(info->strmap); percpu_modfree(mod); module_free(mod, mod->module_init); module_free(mod, mod->module_core); @@ -2808,7 +2840,7 @@ static struct module *load_module(void __user *umod, struct module *mod; long err; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); /* Copy in the blobs from userspace, check they are vaguely sane. */ @@ -2878,8 +2910,7 @@ static struct module *load_module(void __user *umod, } /* This has to be done once we're sure module name is unique. */ - if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) - dynamic_debug_setup(info.debug, info.num_debug); + dynamic_debug_setup(info.debug, info.num_debug); /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -2900,8 +2931,7 @@ static struct module *load_module(void __user *umod, if (err < 0) goto unlink; - /* Get rid of temporary copy and strmap. */ - kfree(info.strmap); + /* Get rid of temporary copy. */ free_copy(&info); /* Done! */ @@ -2915,8 +2945,7 @@ static struct module *load_module(void __user *umod, module_bug_cleanup(mod); ddebug: - if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) - dynamic_debug_remove(info.debug); + dynamic_debug_remove(info.debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); @@ -3255,18 +3284,7 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[bx++] = 'P'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[bx++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[bx++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - + bx += module_flags_taint(mod, buf + bx); /* Show a - for module-is-being-unloaded */ if (mod->state == MODULE_STATE_GOING) buf[bx++] = '-'; @@ -3487,50 +3505,3 @@ void module_layout(struct module *mod, } EXPORT_SYMBOL(module_layout); #endif - -#ifdef CONFIG_TRACEPOINTS -void module_update_tracepoints(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); - mutex_unlock(&module_mutex); -} - -/* - * Returns 0 if current not found. - * Returns 1 if current found. - */ -int module_get_iter_tracepoints(struct tracepoint_iter *iter) -{ - struct module *iter_mod; - int found = 0; - - mutex_lock(&module_mutex); - list_for_each_entry(iter_mod, &modules, list) { - if (!iter_mod->taints) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - } - mutex_unlock(&module_mutex); - return found; -} -#endif diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 73da83aff418..7e3443fe1f48 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -14,7 +14,7 @@ */ #include <linux/mutex.h> #include <linux/delay.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/poison.h> #include <linux/sched.h> #include <linux/spinlock.h> diff --git a/kernel/mutex.c b/kernel/mutex.c index d607ed5dd441..89096dd8786f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -19,7 +19,7 @@ */ #include <linux/mutex.h> #include <linux/sched.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/debug_locks.h> diff --git a/kernel/notifier.c b/kernel/notifier.c index 8d7b435806c9..2d5cc4ccff7f 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -1,6 +1,6 @@ #include <linux/kdebug.h> #include <linux/kprobes.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 9aeab4b98c64..b576f7f14bc6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -14,7 +14,7 @@ */ #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/nsproxy.h> #include <linux/init_task.h> #include <linux/mnt_namespace.h> diff --git a/kernel/padata.c b/kernel/padata.c index b91941df5e63..b45259931512 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -18,7 +18,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/cpumask.h> #include <linux/err.h> #include <linux/cpu.h> diff --git a/kernel/panic.c b/kernel/panic.c index d7bb6974efb5..80aed44e345a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state) long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ + while (1) + cpu_relax(); +} + /** * panic - halt the system * @fmt: The text string to print @@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ -NORET_TYPE void panic(const char * fmt, ...) +void panic(const char *fmt, ...) { + static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; @@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...) * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... + * + * Only one CPU is allowed to execute the panic code from here. For + * multiple parallel invocations of panic, all other CPUs either + * stop themself or will wait until they are stopped by the 1st CPU + * with smp_send_stop(). */ - preempt_disable(); + if (!spin_trylock(&panic_lock)) + panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); @@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...) va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); #ifdef CONFIG_DEBUG_BUGVERBOSE - dump_stack(); + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!oops_in_progress) + dump_stack(); #endif /* @@ -177,6 +197,7 @@ static const struct tnt tnts[] = { { TAINT_WARN, 'W', ' ' }, { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, + { TAINT_OOT_MODULE, 'O', ' ' }, }; /** @@ -194,6 +215,7 @@ static const struct tnt tnts[] = { * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. + * 'O' - Out-of-tree module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ @@ -235,11 +257,20 @@ void add_taint(unsigned flag) * Can't trust the integrity of the kernel anymore. * We don't call directly debug_locks_off() because the issue * is not necessarily serious enough to set oops_in_progress to 1 - * Also we want to keep up lockdep for staging development and - * post-warning case. + * Also we want to keep up lockdep for staging/out-of-tree + * development and post-warning case. */ - if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) - printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); + switch (flag) { + case TAINT_CRAP: + case TAINT_OOT_MODULE: + case TAINT_WARN: + case TAINT_FIRMWARE_WORKAROUND: + break; + + default: + if (__debug_locks_off()) + printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); + } set_bit(flag, &tainted_mask); } diff --git a/kernel/params.c b/kernel/params.c index 22df3e0d142a..32ee04308285 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -15,7 +15,7 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <linux/moduleparam.h> +#include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> @@ -25,12 +25,6 @@ #include <linux/slab.h> #include <linux/ctype.h> -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt, a...) -#endif - /* Protects all parameters, and incidentally kmalloced_param list. */ static DEFINE_MUTEX(param_lock); @@ -67,20 +61,27 @@ static void maybe_kfree_parameter(void *param) } } -static inline char dash2underscore(char c) +static char dash2underscore(char c) { if (c == '-') return '_'; return c; } -static inline int parameq(const char *input, const char *paramname) +bool parameqn(const char *a, const char *b, size_t n) { - unsigned int i; - for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) - if (input[i] == '\0') - return 1; - return 0; + size_t i; + + for (i = 0; i < n; i++) { + if (dash2underscore(a[i]) != dash2underscore(b[i])) + return false; + } + return true; +} + +bool parameq(const char *a, const char *b) +{ + return parameqn(a, b, strlen(a)+1); } static int parse_one(char *param, @@ -98,7 +99,7 @@ static int parse_one(char *param, /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; - DEBUGP("They are equal! Calling %p\n", + pr_debug("They are equal! Calling %p\n", params[i].ops->set); mutex_lock(¶m_lock); err = params[i].ops->set(val, ¶ms[i]); @@ -108,11 +109,11 @@ static int parse_one(char *param, } if (handle_unknown) { - DEBUGP("Unknown argument: calling %p\n", handle_unknown); + pr_debug("Unknown argument: calling %p\n", handle_unknown); return handle_unknown(param, val); } - DEBUGP("Unknown argument `%s'\n", param); + pr_debug("Unknown argument `%s'\n", param); return -ENOENT; } @@ -177,7 +178,7 @@ int parse_args(const char *name, { char *param, *val; - DEBUGP("Parsing ARGS: %s\n", args); + pr_debug("Parsing ARGS: %s\n", args); /* Chew leading spaces */ args = skip_spaces(args); @@ -362,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = { }; EXPORT_SYMBOL(param_ops_invbool); +int param_set_bint(const char *val, const struct kernel_param *kp) +{ + struct kernel_param boolkp; + bool v; + int ret; + + /* Match bool exactly, by re-using it. */ + boolkp = *kp; + boolkp.arg = &v; + boolkp.flags |= KPARAM_ISBOOL; + + ret = param_set_bool(val, &boolkp); + if (ret == 0) + *(int *)kp->arg = v; + return ret; +} +EXPORT_SYMBOL(param_set_bint); + +struct kernel_param_ops param_ops_bint = { + .set = param_set_bint, + .get = param_get_int, +}; +EXPORT_SYMBOL(param_ops_bint); + /* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, diff --git a/kernel/pid.c b/kernel/pid.c index e432057f3b21..ce8e00deaccb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -27,7 +27,7 @@ */ #include <linux/mm.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) } /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it). * We want the winner to have the "later" value, because if the * "earlier" value prevails, then a pid may get reused immediately. * @@ -418,7 +420,9 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - rcu_lockdep_assert(rcu_read_lock_held()); + rcu_lockdep_assert(rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock()" + " protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Writing directly to ns' last_pid field is OK, since this field + * is volatile in a living namespace anyway and a code writing to + * it should synchronize its usage with external means. + */ + + tmp.data = ¤t->nsproxy->pid_ns->last_pid; + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { + { + .procname = "ns_last_pid", + .maxlen = sizeof(int), + .mode = 0666, /* permissions are checked in the handler */ + .proc_handler = pid_ns_ctl_handler, + }, + { } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + register_sysctl_paths(kern_path, pid_ns_ctl_table); return 0; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 640ded8f5c48..125cb67daa21 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { return now.sched < then.sched; } else { - return cputime_lt(now.cpu, then.cpu); + return now.cpu < then.cpu; } } static inline void cpu_time_add(const clockid_t which_clock, @@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { acc->sched += val.sched; } else { - acc->cpu = cputime_add(acc->cpu, val.cpu); + acc->cpu += val.cpu; } } static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, @@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { a.sched -= b.sched; } else { - a.cpu = cputime_sub(a.cpu, b.cpu); + a.cpu -= b.cpu; } return a; } /* - * Divide and limit the result to res >= 1 - * - * This is necessary to prevent signal delivery starvation, when the result of - * the division would be rounded down to 0. - */ -static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) -{ - cputime_t res = cputime_div(time, div); - - return max_t(cputime_t, res, 1); -} - -/* * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ @@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer, } else { cputime_t delta, incr; - if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) + if (now.cpu < timer->it.cpu.expires.cpu) return; incr = timer->it.cpu.incr.cpu; - delta = cputime_sub(cputime_add(now.cpu, incr), - timer->it.cpu.expires.cpu); + delta = now.cpu + incr - timer->it.cpu.expires.cpu; /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) - incr = cputime_add(incr, incr); - for (; i >= 0; incr = cputime_halve(incr), i--) { - if (cputime_lt(delta, incr)) + for (i = 0; incr < delta - incr; i++) + incr += incr; + for (; i >= 0; incr = incr >> 1, i--) { + if (delta < incr) continue; - timer->it.cpu.expires.cpu = - cputime_add(timer->it.cpu.expires.cpu, incr); + timer->it.cpu.expires.cpu += incr; timer->it_overrun += 1 << i; - delta = cputime_sub(delta, incr); + delta -= incr; } } } static inline cputime_t prof_ticks(struct task_struct *p) { - return cputime_add(p->utime, p->stime); + return p->utime + p->stime; } static inline cputime_t virt_ticks(struct task_struct *p) { @@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - times->utime = cputime_add(times->utime, t->utime); - times->stime = cputime_add(times->stime, t->stime); + times->utime += t->utime; + times->stime += t->stime; times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: @@ -258,10 +243,10 @@ out: static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) { - if (cputime_gt(b->utime, a->utime)) + if (b->utime > a->utime) a->utime = b->utime; - if (cputime_gt(b->stime, a->stime)) + if (b->stime > a->stime) a->stime = b->stime; if (b->sum_exec_runtime > a->sum_exec_runtime) @@ -282,13 +267,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) * it. */ thread_group_cputime(tsk, &sum); - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 1; update_gt_cputime(&cputimer->cputime, &sum); } else - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } /* @@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime_add(cputime.utime, cputime.stime); + cpu->cpu = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); @@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head, unsigned long long sum_exec_runtime) { struct cpu_timer_list *timer, *next; - cputime_t ptime = cputime_add(utime, stime); + cputime_t ptime = utime + stime; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (cputime_lt(timer->expires.cpu, ptime)) { - timer->expires.cpu = cputime_zero; + if (timer->expires.cpu < ptime) { + timer->expires.cpu = 0; } else { - timer->expires.cpu = cputime_sub(timer->expires.cpu, - ptime); + timer->expires.cpu -= ptime; } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (cputime_lt(timer->expires.cpu, utime)) { - timer->expires.cpu = cputime_zero; + if (timer->expires.cpu < utime) { + timer->expires.cpu = 0; } else { - timer->expires.cpu = cputime_sub(timer->expires.cpu, - utime); + timer->expires.cpu -= utime; } } @@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) struct signal_struct *const sig = tsk->signal; cleanup_timers(tsk->signal->cpu_timers, - cputime_add(tsk->utime, sig->utime), - cputime_add(tsk->stime, sig->stime), + tsk->utime + sig->utime, tsk->stime + sig->stime, tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } @@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) static inline int expires_gt(cputime_t expires, cputime_t new_exp) { - return cputime_eq(expires, cputime_zero) || - cputime_gt(expires, new_exp); + return expires == 0 || expires > new_exp; } /* @@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime_add(cputime.utime, cputime.stime); + cpu->cpu = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: cpu->cpu = cputime.utime; @@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk, unsigned long soft; maxfire = 20; - tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.prof_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { tsk->cputime_expires.prof_exp = t->expires.cpu; break; } @@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.virt_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { tsk->cputime_expires.virt_exp = t->expires.cpu; break; } @@ -999,9 +980,9 @@ static void stop_process_timers(struct signal_struct *sig) struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } static u32 onecputick; @@ -1009,20 +990,19 @@ static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, cputime_t *expires, cputime_t cur_time, int signo) { - if (cputime_eq(it->expires, cputime_zero)) + if (!it->expires) return; - if (cputime_ge(cur_time, it->expires)) { - if (!cputime_eq(it->incr, cputime_zero)) { - it->expires = cputime_add(it->expires, it->incr); + if (cur_time >= it->expires) { + if (it->incr) { + it->expires += it->incr; it->error += it->incr_error; if (it->error >= onecputick) { - it->expires = cputime_sub(it->expires, - cputime_one_jiffy); + it->expires -= cputime_one_jiffy; it->error -= onecputick; } } else { - it->expires = cputime_zero; + it->expires = 0; } trace_itimer_expire(signo == SIGPROF ? @@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (!cputime_eq(it->expires, cputime_zero) && - (cputime_eq(*expires, cputime_zero) || - cputime_lt(it->expires, *expires))) { + if (it->expires && (!*expires || it->expires < *expires)) { *expires = it->expires; } } @@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, */ static inline int task_cputime_zero(const struct task_cputime *cputime) { - if (cputime_eq(cputime->utime, cputime_zero) && - cputime_eq(cputime->stime, cputime_zero) && - cputime->sum_exec_runtime == 0) + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) return 1; return 0; } @@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk, */ thread_group_cputimer(tsk, &cputime); utime = cputime.utime; - ptime = cputime_add(utime, cputime.stime); + ptime = utime + cputime.stime; sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; - prof_expires = cputime_zero; + prof_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { + if (!--maxfire || ptime < tl->expires.cpu) { prof_expires = tl->expires.cpu; break; } @@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk, ++timers; maxfire = 20; - virt_expires = cputime_zero; + virt_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { + if (!--maxfire || utime < tl->expires.cpu) { virt_expires = tl->expires.cpu; break; } @@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk, } } x = secs_to_cputime(soft); - if (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(x, prof_expires)) { + if (!prof_expires || x < prof_expires) { prof_expires = x; } } @@ -1249,12 +1224,9 @@ out: static inline int task_cputime_expired(const struct task_cputime *sample, const struct task_cputime *expires) { - if (!cputime_eq(expires->utime, cputime_zero) && - cputime_ge(sample->utime, expires->utime)) + if (expires->utime && sample->utime >= expires->utime) return 1; - if (!cputime_eq(expires->stime, cputime_zero) && - cputime_ge(cputime_add(sample->utime, sample->stime), - expires->stime)) + if (expires->stime && sample->utime + sample->stime >= expires->stime) return 1; if (expires->sum_exec_runtime != 0 && sample->sum_exec_runtime >= expires->sum_exec_runtime) @@ -1291,9 +1263,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (sig->cputimer.running) { struct task_cputime group_sample; - spin_lock(&sig->cputimer.lock); + raw_spin_lock(&sig->cputimer.lock); group_sample = sig->cputimer.cputime; - spin_unlock(&sig->cputimer.lock); + raw_spin_unlock(&sig->cputimer.lock); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; @@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be relative, *newval argument is relative and we update * it to be absolute. */ - if (!cputime_eq(*oldval, cputime_zero)) { - if (cputime_le(*oldval, now.cpu)) { + if (*oldval) { + if (*oldval <= now.cpu) { /* Just about to fire. */ *oldval = cputime_one_jiffy; } else { - *oldval = cputime_sub(*oldval, now.cpu); + *oldval -= now.cpu; } } - if (cputime_eq(*newval, cputime_zero)) + if (!*newval) return; - *newval = cputime_add(*newval, now.cpu); + *newval += now.cpu; } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4556182527f3..69185ae6b701 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -46,7 +46,7 @@ #include <linux/syscalls.h> #include <linux/wait.h> #include <linux/workqueue.h> -#include <linux/module.h> +#include <linux/export.h> /* * Management arrays for POSIX timers. Timers are kept in slab memory diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3744c594b19b..deb5461e3216 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,6 +27,7 @@ config HIBERNATION select HIBERNATE_CALLBACKS select LZO_COMPRESS select LZO_DECOMPRESS + select CRC32 ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -65,6 +66,9 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.txt>. +config ARCH_SAVE_PAGE_KEYS + bool + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION @@ -235,3 +239,7 @@ config PM_GENERIC_DOMAINS config PM_GENERIC_DOMAINS_RUNTIME def_bool y depends on PM_RUNTIME && PM_GENERIC_DOMAINS + +config CPU_PM + bool + depends on SUSPEND || CPU_IDLE diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c5ebc6a90643..07e0e28ffba7 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,8 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-$(CONFIG_PM) += main.o -obj-$(CONFIG_PM_SLEEP) += console.o +obj-$(CONFIG_PM) += main.o qos.o +obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 218e5af90156..b1dc456474b5 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -1,5 +1,5 @@ /* - * drivers/power/process.c - Functions for saving/restoring console. + * Functions for saving/restoring console. * * Originally from swsusp. */ @@ -10,7 +10,6 @@ #include <linux/module.h> #include "power.h" -#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; @@ -32,4 +31,3 @@ void pm_restore_console(void) vt_kmsg_redirect(orig_kmsg); } } -#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8f7b1db1ece1..6d6d28870335 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -9,11 +9,13 @@ * This file is released under the GPLv2. */ +#include <linux/export.h> #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> +#include <linux/async.h> #include <linux/kmod.h> #include <linux/delay.h> #include <linux/fs.h> @@ -29,18 +31,18 @@ #include "power.h" -static int nocompress = 0; -static int noresume = 0; +static int nocompress; +static int noresume; +static int resume_wait; +static int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; -int in_suspend __nosavedata = 0; +int in_suspend __nosavedata; enum { HIBERNATION_INVALID, HIBERNATION_PLATFORM, - HIBERNATION_TEST, - HIBERNATION_TESTPROC, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, /* keep last */ @@ -51,6 +53,8 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; +bool freezer_test_done; + static const struct platform_hibernation_ops *hibernation_ops; /** @@ -65,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) WARN_ON(1); return; } - mutex_lock(&pm_mutex); + lock_system_sleep(); hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } static bool entering_platform_hibernation; @@ -90,15 +94,6 @@ static void hibernation_debug_sleep(void) mdelay(5000); } -static int hibernation_testmode(int mode) -{ - if (hibernation_mode == mode) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} - static int hibernation_test(int level) { if (pm_test_level == level) { @@ -108,7 +103,6 @@ static int hibernation_test(int level) return 0; } #else /* !CONFIG_PM_DEBUG */ -static int hibernation_testmode(int mode) { return 0; } static int hibernation_test(int level) { return 0; } #endif /* !CONFIG_PM_DEBUG */ @@ -272,8 +266,7 @@ static int create_image(int platform_mode) goto Platform_finish; error = disable_nonboot_cpus(); - if (error || hibernation_test(TEST_CPUS) - || hibernation_testmode(HIBERNATION_TEST)) + if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; local_irq_disable(); @@ -327,38 +320,54 @@ static int create_image(int platform_mode) */ int hibernation_snapshot(int platform_mode) { - pm_message_t msg = PMSG_RECOVER; + pm_message_t msg; int error; error = platform_begin(platform_mode); if (error) goto Close; - error = dpm_prepare(PMSG_FREEZE); - if (error) - goto Complete_devices; - /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); if (error) - goto Complete_devices; + goto Close; + + error = freeze_kernel_threads(); + if (error) + goto Cleanup; + + if (hibernation_test(TEST_FREEZER)) { + + /* + * Indicate to the caller that we are returning due to a + * successful freezer test. + */ + freezer_test_done = true; + goto Cleanup; + } + + error = dpm_prepare(PMSG_FREEZE); + if (error) { + dpm_complete(PMSG_RECOVER); + goto Cleanup; + } suspend_console(); pm_restrict_gfp_mask(); + error = dpm_suspend(PMSG_FREEZE); - if (error) - goto Recover_platform; - if (hibernation_test(TEST_DEVICES)) - goto Recover_platform; + if (error || hibernation_test(TEST_DEVICES)) + platform_recover(platform_mode); + else + error = create_image(platform_mode); - error = create_image(platform_mode); /* - * Control returns here (1) after the image has been created or the + * In the case that we call create_image() above, the control + * returns here (1) after the image has been created or the * image creation has failed and (2) after a successful restore. */ - Resume_devices: /* We may need to release the preallocated image pages here. */ if (error || !in_suspend) swsusp_free(); @@ -370,17 +379,15 @@ int hibernation_snapshot(int platform_mode) pm_restore_gfp_mask(); resume_console(); - - Complete_devices: dpm_complete(msg); Close: platform_end(platform_mode); return error; - Recover_platform: - platform_recover(platform_mode); - goto Resume_devices; + Cleanup: + swsusp_free(); + goto Close; } /** @@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode) * @platform_mode: If set, use platform driver to prepare for the transition. * * This routine must be called with pm_mutex held. If it is successful, control - * reappears in the restored target kernel in hibernation_snaphot(). + * reappears in the restored target kernel in hibernation_snapshot(). */ int hibernation_restore(int platform_mode) { @@ -565,9 +572,6 @@ int hibernation_platform_enter(void) static void power_down(void) { switch (hibernation_mode) { - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; case HIBERNATION_REBOOT: kernel_restart(NULL); break; @@ -586,17 +590,6 @@ static void power_down(void) while(1); } -static int prepare_processes(void) -{ - int error = 0; - - if (freeze_processes()) { - error = -EBUSY; - thaw_processes(); - } - return error; -} - /** * hibernate - Carry out system hibernation, including saving the image. */ @@ -604,7 +597,7 @@ int hibernate(void) { int error; - mutex_lock(&pm_mutex); + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; @@ -629,19 +622,17 @@ int hibernate(void) sys_sync(); printk("done.\n"); - error = prepare_processes(); + error = freeze_processes(); if (error) goto Finish; - if (hibernation_test(TEST_FREEZER)) - goto Thaw; - - if (hibernation_testmode(HIBERNATION_TESTPROC)) - goto Thaw; - error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error) goto Thaw; + if (freezer_test_done) { + freezer_test_done = false; + goto Thaw; + } if (in_suspend) { unsigned int flags = 0; @@ -650,6 +641,9 @@ int hibernate(void) flags |= SF_PLATFORM_MODE; if (nocompress) flags |= SF_NOCOMPRESS_MODE; + else + flags |= SF_CRC32_MODE; + pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); @@ -671,7 +665,7 @@ int hibernate(void) pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error; } @@ -724,6 +718,12 @@ static int software_resume(void) pr_debug("PM: Checking hibernation image partition %s\n", resume_file); + if (resume_delay) { + printk(KERN_INFO "Waiting %dsec before reading resume device...\n", + resume_delay); + ssleep(resume_delay); + } + /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { @@ -732,6 +732,13 @@ static int software_resume(void) * to wait for this to finish. */ wait_for_device_probe(); + + if (resume_wait) { + while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) + msleep(10); + async_synchronize_full(); + } + /* * We can't depend on SCSI devices being available after loading * one of their modules until scsi_complete_async_scans() is @@ -772,11 +779,13 @@ static int software_resume(void) goto close_finish; error = create_basic_memory_bitmaps(); - if (error) + if (error) { + usermodehelper_enable(); goto close_finish; + } pr_debug("PM: Preparing processes for restore.\n"); - error = prepare_processes(); + error = freeze_processes(); if (error) { swsusp_close(FMODE_READ); goto Done; @@ -816,8 +825,6 @@ static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", - [HIBERNATION_TEST] = "test", - [HIBERNATION_TESTPROC] = "testproc", }; /* @@ -826,17 +833,15 @@ static const char * const hibernation_modes[] = { * Hibernation can be handled in several ways. There are a few different ways * to put the system into the sleep state: using the platform driver (e.g. ACPI * or other hibernation_ops), powering it off or rebooting it (for testing - * mostly), or using one of the two available test modes. + * mostly). * * The sysfs file /sys/power/disk provides an interface for selecting the * hibernation mode to use. Reading from this file causes the available modes - * to be printed. There are 5 modes that can be supported: + * to be printed. There are 3 modes that can be supported: * * 'platform' * 'shutdown' * 'reboot' - * 'test' - * 'testproc' * * If a platform hibernation driver is in use, 'platform' will be supported * and will be used by default. Otherwise, 'shutdown' will be used by default. @@ -860,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -890,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -902,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: @@ -918,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, if (!error) pr_debug("PM: Hibernation mode set to '%s'\n", hibernation_modes[mode]); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } @@ -945,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (maj != MAJOR(res) || min != MINOR(res)) goto out; - mutex_lock(&pm_mutex); + lock_system_sleep(); swsusp_resume_device = res; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); printk(KERN_INFO "PM: Starting manual resume from disk\n"); noresume = 0; software_resume(); @@ -1060,7 +1061,21 @@ static int __init noresume_setup(char *str) return 1; } +static int __init resumewait_setup(char *str) +{ + resume_wait = 1; + return 1; +} + +static int __init resumedelay_setup(char *str) +{ + resume_delay = simple_strtoul(str, NULL, 0); + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); +__setup("resumewait", resumewait_setup); +__setup("resumedelay=", resumedelay_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 6c601f871964..9824b41e5a18 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -3,15 +3,18 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * + * * This file is released under the GPLv2 * */ +#include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/resume-trace.h> #include <linux/workqueue.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include "power.h" @@ -113,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -123,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } @@ -131,6 +134,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_test); #endif /* CONFIG_PM_DEBUG */ +#ifdef CONFIG_DEBUG_FS +static char *suspend_step_name(enum suspend_stat_step step) +{ + switch (step) { + case SUSPEND_FREEZE: + return "freeze"; + case SUSPEND_PREPARE: + return "prepare"; + case SUSPEND_SUSPEND: + return "suspend"; + case SUSPEND_SUSPEND_NOIRQ: + return "suspend_noirq"; + case SUSPEND_RESUME_NOIRQ: + return "resume_noirq"; + case SUSPEND_RESUME: + return "resume"; + default: + return ""; + } +} + +static int suspend_stats_show(struct seq_file *s, void *unused) +{ + int i, index, last_dev, last_errno, last_step; + + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; + last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + last_errno %= REC_FAILED_NUM; + last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + last_step %= REC_FAILED_NUM; + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + "success", suspend_stats.success, + "fail", suspend_stats.fail, + "failed_freeze", suspend_stats.failed_freeze, + "failed_prepare", suspend_stats.failed_prepare, + "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_noirq", + suspend_stats.failed_suspend_noirq, + "failed_resume", suspend_stats.failed_resume, + "failed_resume_noirq", + suspend_stats.failed_resume_noirq); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", + suspend_stats.failed_devs[last_dev]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_dev + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_stats.failed_devs[index]); + } + seq_printf(s, " last_failed_errno:\t%-d\n", + suspend_stats.errno[last_errno]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_errno + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-d\n", + suspend_stats.errno[index]); + } + seq_printf(s, " last_failed_step:\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[last_step])); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_step + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[index])); + } + + return 0; +} + +static int suspend_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, suspend_stats_show, NULL); +} + +static const struct file_operations suspend_stats_operations = { + .open = suspend_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init pm_debugfs_init(void) +{ + debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, + NULL, NULL, &suspend_stats_operations); + return 0; +} + +late_initcall(pm_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ + #endif /* CONFIG_PM_SLEEP */ struct kobject *power_kobj; @@ -142,7 +240,7 @@ struct kobject *power_kobj; * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and * 'disk' (Suspend-to-Disk). * - * store() accepts one of those strings, translates it into the + * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -184,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, /* First, check if we are requested to hibernate */ if (len == 4 && !strncmp(buf, "disk", len)) { error = hibernate(); - goto Exit; + goto Exit; } #ifdef CONFIG_SUSPEND @@ -192,8 +290,14 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) break; } - if (state < PM_SUSPEND_MAX && *s) + if (state < PM_SUSPEND_MAX && *s) { error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else + suspend_stats.success++; + } #endif Exit: diff --git a/kernel/power/power.h b/kernel/power/power.h index 9a00a0a26280..0c4defe6d3b8 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) /* kernel/power/hibernate.c */ +extern bool freezer_test_done; + extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); @@ -146,6 +148,7 @@ extern int swsusp_swap_in_use(void); */ #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 +#define SF_CRC32_MODE 4 /* kernel/power/hibernate.c */ extern int swsusp_check(void); @@ -228,7 +231,8 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - return freeze_processes(); + int error = freeze_processes(); + return error ? : freeze_kernel_threads(); } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index 0cf3a27a6c9d..77274c9ba2f1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,16 +22,7 @@ */ #define TIMEOUT (20 * HZ) -static inline int freezable(struct task_struct * p) -{ - if ((p == current) || - (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) - return 0; - return 1; -} - -static int try_to_freeze_tasks(bool sig_only) +static int try_to_freeze_tasks(bool user_only) { struct task_struct *g, *p; unsigned long end_time; @@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only) end_time = jiffies + TIMEOUT; - if (!sig_only) + if (!user_only) freeze_workqueues_begin(); while (true) { todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (frozen(p) || !freezable(p)) - continue; - - if (!freeze_task(p, sig_only)) + if (p == current || !freeze_task(p)) continue; /* @@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only) } while_each_thread(g, p); read_unlock(&tasklist_lock); - if (!sig_only) { + if (!user_only) { wq_busy = freeze_workqueues_busy(); todo += wq_busy; } @@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs = elapsed_csecs64; if (todo) { - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. - */ printk("\n"); printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", @@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - thaw_workqueues(); - read_lock(&tasklist_lock); do_each_thread(g, p) { - task_lock(p); - if (!wakeup && freezing(p) && !freezer_should_skip(p)) + if (!wakeup && !freezer_should_skip(p) && + p != current && freezing(p) && !frozen(p)) sched_show_task(p); - cancel_freezing(p); - task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } else { @@ -135,60 +114,76 @@ static int try_to_freeze_tasks(bool sig_only) } /** - * freeze_processes - tell processes to enter the refrigerator + * freeze_processes - Signal user space processes to enter the refrigerator. + * + * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_processes(void) { int error; + if (!pm_freezing) + atomic_inc(&system_freezing_cnt); + printk("Freezing user space processes ... "); + pm_freezing = true; error = try_to_freeze_tasks(true); + if (!error) { + printk("done."); + oom_killer_disable(); + } + printk("\n"); + BUG_ON(in_atomic()); + if (error) - goto Exit; - printk("done.\n"); + thaw_processes(); + return error; +} + +/** + * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + * + * On success, returns 0. On failure, -errno and system is fully thawed. + */ +int freeze_kernel_threads(void) +{ + int error; printk("Freezing remaining freezable tasks ... "); + pm_nosig_freezing = true; error = try_to_freeze_tasks(false); - if (error) - goto Exit; - printk("done."); + if (!error) + printk("done."); - oom_killer_disable(); - Exit: - BUG_ON(in_atomic()); printk("\n"); + BUG_ON(in_atomic()); + if (error) + thaw_processes(); return error; } -static void thaw_tasks(bool nosig_only) +void thaw_processes(void) { struct task_struct *g, *p; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!freezable(p)) - continue; + if (pm_freezing) + atomic_dec(&system_freezing_cnt); + pm_freezing = false; + pm_nosig_freezing = false; - if (nosig_only && should_send_signal(p)) - continue; + oom_killer_enable(); - if (cgroup_freezing_or_frozen(p)) - continue; + printk("Restarting tasks ... "); - thaw_process(p); + thaw_workqueues(); + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -} - -void thaw_processes(void) -{ - oom_killer_enable(); - printk("Restarting tasks ... "); - thaw_workqueues(); - thaw_tasks(true); - thaw_tasks(false); schedule(); printk("done.\n"); } diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c index 37f05d0f0793..995e3bd3417b 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/power/qos.c @@ -29,7 +29,7 @@ /*#define DEBUG*/ -#include <linux/pm_qos_params.h> +#include <linux/pm_qos.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> @@ -43,64 +43,61 @@ #include <linux/kernel.h> #include <linux/uaccess.h> +#include <linux/export.h> /* - * locking rule: all changes to requests or notifiers lists + * locking rule: all changes to constraints or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -enum pm_qos_type { - PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ -}; - -/* - * Note: The lockless read path depends on the CPU accessing - * target_value atomically. Atomic access is only guaranteed on all CPU - * types linux supports for 32 bit quantites - */ struct pm_qos_object { - struct plist_head requests; - struct blocking_notifier_head *notifiers; + struct pm_qos_constraints *constraints; struct miscdevice pm_qos_power_miscdev; char *name; - s32 target_value; /* Do not change to 64 bit */ - s32 default_value; - enum pm_qos_type type; }; static DEFINE_SPINLOCK(pm_qos_lock); static struct pm_qos_object null_pm_qos; + static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_object cpu_dma_pm_qos = { - .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), - .notifiers = &cpu_dma_lat_notifier, - .name = "cpu_dma_latency", +static struct pm_qos_constraints cpu_dma_constraints = { + .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, + .notifiers = &cpu_dma_lat_notifier, +}; +static struct pm_qos_object cpu_dma_pm_qos = { + .constraints = &cpu_dma_constraints, + .name = "cpu_dma_latency", }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -static struct pm_qos_object network_lat_pm_qos = { - .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), - .notifiers = &network_lat_notifier, - .name = "network_latency", +static struct pm_qos_constraints network_lat_constraints = { + .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN + .type = PM_QOS_MIN, + .notifiers = &network_lat_notifier, +}; +static struct pm_qos_object network_lat_pm_qos = { + .constraints = &network_lat_constraints, + .name = "network_latency", }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); -static struct pm_qos_object network_throughput_pm_qos = { - .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), - .notifiers = &network_throughput_notifier, - .name = "network_throughput", +static struct pm_qos_constraints network_tput_constraints = { + .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, + .notifiers = &network_throughput_notifier, +}; +static struct pm_qos_object network_throughput_pm_qos = { + .constraints = &network_tput_constraints, + .name = "network_throughput", }; @@ -127,17 +124,17 @@ static const struct file_operations pm_qos_power_fops = { }; /* unlocked internal variant */ -static inline int pm_qos_get_value(struct pm_qos_object *o) +static inline int pm_qos_get_value(struct pm_qos_constraints *c) { - if (plist_head_empty(&o->requests)) - return o->default_value; + if (plist_head_empty(&c->list)) + return c->default_value; - switch (o->type) { + switch (c->type) { case PM_QOS_MIN: - return plist_first(&o->requests)->prio; + return plist_first(&c->list)->prio; case PM_QOS_MAX: - return plist_last(&o->requests)->prio; + return plist_last(&c->list)->prio; default: /* runtime check for not using enum */ @@ -145,69 +142,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) } } -static inline s32 pm_qos_read_value(struct pm_qos_object *o) +s32 pm_qos_read_value(struct pm_qos_constraints *c) { - return o->target_value; + return c->target_value; } -static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) +static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { - o->target_value = value; + c->target_value = value; } -static void update_target(struct pm_qos_object *o, struct plist_node *node, - int del, int value) +/** + * pm_qos_update_target - manages the constraints list and calls the notifiers + * if needed + * @c: constraints data struct + * @node: request to add to the list, to update or to remove + * @action: action to take on the constraints list + * @value: value of the request to add or update + * + * This function returns 1 if the aggregated constraint value has changed, 0 + * otherwise. + */ +int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, + enum pm_qos_req_action action, int value) { unsigned long flags; - int prev_value, curr_value; + int prev_value, curr_value, new_value; spin_lock_irqsave(&pm_qos_lock, flags); - prev_value = pm_qos_get_value(o); - /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ - if (value != PM_QOS_DEFAULT_VALUE) { + prev_value = pm_qos_get_value(c); + if (value == PM_QOS_DEFAULT_VALUE) + new_value = c->default_value; + else + new_value = value; + + switch (action) { + case PM_QOS_REMOVE_REQ: + plist_del(node, &c->list); + break; + case PM_QOS_UPDATE_REQ: /* * to change the list, we atomically remove, reinit * with new value and add, then see if the extremal * changed */ - plist_del(node, &o->requests); - plist_node_init(node, value); - plist_add(node, &o->requests); - } else if (del) { - plist_del(node, &o->requests); - } else { - plist_add(node, &o->requests); + plist_del(node, &c->list); + case PM_QOS_ADD_REQ: + plist_node_init(node, new_value); + plist_add(node, &c->list); + break; + default: + /* no action */ + ; } - curr_value = pm_qos_get_value(o); - pm_qos_set_value(o, curr_value); + + curr_value = pm_qos_get_value(c); + pm_qos_set_value(c, curr_value); + spin_unlock_irqrestore(&pm_qos_lock, flags); - if (prev_value != curr_value) - blocking_notifier_call_chain(o->notifiers, + if (prev_value != curr_value) { + blocking_notifier_call_chain(c->notifiers, (unsigned long)curr_value, NULL); -} - -static int register_pm_qos_misc(struct pm_qos_object *qos) -{ - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - - return misc_register(&qos->pm_qos_power_miscdev); -} - -static int find_pm_qos_object_by_minor(int minor) -{ - int pm_qos_class; - - for (pm_qos_class = 0; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; + return 1; + } else { + return 0; } - return -1; } /** @@ -218,11 +219,11 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_request(int pm_qos_class) { - return pm_qos_read_value(pm_qos_array[pm_qos_class]); + return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); } EXPORT_SYMBOL_GPL(pm_qos_request); -int pm_qos_request_active(struct pm_qos_request_list *req) +int pm_qos_request_active(struct pm_qos_request *req) { return req->pm_qos_class != 0; } @@ -230,40 +231,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active); /** * pm_qos_add_request - inserts new qos request into the list - * @dep: pointer to a preallocated handle + * @req: pointer to a preallocated handle * @pm_qos_class: identifies which list of qos request to use * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters and initializes the pm_qos_request_list + * for the pm_qos_class of parameters and initializes the pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. */ -void pm_qos_add_request(struct pm_qos_request_list *dep, +void pm_qos_add_request(struct pm_qos_request *req, int pm_qos_class, s32 value) { - struct pm_qos_object *o = pm_qos_array[pm_qos_class]; - int new_value; + if (!req) /*guard against callers passing in null */ + return; - if (pm_qos_request_active(dep)) { + if (pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); return; } - if (value == PM_QOS_DEFAULT_VALUE) - new_value = o->default_value; - else - new_value = value; - plist_node_init(&dep->list, new_value); - dep->pm_qos_class = pm_qos_class; - update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); + req->pm_qos_class = pm_qos_class; + pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, + &req->node, PM_QOS_ADD_REQ, value); } EXPORT_SYMBOL_GPL(pm_qos_add_request); /** * pm_qos_update_request - modifies an existing qos request - * @pm_qos_req : handle to list element holding a pm_qos request to use + * @req : handle to list element holding a pm_qos request to use * @value: defines the qos request * * Updates an existing qos request for the pm_qos_class of parameters along @@ -271,56 +268,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); * * Attempts are made to make this code callable on hot code paths. */ -void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, +void pm_qos_update_request(struct pm_qos_request *req, s32 new_value) { - s32 temp; - struct pm_qos_object *o; - - if (!pm_qos_req) /*guard against callers passing in null */ + if (!req) /*guard against callers passing in null */ return; - if (!pm_qos_request_active(pm_qos_req)) { + if (!pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); return; } - o = pm_qos_array[pm_qos_req->pm_qos_class]; - - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = o->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->list.prio) - update_target(o, &pm_qos_req->list, 0, temp); + if (new_value != req->node.prio) + pm_qos_update_target( + pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); /** * pm_qos_remove_request - modifies an existing qos request - * @pm_qos_req: handle to request list element + * @req: handle to request list element * - * Will remove pm qos request from the list of requests and + * Will remove pm qos request from the list of constraints and * recompute the current target value for the pm_qos_class. Call this * on slow code paths. */ -void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) +void pm_qos_remove_request(struct pm_qos_request *req) { - struct pm_qos_object *o; - - if (pm_qos_req == NULL) + if (!req) /*guard against callers passing in null */ return; /* silent return to keep pcm code cleaner */ - if (!pm_qos_request_active(pm_qos_req)) { + if (!pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); return; } - o = pm_qos_array[pm_qos_req->pm_qos_class]; - update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); - memset(pm_qos_req, 0, sizeof(*pm_qos_req)); + pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_DEFAULT_VALUE); + memset(req, 0, sizeof(*req)); } EXPORT_SYMBOL_GPL(pm_qos_remove_request); @@ -337,7 +325,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) int retval; retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->notifiers, notifier); + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); return retval; } @@ -356,34 +345,57 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) int retval; retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->notifiers, notifier); + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); return retval; } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); +/* User space interface to PM QoS classes via misc devices */ +static int register_pm_qos_misc(struct pm_qos_object *qos) +{ + qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; + qos->pm_qos_power_miscdev.name = qos->name; + qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + + return misc_register(&qos->pm_qos_power_miscdev); +} + +static int find_pm_qos_object_by_minor(int minor) +{ + int pm_qos_class; + + for (pm_qos_class = 0; + pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { + if (minor == + pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) + return pm_qos_class; + } + return -1; +} + static int pm_qos_power_open(struct inode *inode, struct file *filp) { long pm_qos_class; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); + struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); filp->private_data = req; - if (filp->private_data) - return 0; + return 0; } return -EPERM; } static int pm_qos_power_release(struct inode *inode, struct file *filp) { - struct pm_qos_request_list *req; + struct pm_qos_request *req; req = filp->private_data; pm_qos_remove_request(req); @@ -398,17 +410,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, { s32 value; unsigned long flags; - struct pm_qos_object *o; - struct pm_qos_request_list *pm_qos_req = filp->private_data; + struct pm_qos_request *req = filp->private_data; - if (!pm_qos_req) + if (!req) return -EINVAL; - if (!pm_qos_request_active(pm_qos_req)) + if (!pm_qos_request_active(req)) return -EINVAL; - o = pm_qos_array[pm_qos_req->pm_qos_class]; spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(o); + value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); @@ -418,7 +428,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - struct pm_qos_request_list *pm_qos_req; + struct pm_qos_request *req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) @@ -449,8 +459,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, return -EINVAL; } - pm_qos_req = filp->private_data; - pm_qos_update_request(pm_qos_req, value); + req = filp->private_data; + pm_qos_update_request(req, value); return count; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 06efa54f93d6..1cf88900ec4f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) PageReserved(page)) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -1339,6 +1345,9 @@ int hibernate_preallocate_memory(void) count += highmem; count -= totalreserve_pages; + /* Add number of pages required for page keys (s390 only). */ + size += page_key_additional_pages(saveable); + /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); @@ -1662,6 +1671,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; + /* Save page key for data page (s390 only). */ + page_key_read(buf + j); } } @@ -1821,6 +1832,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; + /* Extract and buffer page key for data page (s390 only). */ + page_key_memorize(buf + j); + if (memory_bm_pfn_present(bm, buf[j])) memory_bm_set_bit(bm, buf[j]); else @@ -2223,6 +2237,11 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + /* Allocate buffer for page keys. */ + error = page_key_alloc(nr_copy_pages); + if (error) + return error; + } else if (handle->cur <= nr_meta_pages + 1) { error = unpack_orig_pfns(buffer, ©_bm); if (error) @@ -2243,6 +2262,8 @@ int snapshot_write_next(struct snapshot_handle *handle) } } else { copy_last_highmem_page(); + /* Restore page key for data page (s390 only). */ + page_key_write(handle->buffer); handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); @@ -2264,6 +2285,9 @@ int snapshot_write_next(struct snapshot_handle *handle) void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); + /* Restore page key for data page (s390 only). */ + page_key_write(handle->buffer); + page_key_free(); /* Free only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b6b71ad2208f..4fd51beed879 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -12,6 +12,7 @@ #include <linux/delay.h> #include <linux/errno.h> #include <linux/init.h> +#include <linux/kmod.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/syscalls.h> @@ -21,6 +22,7 @@ #include <linux/list.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/export.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include <trace/events/power.h> @@ -40,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops; */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - mutex_lock(&pm_mutex); + lock_system_sleep(); suspend_ops = ops; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } EXPORT_SYMBOL_GPL(suspend_set_ops); @@ -107,7 +109,8 @@ static int suspend_prepare(void) if (!error) return 0; - suspend_thaw_processes(); + suspend_stats.failed_freeze++; + dpm_save_failed_step(SUSPEND_FREEZE); usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); @@ -315,8 +318,16 @@ int enter_state(suspend_state_t state) */ int pm_suspend(suspend_state_t state) { - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) - return enter_state(state); + int ret; + if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { + ret = enter_state(state); + if (ret) { + suspend_stats.fail++; + dpm_save_failed_errno(ret); + } else + suspend_stats.success++; + return ret; + } return -EINVAL; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c97c3a0eee3..3739ecced085 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -18,7 +18,6 @@ #include <linux/bitops.h> #include <linux/genhd.h> #include <linux/device.h> -#include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/swap.h> @@ -27,6 +26,10 @@ #include <linux/slab.h> #include <linux/lzo.h> #include <linux/vmalloc.h> +#include <linux/cpumask.h> +#include <linux/atomic.h> +#include <linux/kthread.h> +#include <linux/crc32.h> #include "power.h" @@ -43,8 +46,7 @@ * allocated and populated one at a time, so we only need one memory * page to set up the entire structure. * - * During resume we also only need to use one swap_map_page structure - * at a time. + * During resume we pick up all swap_map_page structures into a list. */ #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) @@ -54,6 +56,11 @@ struct swap_map_page { sector_t next_swap; }; +struct swap_map_page_list { + struct swap_map_page *map; + struct swap_map_page_list *next; +}; + /** * The swap_map_handle structure is used for handling swap in * a file-alike way @@ -61,13 +68,18 @@ struct swap_map_page { struct swap_map_handle { struct swap_map_page *cur; + struct swap_map_page_list *maps; sector_t cur_swap; sector_t first_sector; unsigned int k; + unsigned long nr_free_pages, written; + u32 crc32; }; struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - + sizeof(u32)]; + u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; @@ -199,6 +211,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; + if (flags & SF_CRC32_MODE) + swsusp_header->crc32 = handle->crc32; error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { @@ -245,6 +259,7 @@ static int swsusp_swap_check(void) static int write_page(void *buf, sector_t offset, struct bio **bio_chain) { void *src; + int ret; if (!offset) return -ENOSPC; @@ -254,9 +269,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (src) { copy_page(src, buf); } else { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - src = buf; + ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (src) { + copy_page(src, buf); + } else { + WARN_ON_ONCE(1); + bio_chain = NULL; /* Go synchronous */ + src = buf; + } } } else { src = buf; @@ -293,6 +316,8 @@ static int get_swap_writer(struct swap_map_handle *handle) goto err_rel; } handle->k = 0; + handle->nr_free_pages = nr_free_pages() >> 1; + handle->written = 0; handle->first_sector = handle->cur_swap; return 0; err_rel: @@ -316,20 +341,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = hib_wait_on_bio_chain(bio_chain); - if (error) - goto out; offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, NULL); + error = write_page(handle->cur, handle->cur_swap, bio_chain); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } + if (bio_chain && ++handle->written > handle->nr_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + handle->written = 0; + } out: return error; } @@ -372,6 +400,13 @@ static int swap_writer_finish(struct swap_map_handle *handle, LZO_HEADER, PAGE_SIZE) #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) +/* Maximum number of threads for compression/decompression. */ +#define LZO_THREADS 3 + +/* Maximum number of pages for read buffering. */ +#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) + + /** * save_image - save the suspend image data */ @@ -419,6 +454,92 @@ static int save_image(struct swap_map_handle *handle, return ret; } +/** + * Structure used for CRC32. + */ +struct crc_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + unsigned run_threads; /* nr current threads */ + wait_queue_head_t go; /* start crc update */ + wait_queue_head_t done; /* crc update done */ + u32 *crc32; /* points to handle's crc32 */ + size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ + unsigned char *unc[LZO_THREADS]; /* uncompressed data */ +}; + +/** + * CRC32 update function that runs in its own thread. + */ +static int crc32_threadfn(void *data) +{ + struct crc_data *d = data; + unsigned i; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + for (i = 0; i < d->run_threads; i++) + *d->crc32 = crc32_le(*d->crc32, + d->unc[i], *d->unc_len[i]); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} +/** + * Structure used for LZO data compression. + */ +struct cmp_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start compression */ + wait_queue_head_t done; /* compression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ +}; + +/** + * Compression function that runs in its own thread. + */ +static int lzo_compress_threadfn(void *data) +{ + struct cmp_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->ret = lzo1x_1_compress(d->unc, d->unc_len, + d->cmp + LZO_HEADER, &d->cmp_len, + d->wrk); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} /** * save_image_lzo - Save the suspend image data compressed with LZO. @@ -437,42 +558,93 @@ static int save_image_lzo(struct swap_map_handle *handle, struct bio *bio; struct timeval start; struct timeval stop; - size_t off, unc_len, cmp_len; - unsigned char *unc, *cmp, *wrk, *page; + size_t off; + unsigned thr, run_threads, nr_threads; + unsigned char *page = NULL; + struct cmp_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for compression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out_clean; } - wrk = vmalloc(LZO1X_1_MEM_COMPRESS); - if (!wrk) { - printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); - free_page((unsigned long)page); - return -ENOMEM; + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct cmp_data, go)); - unc = vmalloc(LZO_UNC_SIZE); - if (!unc) { - printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); - vfree(wrk); - free_page((unsigned long)page); - return -ENOMEM; + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the compression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_compress_threadfn, + &data[thr], + "image_compress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start compression threads\n"); + ret = -ENOMEM; + goto out_clean; + } } - cmp = vmalloc(LZO_CMP_SIZE); - if (!cmp) { - printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); - vfree(unc); - vfree(wrk); - free_page((unsigned long)page); - return -ENOMEM; + /* + * Adjust number of free pages after all allocations have been done. + * We don't want to run out of pages when writing. + */ + handle->nr_free_pages = nr_free_pages() >> 1; + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; } printk(KERN_INFO + "PM: Using %u thread(s) for compression.\n" "PM: Compressing and saving image data (%u pages) ... ", - nr_to_write); + nr_threads, nr_to_write); m = nr_to_write / 100; if (!m) m = 1; @@ -480,55 +652,83 @@ static int save_image_lzo(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for (;;) { - for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { - ret = snapshot_read_next(snapshot); - if (ret < 0) - goto out_finish; - - if (!ret) + for (thr = 0; thr < nr_threads; thr++) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(data[thr].unc + off, + data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", + nr_pages / m); + nr_pages++; + } + if (!off) break; - memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); + data[thr].unc_len = off; - if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); } - if (!off) + if (!thr) break; - unc_len = off; - ret = lzo1x_1_compress(unc, unc_len, - cmp + LZO_HEADER, &cmp_len, wrk); - if (ret < 0) { - printk(KERN_ERR "PM: LZO compression failed\n"); - break; - } + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); - if (unlikely(!cmp_len || - cmp_len > lzo1x_worst_compress(unc_len))) { - printk(KERN_ERR "PM: Invalid LZO compressed length\n"); - ret = -1; - break; - } + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); - *(size_t *)cmp = cmp_len; + ret = data[thr].ret; - /* - * Given we are writing one page at a time to disk, we copy - * that much from the buffer, although the last bit will likely - * be smaller than full page. This is OK - we saved the length - * of the compressed data, so any garbage at the end will be - * discarded when we read it. - */ - for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { - memcpy(page, cmp + off, PAGE_SIZE); + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + goto out_finish; + } - ret = swap_write_page(handle, page, &bio); - if (ret) + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(data[thr].unc_len))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; goto out_finish; + } + + *(size_t *)data[thr].cmp = data[thr].cmp_len; + + /* + * Given we are writing one page at a time to disk, we + * copy that much from the buffer, although the last + * bit will likely be smaller than full page. This is + * OK - we saved the length of the compressed data, so + * any garbage at the end will be discarded when we + * read it. + */ + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(page, data[thr].cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } } + + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); } out_finish: @@ -536,16 +736,25 @@ out_finish: do_gettimeofday(&stop); if (!ret) ret = err2; - if (!ret) + if (!ret) { printk(KERN_CONT "\b\b\b\bdone\n"); - else + } else { printk(KERN_CONT "\n"); + } swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - - vfree(cmp); - vfree(unc); - vfree(wrk); - free_page((unsigned long)page); +out_clean: + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) free_page((unsigned long)page); return ret; } @@ -625,8 +834,15 @@ out_finish: static void release_swap_reader(struct swap_map_handle *handle) { - if (handle->cur) - free_page((unsigned long)handle->cur); + struct swap_map_page_list *tmp; + + while (handle->maps) { + if (handle->maps->map) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + } handle->cur = NULL; } @@ -634,22 +850,46 @@ static int get_swap_reader(struct swap_map_handle *handle, unsigned int *flags_p) { int error; + struct swap_map_page_list *tmp, *last; + sector_t offset; *flags_p = swsusp_header->flags; if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); - if (!handle->cur) - return -ENOMEM; + handle->cur = NULL; + last = handle->maps = NULL; + offset = swsusp_header->image; + while (offset) { + tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + if (!tmp) { + release_swap_reader(handle); + return -ENOMEM; + } + memset(tmp, 0, sizeof(*tmp)); + if (!handle->maps) + handle->maps = tmp; + if (last) + last->next = tmp; + last = tmp; + + tmp->map = (struct swap_map_page *) + __get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!tmp->map) { + release_swap_reader(handle); + return -ENOMEM; + } - error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); - if (error) { - release_swap_reader(handle); - return error; + error = hib_bio_read_page(offset, tmp->map, NULL); + if (error) { + release_swap_reader(handle); + return error; + } + offset = tmp->map->next_swap; } handle->k = 0; + handle->cur = handle->maps->map; return 0; } @@ -658,6 +898,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, { sector_t offset; int error; + struct swap_map_page_list *tmp; if (!handle->cur) return -EINVAL; @@ -668,13 +909,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = hib_wait_on_bio_chain(bio_chain); handle->k = 0; - offset = handle->cur->next_swap; - if (!offset) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + if (!handle->maps) release_swap_reader(handle); - else if (!error) - error = hib_bio_read_page(offset, handle->cur, NULL); + else + handle->cur = handle->maps->map; } return error; } @@ -697,7 +940,7 @@ static int load_image(struct swap_map_handle *handle, unsigned int nr_to_read) { unsigned int m; - int error = 0; + int ret = 0; struct timeval start; struct timeval stop; struct bio *bio; @@ -713,15 +956,15 @@ static int load_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot); - if (error <= 0) + ret = snapshot_write_next(snapshot); + if (ret <= 0) break; - error = swap_read_page(handle, data_of(*snapshot), &bio); - if (error) + ret = swap_read_page(handle, data_of(*snapshot), &bio); + if (ret) break; if (snapshot->sync_read) - error = hib_wait_on_bio_chain(&bio); - if (error) + ret = hib_wait_on_bio_chain(&bio); + if (ret) break; if (!(nr_pages % m)) printk("\b\b\b\b%3d%%", nr_pages / m); @@ -729,17 +972,61 @@ static int load_image(struct swap_map_handle *handle, } err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) { + if (!ret) + ret = err2; + if (!ret) { printk("\b\b\b\bdone\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + ret = -ENODATA; } else printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - return error; + return ret; +} + +/** + * Structure used for LZO data decompression. + */ +struct dec_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start decompression */ + wait_queue_head_t done; /* decompression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ +}; + +/** + * Deompression function that runs in its own thread. + */ +static int lzo_decompress_threadfn(void *data) +{ + struct dec_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->unc_len = LZO_UNC_SIZE; + d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, + d->unc, &d->unc_len); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; } /** @@ -753,50 +1040,120 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned int nr_to_read) { unsigned int m; - int error = 0; + int ret = 0; + int eof = 0; struct bio *bio; struct timeval start; struct timeval stop; unsigned nr_pages; - size_t i, off, unc_len, cmp_len; - unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; - - for (i = 0; i < LZO_CMP_PAGES; i++) { - page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (!page[i]) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + size_t off; + unsigned i, thr, run_threads, nr_threads; + unsigned ring = 0, pg = 0, ring_size = 0, + have = 0, want, need, asked = 0; + unsigned long read_pages; + unsigned char **page = NULL; + struct dec_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for decompression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = vmalloc(sizeof(*page) * LZO_READ_PAGES); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } - while (i) - free_page((unsigned long)page[--i]); + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct dec_data, go)); - return -ENOMEM; + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the decompression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_decompress_threadfn, + &data[thr], + "image_decompress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start decompression threads\n"); + ret = -ENOMEM; + goto out_clean; } } - unc = vmalloc(LZO_UNC_SIZE); - if (!unc) { - printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); - - for (i = 0; i < LZO_CMP_PAGES; i++) - free_page((unsigned long)page[i]); - - return -ENOMEM; + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; } - cmp = vmalloc(LZO_CMP_SIZE); - if (!cmp) { - printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } - vfree(unc); - for (i = 0; i < LZO_CMP_PAGES; i++) - free_page((unsigned long)page[i]); + /* + * Adjust number of pages for read buffering, in case we are short. + */ + read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; + read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); - return -ENOMEM; + for (i = 0; i < read_pages; i++) { + page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + __GFP_WAIT | __GFP_HIGH : + __GFP_WAIT); + if (!page[i]) { + if (i < LZO_CMP_PAGES) { + ring_size = i; + printk(KERN_ERR + "PM: Failed to allocate LZO pages\n"); + ret = -ENOMEM; + goto out_clean; + } else { + break; + } + } } + want = ring_size = i; printk(KERN_INFO + "PM: Using %u thread(s) for decompression.\n" "PM: Loading and decompressing image data (%u pages) ... ", - nr_to_read); + nr_threads, nr_to_read); m = nr_to_read / 100; if (!m) m = 1; @@ -804,85 +1161,189 @@ static int load_image_lzo(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); - error = snapshot_write_next(snapshot); - if (error <= 0) + ret = snapshot_write_next(snapshot); + if (ret <= 0) goto out_finish; - for (;;) { - error = swap_read_page(handle, page[0], NULL); /* sync */ - if (error) - break; - - cmp_len = *(size_t *)page[0]; - if (unlikely(!cmp_len || - cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { - printk(KERN_ERR "PM: Invalid LZO compressed length\n"); - error = -1; - break; + for(;;) { + for (i = 0; !eof && i < want; i++) { + ret = swap_read_page(handle, page[ring], &bio); + if (ret) { + /* + * On real read error, finish. On end of data, + * set EOF flag and just exit the read loop. + */ + if (handle->cur && + handle->cur->entries[handle->k]) { + goto out_finish; + } else { + eof = 1; + break; + } + } + if (++ring >= ring_size) + ring = 0; } + asked += i; + want -= i; - for (off = PAGE_SIZE, i = 1; - off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { - error = swap_read_page(handle, page[i], &bio); - if (error) + /* + * We are out of data, wait for some more. + */ + if (!have) { + if (!asked) + break; + + ret = hib_wait_on_bio_chain(&bio); + if (ret) goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; } - error = hib_wait_on_bio_chain(&bio); /* need all data now */ - if (error) - goto out_finish; - - for (off = 0, i = 0; - off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { - memcpy(cmp + off, page[i], PAGE_SIZE); + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + crc->run_threads = 0; } - unc_len = LZO_UNC_SIZE; - error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, - unc, &unc_len); - if (error < 0) { - printk(KERN_ERR "PM: LZO decompression failed\n"); - break; + for (thr = 0; have && thr < nr_threads; thr++) { + data[thr].cmp_len = *(size_t *)page[pg]; + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + PAGE_SIZE); + if (need > have) { + if (eof > 1) { + ret = -1; + goto out_finish; + } + break; + } + + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(data[thr].cmp + off, + page[pg], PAGE_SIZE); + have--; + want++; + if (++pg >= ring_size) + pg = 0; + } + + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); } - if (unlikely(!unc_len || - unc_len > LZO_UNC_SIZE || - unc_len & (PAGE_SIZE - 1))) { - printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); - error = -1; - break; + /* + * Wait for more data while we are decompressing. + */ + if (have < LZO_CMP_PAGES && asked) { + ret = hib_wait_on_bio_chain(&bio); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; } - for (off = 0; off < unc_len; off += PAGE_SIZE) { - memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; + if (ret < 0) { + printk(KERN_ERR + "PM: LZO decompression failed\n"); + goto out_finish; + } - error = snapshot_write_next(snapshot); - if (error <= 0) + if (unlikely(!data[thr].unc_len || + data[thr].unc_len > LZO_UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR + "PM: Invalid LZO uncompressed length\n"); + ret = -1; goto out_finish; + } + + for (off = 0; + off < data[thr].unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), + data[thr].unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + + ret = snapshot_write_next(snapshot); + if (ret <= 0) { + crc->run_threads = thr + 1; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + goto out_finish; + } + } } + + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); } out_finish: + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + } do_gettimeofday(&stop); - if (!error) { + if (!ret) { printk("\b\b\b\bdone\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + ret = -ENODATA; + if (!ret) { + if (swsusp_header->flags & SF_CRC32_MODE) { + if(handle->crc32 != swsusp_header->crc32) { + printk(KERN_ERR + "PM: Invalid image CRC32!\n"); + ret = -ENODATA; + } + } + } } else printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - - vfree(cmp); - vfree(unc); - for (i = 0; i < LZO_CMP_PAGES; i++) +out_clean: + for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) vfree(page); - return error; + return ret; } /** diff --git a/kernel/power/user.c b/kernel/power/user.c index 42ddbc6f0de6..6b1ab7a88522 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -12,6 +12,7 @@ #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/reboot.h> +#include <linux/kmod.h> #include <linux/string.h> #include <linux/device.h> #include <linux/miscdevice.h> @@ -20,6 +21,7 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/fs.h> +#include <linux/compat.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> @@ -29,28 +31,6 @@ #include "power.h" -/* - * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and - * will be removed in the future. They are only preserved here for - * compatibility with existing userland utilities. - */ -#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) -#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) - -#define PMOPS_PREPARE 1 -#define PMOPS_ENTER 2 -#define PMOPS_FINISH 3 - -/* - * NOTE: The following ioctl definitions are wrong and have been replaced with - * correct ones. They are only preserved here for compatibility with existing - * userland utilities and will be removed in the future. - */ -#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) -#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) -#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) -#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) - #define SNAPSHOT_MINOR 231 @@ -70,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; - mutex_lock(&pm_mutex); + lock_system_sleep(); if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; @@ -122,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->platform_support = 0; Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error; } @@ -131,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; - mutex_lock(&pm_mutex); + lock_system_sleep(); swsusp_free(); free_basic_memory_bitmaps(); @@ -145,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return 0; } @@ -157,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; - mutex_lock(&pm_mutex); + lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -178,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return res; } @@ -190,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; - mutex_lock(&pm_mutex); + lock_system_sleep(); data = filp->private_data; @@ -207,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return res; } -static void snapshot_deprecated_ioctl(unsigned int cmd) -{ - if (printk_ratelimit()) - printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " - "be removed soon, update your suspend-to-disk " - "utilities\n", - __builtin_return_address(0), cmd); -} - static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -256,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; error = freeze_processes(); - if (error) { - thaw_processes(); + if (error) usermodehelper_enable(); - } - if (!error) + else data->frozen = 1; break; @@ -273,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->frozen = 0; break; - case SNAPSHOT_ATOMIC_SNAPSHOT: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; @@ -282,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) + if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error) - data->ready = 1; + if (!error && !freezer_test_done) + data->ready = 1; + if (freezer_test_done) { + freezer_test_done = false; + thaw_processes(); + } + } break; case SNAPSHOT_ATOMIC_RESTORE: @@ -304,8 +276,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->ready = 0; break; - case SNAPSHOT_SET_IMAGE_SIZE: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; @@ -320,16 +290,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_AVAIL_SWAP: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_GET_SWAP_PAGE: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; @@ -352,27 +318,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, free_all_swap_pages(data->swap); break; - case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ - snapshot_deprecated_ioctl(cmd); - if (!swsusp_swap_in_use()) { - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg), - 0, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } else { - error = -EPERM; - } - break; - case SNAPSHOT_S2RAM: if (!data->frozen) { error = -EPERM; @@ -395,33 +340,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = hibernation_platform_enter(); break; - case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ - snapshot_deprecated_ioctl(cmd); - error = -EINVAL; - - switch (arg) { - - case PMOPS_PREPARE: - data->platform_support = 1; - error = 0; - break; - - case PMOPS_ENTER: - if (data->platform_support) - error = hibernation_platform_enter(); - break; - - case PMOPS_FINISH: - if (data->platform_support) - error = 0; - break; - - default: - printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - - } - break; - case SNAPSHOT_SET_SWAP_AREA: if (swsusp_swap_in_use()) { error = -EPERM; @@ -463,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, return error; } +#ifdef CONFIG_COMPAT + +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; + +static long +snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); + + switch (cmd) { + case SNAPSHOT_GET_IMAGE_SIZE: + case SNAPSHOT_AVAIL_SWAP_SIZE: + case SNAPSHOT_ALLOC_SWAP_PAGE: { + compat_loff_t __user *uoffset = compat_ptr(arg); + loff_t offset; + mm_segment_t old_fs; + int err; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, cmd, (unsigned long) &offset); + set_fs(old_fs); + if (!err && put_user(offset, uoffset)) + err = -EFAULT; + return err; + } + + case SNAPSHOT_CREATE_IMAGE: + return snapshot_ioctl(file, cmd, + (unsigned long) compat_ptr(arg)); + + case SNAPSHOT_SET_SWAP_AREA: { + struct compat_resume_swap_area __user *u_swap_area = + compat_ptr(arg); + struct resume_swap_area swap_area; + mm_segment_t old_fs; + int err; + + err = get_user(swap_area.offset, &u_swap_area->offset); + err |= get_user(swap_area.dev, &u_swap_area->dev); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, + (unsigned long) &swap_area); + set_fs(old_fs); + return err; + } + + default: + return snapshot_ioctl(file, cmd, arg); + } +} + +#endif /* CONFIG_COMPAT */ + static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, @@ -470,6 +448,9 @@ static const struct file_operations snapshot_fops = { .write = snapshot_write, .llseek = no_llseek, .unlocked_ioctl = snapshot_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = snapshot_compat_ioctl, +#endif }; static struct miscdevice snapshot_device = { diff --git a/kernel/printk.c b/kernel/printk.c index 28a40d8171b8..13c0a1143f49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -100,7 +100,7 @@ static int console_locked, console_suspended; * It is also used in interesting ways to provide interlocking in * console_unlock();. */ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -199,7 +199,7 @@ void __init setup_log_buf(int early) unsigned long mem; mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (mem == MEMBLOCK_ERROR) + if (!mem) return; new_log_buf = __va(mem); } else { @@ -212,7 +212,7 @@ void __init setup_log_buf(int early) return; } - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; @@ -230,7 +230,7 @@ void __init setup_log_buf(int early) log_start -= offset; con_start -= offset; log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); pr_info("log_buf_len: %d\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (error) goto out; i = 0; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (!error && (log_start != log_end) && i < len) { c = LOG_BUF(log_start); log_start++; - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; i++; cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (!error) error = i; break; @@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) count = len; if (count > log_buf_len) count = log_buf_len; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (count > logged_chars) count = logged_chars; if (do_clear) @@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (j + log_buf_len < log_end) break; c = LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (error) break; error = i; @@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end) } } -static int __read_mostly ignore_loglevel; +static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { @@ -532,6 +532,9 @@ static int __init ignore_loglevel_setup(char *str) } early_param("ignore_loglevel", ignore_loglevel_setup); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" + "print all kernel messages to the console."); /* * Write out chars from start to end - 1 inclusive @@ -592,9 +595,6 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special) /* multi digit including the level and facility number */ char *endp = NULL; - if (p[1] < '0' && p[1] > '9') - return 0; - lev = (simple_strtoul(&p[1], &endp, 10) & 7); if (endp == NULL || endp[0] != '>') return 0; @@ -688,16 +688,17 @@ static void zap_locks(void) oops_timestamp = jiffies; + debug_locks_off(); /* If a crash is occurring, make sure we can't deadlock */ - spin_lock_init(&logbuf_lock); + raw_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ sema_init(&console_sem, 1); } #if defined(CONFIG_PRINTK_TIME) -static int printk_time = 1; +static bool printk_time = 1; #else -static int printk_time = 0; +static bool printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); @@ -802,9 +803,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); if (wake) up(&console_sem); + raw_spin_unlock(&logbuf_lock); return retval; } static const char recursion_bug_msg [] = @@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) boot_delay_msec(); printk_delay(); - preempt_disable(); /* This stops the holder of console_sem just where we want him */ - raw_local_irq_save(flags); + local_irq_save(flags); this_cpu = smp_processor_id(); /* @@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * recursion and return - but flag the recursion so that * it can be printed at the next appropriate moment: */ - if (!oops_in_progress) { + if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; goto out_restore_irqs; } @@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) } lockdep_off(); - spin_lock(&logbuf_lock); + raw_spin_lock(&logbuf_lock); printk_cpu = this_cpu; if (recursion_bug) { @@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) lockdep_on(); out_restore_irqs: - raw_local_irq_restore(flags); + local_irq_restore(flags); - preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); @@ -1099,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -int console_suspend_enabled = 1; +bool console_suspend_enabled = 1; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) @@ -1108,6 +1107,10 @@ static int __init console_suspend_disable(char *str) return 1; } __setup("no_console_suspend", console_suspend_disable); +module_param_named(console_suspend, console_suspend_enabled, + bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(console_suspend, "suspend console during suspend" + " and hibernate operations"); /** * suspend_console - suspend the console subsystem @@ -1257,14 +1260,14 @@ void console_unlock(void) again: for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); start_critical_timings(); @@ -1276,7 +1279,7 @@ again: if (unlikely(exclusive_console)) exclusive_console = NULL; - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); up(&console_sem); @@ -1286,10 +1289,11 @@ again: * there's a new owner and the console_unlock() from them will do the * flush, no worries. */ - spin_lock(&logbuf_lock); + raw_spin_lock(&logbuf_lock); if (con_start != log_end) retry = 1; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + if (retry && console_trylock()) goto again; @@ -1522,9 +1526,9 @@ void register_console(struct console *newcon) * console_unlock(); will print out the buffered messages * for us. */ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -1731,10 +1735,10 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); end = log_end & LOG_BUF_MASK; chars = logged_chars; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (chars > end) { s1 = log_buf + log_buf_len - chars + end; diff --git a/kernel/profile.c b/kernel/profile.c index 961b389fe52f..76b8e77773ee 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -13,7 +13,7 @@ * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/profile.h> #include <linux/bootmem.h> #include <linux/notifier.h> diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c890ac9a7962..00ab2ca5ed11 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -8,7 +8,7 @@ */ #include <linux/capability.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> @@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child) */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count)) + child->signal->group_stop_count)) { child->jobctl |= JOBCTL_STOP_PENDING; + /* + * This is only possible if this thread was cloned by the + * traced task running in the stopped group, set the signal + * for the future reports. + * FIXME: we should change ptrace_init_task() to handle this + * case. + */ + if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) + child->jobctl |= SIGSTOP; + } + /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * @child in the butt. Note that @resume should be used iff @child diff --git a/kernel/range.c b/kernel/range.c index 37fa9b99ad58..9b8ae2d6ed68 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -1,7 +1,7 @@ /* * Range add and subtract */ -#include <linux/module.h> +#include <linux/kernel.h> #include <linux/init.h> #include <linux/sort.h> diff --git a/kernel/rcu.h b/kernel/rcu.h new file mode 100644 index 000000000000..aa88baab5f78 --- /dev/null +++ b/kernel/rcu.h @@ -0,0 +1,92 @@ +/* + * Read-Copy Update definitions shared among RCU implementations. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2011 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#ifndef __LINUX_RCU_H +#define __LINUX_RCU_H + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(stmt) stmt +#else /* #ifdef CONFIG_RCU_TRACE */ +#define RCU_TRACE(stmt) +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* + * Process-level increment to ->dynticks_nesting field. This allows for + * architectures that use half-interrupts and half-exceptions from + * process context. + */ +#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) + +/* + * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally + * by call_rcu() and rcu callback execution, and are therefore not part of the + * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. + */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +# define STATE_RCU_HEAD_READY 0 +# define STATE_RCU_HEAD_QUEUED 1 + +extern struct debug_obj_descr rcuhead_debug_descr; + +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ + WARN_ON_ONCE((unsigned long)head & 0x3); + debug_object_activate(head, &rcuhead_debug_descr); + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_READY, + STATE_RCU_HEAD_QUEUED); +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_QUEUED, + STATE_RCU_HEAD_READY); + debug_object_deactivate(head, &rcuhead_debug_descr); +} +#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ +} +#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +extern void kfree(const void *); + +static inline void __rcu_reclaim(char *rn, struct rcu_head *head) +{ + unsigned long offset = (unsigned long)head->func; + + if (__is_kfree_rcu_offset(offset)) { + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + kfree((void *)head - offset); + } else { + RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + head->func(head); + } +} + +#endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ddddb320be61..2bc4e135ff23 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -43,9 +43,14 @@ #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/hardirq.h> +#define CREATE_TRACE_POINTS +#include <trace/events/rcu.h> + +#include "rcu.h" + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = @@ -88,17 +93,24 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; + /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. */ -void wakeme_after_rcu(struct rcu_head *head) +static void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -106,6 +118,20 @@ void wakeme_after_rcu(struct rcu_head *head) complete(&rcu->completion); } +void wait_rcu_gp(call_rcu_func_t crf) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + crf(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(wait_rcu_gp); + #ifdef CONFIG_PROVE_RCU /* * wrapper function to avoid #include problems. @@ -292,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = { }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) +{ + trace_rcu_torture_read(rcutorturename, rhp); +} +EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#endif diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 7bbac7d0f5ab..977296dca0a4 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -22,13 +22,12 @@ * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ -#include <linux/moduleparam.h> #include <linux/completion.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mutex.h> #include <linux/sched.h> #include <linux/types.h> @@ -37,47 +36,154 @@ #include <linux/cpu.h> #include <linux/prefetch.h> -/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ -static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); -static unsigned long have_rcu_kthread_work; +#ifdef CONFIG_RCU_TRACE +#include <trace/events/rcu.h> +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +#include "rcu.h" /* Forward declarations for rcutiny_plugin.h. */ struct rcu_ctrlblk; -static void invoke_rcu_kthread(void); -static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static int rcu_kthread(void *arg); +static void invoke_rcu_callbacks(void); +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); #include "rcutiny_plugin.h" -#ifdef CONFIG_NO_HZ +static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; -static long rcu_dynticks_nesting = 1; +/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ +static void rcu_idle_enter_common(long long oldval) +{ + if (rcu_dynticks_nesting) { + RCU_TRACE(trace_rcu_dyntick("--=", + oldval, rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); + if (!is_idle_task(current)) { + struct task_struct *idle = idle_task(smp_processor_id()); + + RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", + oldval, rcu_dynticks_nesting)); + ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ +} /* - * Enter dynticks-idle mode, which is an extended quiescent state - * if we have fully entered that mode (i.e., if the new value of - * dynticks_nesting is zero). + * Enter idle, which is an extended quiescent state if we have fully + * entered that mode (i.e., if the new value of dynticks_nesting is zero). */ -void rcu_enter_nohz(void) +void rcu_idle_enter(void) { - if (--rcu_dynticks_nesting == 0) - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; + rcu_dynticks_nesting = 0; + rcu_idle_enter_common(oldval); + local_irq_restore(flags); } /* - * Exit dynticks-idle mode, so that we are no longer in an extended - * quiescent state. + * Exit an interrupt handler towards idle. + */ +void rcu_irq_exit(void) +{ + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; + rcu_dynticks_nesting--; + WARN_ON_ONCE(rcu_dynticks_nesting < 0); + rcu_idle_enter_common(oldval); + local_irq_restore(flags); +} + +/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ +static void rcu_idle_exit_common(long long oldval) +{ + if (oldval) { + RCU_TRACE(trace_rcu_dyntick("++=", + oldval, rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); + if (!is_idle_task(current)) { + struct task_struct *idle = idle_task(smp_processor_id()); + + RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", + oldval, rcu_dynticks_nesting)); + ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } +} + +/* + * Exit idle, so that we are no longer in an extended quiescent state. */ -void rcu_exit_nohz(void) +void rcu_idle_exit(void) { + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; + WARN_ON_ONCE(oldval != 0); + rcu_dynticks_nesting = DYNTICK_TASK_NESTING; + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} + +/* + * Enter an interrupt handler, moving away from idle. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; rcu_dynticks_nesting++; + WARN_ON_ONCE(rcu_dynticks_nesting == 0); + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} + +#ifdef CONFIG_PROVE_RCU + +/* + * Test whether RCU thinks that the current CPU is idle. + */ +int rcu_is_cpu_idle(void) +{ + return !rcu_dynticks_nesting; } +EXPORT_SYMBOL(rcu_is_cpu_idle); -#endif /* #ifdef CONFIG_NO_HZ */ +#endif /* #ifdef CONFIG_PROVE_RCU */ + +/* + * Test whether the current CPU was interrupted from idle. Nested + * interrupts don't count, we must be running at the first interrupt + * level. + */ +int rcu_is_cpu_rrupt_from_idle(void) +{ + return rcu_dynticks_nesting <= 0; +} /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). @@ -96,16 +202,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) } /* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. - */ -static void invoke_rcu_kthread(void) -{ - have_rcu_kthread_work = 1; - wake_up(&rcu_kthread_wq); -} - -/* * Record an rcu quiescent state. And an rcu_bh quiescent state while we * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. @@ -117,7 +213,7 @@ void rcu_sched_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); local_irq_restore(flags); } @@ -130,20 +226,19 @@ void rcu_bh_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); local_irq_restore(flags); } /* * Check to see if the scheduling-clock interrupt came from an extended - * quiescent state, and, if so, tell RCU about it. + * quiescent state, and, if so, tell RCU about it. This function must + * be called from hardirq context. It is normally called from the + * scheduling-clock interrupt. */ void rcu_check_callbacks(int cpu, int user) { - if (user || - (idle_cpu(cpu) && - !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) + if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); @@ -154,18 +249,27 @@ void rcu_check_callbacks(int cpu, int user) * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure * whose grace period has elapsed. */ -static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { + char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; RCU_TRACE(int cb_count = 0); /* If no RCU callbacks ready to invoke, just return. */ - if (&rcp->rcucblist == rcp->donetail) + if (&rcp->rcucblist == rcp->donetail) { + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, + ACCESS_ONCE(rcp->rcucblist), + need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread())); return; + } /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -176,49 +280,28 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ + RCU_TRACE(rn = rcp->name); while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim(list); + __rcu_reclaim(rn, list); local_bh_enable(); list = next; RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread())); } -/* - * This kthread invokes RCU callbacks whose grace periods have - * elapsed. It is awakened as needed, and takes the place of the - * RCU_SOFTIRQ that was used previously for this purpose. - * This is a kthread, but it is never stopped, at least not until - * the system goes down. - */ -static int rcu_kthread(void *arg) +static void rcu_process_callbacks(struct softirq_action *unused) { - unsigned long work; - unsigned long morework; - unsigned long flags; - - for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); - morework = rcu_boost(); - local_irq_save(flags); - work = have_rcu_kthread_work; - have_rcu_kthread_work = morework; - local_irq_restore(flags); - if (work) { - rcu_process_callbacks(&rcu_sched_ctrlblk); - rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); - } - schedule_timeout_interruptible(1); /* Leave CPU for others. */ - } - - return 0; /* Not reached, but needed to shut gcc up. */ + __rcu_process_callbacks(&rcu_sched_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -280,45 +363,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); - -void rcu_barrier_bh(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -void rcu_barrier_sched(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -/* - * Spawn the kthread that invokes RCU callbacks. - */ -static int __init rcu_spawn_kthreads(void) -{ - struct sched_param sp; - - rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); - sp.sched_priority = RCU_BOOST_PRIO; - sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); - return 0; -} -early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f259c676195f..9cb1ae4aabdd 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -23,32 +23,30 @@ */ #include <linux/kthread.h> +#include <linux/module.h> #include <linux/debugfs.h> #include <linux/seq_file.h> -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ RCU_TRACE(long qlen); /* Number of pending CBs. */ + RCU_TRACE(char *name); /* Name of RCU type. */ }; /* Definition for rcupdate control block. */ static struct rcu_ctrlblk rcu_sched_ctrlblk = { .donetail = &rcu_sched_ctrlblk.rcucblist, .curtail = &rcu_sched_ctrlblk.rcucblist, + RCU_TRACE(.name = "rcu_sched") }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .donetail = &rcu_bh_ctrlblk.rcucblist, .curtail = &rcu_bh_ctrlblk.rcucblist, + RCU_TRACE(.name = "rcu_bh") }; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -131,6 +129,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), + RCU_TRACE(.rcb.name = "rcu_preempt") }; static int rcu_preempted_readers_exp(void); @@ -247,6 +246,13 @@ static void show_tiny_preempt_stats(struct seq_file *m) #include "rtmutex_common.h" +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO + +/* Controls for rcu_kthread() kthread. */ +static struct task_struct *rcu_kthread_task; +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static unsigned long have_rcu_kthread_work; + /* * Carry out RCU priority boosting on the task indicated by ->boost_tasks, * and advance ->boost_tasks to the next task in the ->blkd_tasks list. @@ -306,8 +312,8 @@ static int rcu_boost(void) rt_mutex_lock(&mtx); rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - return rcu_preempt_ctrlblk.boost_tasks != NULL || - rcu_preempt_ctrlblk.exp_tasks != NULL; + return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || + ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; } /* @@ -334,7 +340,7 @@ static int rcu_initiate_boost(void) if (rcu_preempt_ctrlblk.exp_tasks == NULL) rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; - invoke_rcu_kthread(); + invoke_rcu_callbacks(); } else RCU_TRACE(rcu_initiate_boost_trace()); return 1; @@ -353,14 +359,6 @@ static void rcu_preempt_boost_start_gp(void) #else /* #ifdef CONFIG_RCU_BOOST */ /* - * If there is no RCU priority boosting, we don't boost. - */ -static int rcu_boost(void) -{ - return 0; -} - -/* * If there is no RCU priority boosting, we don't initiate boosting, * but we do indicate whether there are blocked readers blocking the * current grace period. @@ -427,7 +425,7 @@ static void rcu_preempt_cpu_qs(void) /* If there are done callbacks, cause them to be invoked. */ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); } /* @@ -648,7 +646,7 @@ static void rcu_preempt_check_callbacks(void) rcu_preempt_cpu_qs(); if (&rcu_preempt_ctrlblk.rcb.rcucblist != rcu_preempt_ctrlblk.rcb.donetail) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && rcu_preempt_running_reader()) @@ -674,7 +672,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_preempt_process_callbacks(void) { - rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); } /* @@ -697,20 +695,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - /* * synchronize_rcu - wait until a grace period has elapsed. * @@ -864,15 +848,6 @@ static void show_tiny_preempt_stats(struct seq_file *m) #endif /* #ifdef CONFIG_RCU_TRACE */ /* - * Because preemptible RCU does not exist, it is never necessary to - * boost preempted RCU readers. - */ -static int rcu_boost(void) -{ - return 0; -} - -/* * Because preemptible RCU does not exist, it never has any callbacks * to check. */ @@ -898,6 +873,103 @@ static void rcu_preempt_process_callbacks(void) #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + +/* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. + */ +static void invoke_rcu_callbacks(void) +{ + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); +} + +#ifdef CONFIG_RCU_TRACE + +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return rcu_kthread_task == current; +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * This kthread invokes RCU callbacks whose grace periods have + * elapsed. It is awakened as needed, and takes the place of the + * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. + * This is a kthread, but it is never stopped, at least not until + * the system goes down. + */ +static int rcu_kthread(void *arg) +{ + unsigned long work; + unsigned long morework; + unsigned long flags; + + for (;;) { + wait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); + morework = rcu_boost(); + local_irq_save(flags); + work = have_rcu_kthread_work; + have_rcu_kthread_work = morework; + local_irq_restore(flags); + if (work) + rcu_process_callbacks(NULL); + schedule_timeout_interruptible(1); /* Leave CPU for others. */ + } + + return 0; /* Not reached, but needed to shut gcc up. */ +} + +/* + * Spawn the kthread that invokes RCU callbacks. + */ +static int __init rcu_spawn_kthreads(void) +{ + struct sched_param sp; + + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); + sp.sched_priority = RCU_BOOST_PRIO; + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); + return 0; +} +early_initcall(rcu_spawn_kthreads); + +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Start up softirq processing of callbacks. + */ +void invoke_rcu_callbacks(void) +{ + raise_softirq(RCU_SOFTIRQ); +} + +#ifdef CONFIG_RCU_TRACE + +/* + * There is no callback kthread, so this thread is never it. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return false; +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +void rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include <linux/kernel_stat.h> @@ -913,12 +985,6 @@ void __init rcu_scheduler_starting(void) #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#ifdef CONFIG_RCU_BOOST -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO -#else /* #ifdef CONFIG_RCU_BOOST */ -#define RCU_BOOST_PRIO 1 -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - #ifdef CONFIG_RCU_TRACE #ifdef CONFIG_RCU_BOOST diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 98f51b13bb7e..88f17b8a3b1d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ -static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ -static int fqs_holdoff = 0; /* Hold time within burst (us). */ +static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ +static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ +static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ @@ -73,7 +75,7 @@ module_param(nreaders, int, 0444); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); module_param(nfakewriters, int, 0444); MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0444); +module_param(stat_interval, int, 0644); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); module_param(verbose, bool, 0444); MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); @@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +module_param(shutdown_secs, int, 0444); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); module_param(test_boost, int, 0444); MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); module_param(test_boost_interval, int, 0444); @@ -119,6 +125,10 @@ static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; +static struct task_struct *shutdown_task; +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *onoff_task; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ #define RCU_TORTURE_PIPE_LEN 10 @@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; +static long n_offline_attempts; +static long n_offline_successes; +static long n_online_attempts; +static long n_online_successes; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -160,6 +174,8 @@ static int stutter_pause_test; #define RCUTORTURE_RUNNABLE_INIT 0 #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(rcutorture_runnable, int, 0444); +MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 @@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; #define rcu_can_boost() 0 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD; */ static DEFINE_MUTEX(fullstop_mutex); +/* Forward reference. */ +static void rcu_torture_cleanup(void); + /* * Detect and respond to a system shutdown. */ @@ -480,30 +500,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); } -struct rcu_bh_torture_synchronize { - struct rcu_head head; - struct completion completion; -}; - -static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) -{ - struct rcu_bh_torture_synchronize *rcu; - - rcu = container_of(head, struct rcu_bh_torture_synchronize, head); - complete(&rcu->completion); -} - -static void rcu_bh_torture_synchronize(void) -{ - struct rcu_bh_torture_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} - static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, .cleanup = NULL, @@ -512,7 +508,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, + .sync = synchronize_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -528,7 +524,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, + .sync = synchronize_rcu_bh, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -536,6 +532,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .name = "rcu_bh_sync" }; +static struct rcu_torture_ops rcu_bh_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu_bh_expedited, + .cb_barrier = NULL, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh_expedited" +}; + /* * Definitions for srcu torture testing. */ @@ -620,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) +{ + return srcu_read_lock_raw(&srcu_ctl); +} + +static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) +{ + srcu_read_unlock_raw(&srcu_ctl, idx); +} + +static struct rcu_torture_ops srcu_raw_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock_raw, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock_raw, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_raw" +}; + static void srcu_torture_synchronize_expedited(void) { synchronize_srcu_expedited(&srcu_ctl); @@ -659,11 +695,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); } -static void sched_torture_synchronize(void) -{ - synchronize_sched(); -} - static struct rcu_torture_ops sched_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, @@ -672,7 +703,7 @@ static struct rcu_torture_ops sched_ops = { .readunlock = sched_torture_read_unlock, .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, - .sync = sched_torture_synchronize, + .sync = synchronize_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -688,7 +719,7 @@ static struct rcu_torture_ops sched_sync_ops = { .readunlock = sched_torture_read_unlock, .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, - .sync = sched_torture_synchronize, + .sync = synchronize_sched, .cb_barrier = NULL, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -754,7 +785,7 @@ static int rcu_torture_boost(void *arg) do { /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (jiffies - oldstarttime > ULONG_MAX / 2) { + while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_uninterruptible(1); rcu_stutter_wait("rcu_torture_boost"); if (kthread_should_stop() || @@ -765,7 +796,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (jiffies - endtime > ULONG_MAX / 2) { + while (ULONG_CMP_LT(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!rbi.inflight) { smp_mb(); /* RCU core before ->inflight = 1. */ @@ -792,7 +823,8 @@ static int rcu_torture_boost(void *arg) * interval. Besides, we are running at RT priority, * so delays should be relatively rare. */ - while (oldstarttime == boost_starttime) { + while (oldstarttime == boost_starttime && + !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { boost_starttime = jiffies + test_boost_interval * HZ; @@ -809,11 +841,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. */ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - destroy_rcu_head_on_stack(&rbi.rcu); rcutorture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + destroy_rcu_head_on_stack(&rbi.rcu); return 0; } @@ -831,11 +863,13 @@ rcu_torture_fqs(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (jiffies - fqs_resume_time > LONG_MAX) { + while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + !kthread_should_stop()) { schedule_timeout_interruptible(1); } fqs_burst_remaining = fqs_duration; - while (fqs_burst_remaining > 0) { + while (fqs_burst_remaining > 0 && + !kthread_should_stop()) { cur_ops->fqs(); udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; @@ -923,6 +957,18 @@ rcu_torture_fakewriter(void *arg) return 0; } +void rcutorture_trace_dump(void) +{ + static atomic_t beenhere = ATOMIC_INIT(0); + + if (atomic_read(&beenhere)) + return; + if (atomic_xchg(&beenhere, 1) != 0) + return; + do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); + ftrace_dump(DUMP_ALL); +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -944,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -961,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } + if (pipe_count > 1) + rcutorture_trace_dump(); __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { @@ -1004,6 +1053,7 @@ rcu_torture_reader(void *arg) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1019,6 +1069,8 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } + if (pipe_count > 1) + rcutorture_trace_dump(); __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { @@ -1066,7 +1118,8 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld", + "rtbf: %ld rtb: %ld nt: %ld " + "onoff: %ld/%ld:%ld/%ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -1078,7 +1131,11 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_rterror, n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers); + n_rcu_torture_timers, + n_online_successes, + n_online_attempts, + n_offline_successes, + n_offline_attempts); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || @@ -1242,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d\n", + "test_boost_duration=%d shutdown_secs=%d " + "onoff_interval=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration); + test_boost_interval, test_boost_duration, shutdown_secs, + onoff_interval); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1280,8 +1339,9 @@ static int rcutorture_booster_init(int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, + cpu_to_node(cpu), + "rcu_torture_boost"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); @@ -1296,6 +1356,131 @@ static int rcutorture_booster_init(int cpu) return 0; } +/* + * Cause the rcutorture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs module parameter. + */ +static int +rcu_torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); + jiffies_snap = ACCESS_ONCE(jiffies); + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !kthread_should_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_shutdown task: %lu " + "jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = ACCESS_ONCE(jiffies); + } + if (kthread_should_stop()) { + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); + return 0; + } + + /* OK, shut down the system. */ + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + rcu_torture_cleanup(); /* Get the success/failure message. */ + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +rcu_torture_onoff(void *arg) +{ + int cpu; + int maxcpu = -1; + DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + while (!kthread_should_stop()) { + cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_onoff task: offlining %d\n", + torture_type, cpu); + n_offline_attempts++; + if (cpu_down(cpu) == 0) { + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_onoff task: " + "offlined %d\n", + torture_type, cpu); + n_offline_successes++; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_onoff task: onlining %d\n", + torture_type, cpu); + n_online_attempts++; + if (cpu_up(cpu) == 0) { + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_onoff task: " + "onlined %d\n", + torture_type, cpu); + n_online_successes++; + } + } + schedule_timeout_interruptible(onoff_interval * HZ); + } + VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); + return 0; +} + +static int +rcu_torture_onoff_init(void) +{ + if (onoff_interval <= 0) + return 0; + onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); + if (IS_ERR(onoff_task)) { + onoff_task = NULL; + return PTR_ERR(onoff_task); + } + return 0; +} + +static void rcu_torture_onoff_cleanup(void) +{ + if (onoff_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); + kthread_stop(onoff_task); +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void +rcu_torture_onoff_init(void) +{ +} + +static void rcu_torture_onoff_cleanup(void) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + static int rcutorture_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1400,6 +1585,11 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } + if (shutdown_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); + kthread_stop(shutdown_task); + } + rcu_torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ @@ -1424,8 +1614,8 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, - &rcu_bh_ops, &rcu_bh_sync_ops, - &srcu_ops, &srcu_expedited_ops, + &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, + &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); @@ -1616,6 +1806,18 @@ rcu_torture_init(void) } } } + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_run(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); + if (IS_ERR(shutdown_task)) { + firsterr = PTR_ERR(shutdown_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + goto unwind; + } + } + rcu_torture_onoff_init(); register_reboot_notifier(&rcutorture_shutdown_nb); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ba06207b1dd3..6c4a6722abfd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -38,7 +38,7 @@ #include <linux/nmi.h> #include <linux/atomic.h> #include <linux/bitops.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/completion.h> #include <linux/moduleparam.h> #include <linux/percpu.h> @@ -52,13 +52,16 @@ #include <linux/prefetch.h> #include "rcutree.h" +#include <trace/events/rcu.h> + +#include "rcu.h" /* Data structures. */ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; #define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname.node[0] }, \ + .level = { &structname##_state.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. */ \ NUM_RCU_LVL_1, \ @@ -66,20 +69,20 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; NUM_RCU_LVL_3, \ NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_GP_IDLE, \ + .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ .name = #structname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); +struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); -#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ - /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -156,44 +157,50 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) * Note a quiescent state. Because we do not need to know * how many quiescent states passed, just if there was at least * one since the start of the grace period, this just sets a flag. + * The caller must have disabled preemption. */ void rcu_sched_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; } void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. + * The caller must have disabled preemption. */ void rcu_note_context_switch(int cpu) { + trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); + trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -#ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = 1, + .dynticks_nesting = DYNTICK_TASK_NESTING, .dynticks = ATOMIC_INIT(1), }; -#endif /* #ifdef CONFIG_NO_HZ */ -static int blimit = 10; /* Maximum callbacks per softirq. */ +static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -314,15 +321,16 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) * trust its state not to change because interrupts are disabled. */ if (cpu_is_offline(rdp->cpu)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); rdp->offline_fqs++; return 1; } - /* If preemptible RCU, no point in sending reschedule IPI. */ - if (rdp->preemptible) - return 0; - - /* The CPU is online, so send it a reschedule IPI. */ + /* + * The CPU is online, so send it a reschedule IPI. This forces + * it through the scheduler, and (inefficiently) also handles cases + * where idle loops fail to inform RCU about the CPU being idle. + */ if (rdp->cpu != smp_processor_id()) smp_send_reschedule(rdp->cpu); else @@ -333,64 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -#ifdef CONFIG_NO_HZ +/* + * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle + * + * If the new value of the ->dynticks_nesting counter now is zero, + * we really have entered idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) +{ + trace_rcu_dyntick("Start", oldval, 0); + if (!is_idle_task(current)) { + struct task_struct *idle = idle_task(smp_processor_id()); + + trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); + ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } + rcu_prepare_for_idle(smp_processor_id()); + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); +} /** - * rcu_enter_nohz - inform RCU that current CPU is entering nohz + * rcu_idle_enter - inform RCU that current CPU is entering idle * - * Enter nohz mode, in other words, -leave- the mode in which RCU + * Enter idle mode, in other words, -leave- the mode in which RCU * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in nohz mode, a possibility - * handled by rcu_irq_enter() and rcu_irq_exit()). + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) + * + * We crowbar the ->dynticks_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ -void rcu_enter_nohz(void) +void rcu_idle_enter(void) { unsigned long flags; + long long oldval; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (--rdtp->dynticks_nesting) { - local_irq_restore(flags); - return; - } - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + oldval = rdtp->dynticks_nesting; + rdtp->dynticks_nesting = 0; + rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); - - /* If the interrupt queued a callback, get out of dyntick mode. */ - if (in_irq() && - (__get_cpu_var(rcu_sched_data).nxtlist || - __get_cpu_var(rcu_bh_data).nxtlist || - rcu_preempt_needs_cpu(smp_processor_id()))) - set_need_resched(); } -/* - * rcu_exit_nohz - inform RCU that current CPU is leaving nohz +/** + * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle + * + * Exit from an interrupt handler, which might possibly result in entering + * idle mode, in other words, leaving the mode in which read-side critical + * sections can occur. + * + * This code assumes that the idle loop never does anything that might + * result in unbalanced calls to irq_enter() and irq_exit(). If your + * architecture violates this assumption, RCU will give you what you + * deserve, good and hard. But very infrequently and irreproducibly. * - * Exit nohz mode, in other words, -enter- the mode in which RCU - * read-side critical sections normally occur. + * Use things like work queues to work around this limitation. + * + * You have been warned. */ -void rcu_exit_nohz(void) +void rcu_irq_exit(void) { unsigned long flags; + long long oldval; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks_nesting++) { - local_irq_restore(flags); - return; - } + oldval = rdtp->dynticks_nesting; + rdtp->dynticks_nesting--; + WARN_ON_ONCE(rdtp->dynticks_nesting < 0); + if (rdtp->dynticks_nesting) + trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); + else + rcu_idle_enter_common(rdtp, oldval); + local_irq_restore(flags); +} + +/* + * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle + * + * If the new value of the ->dynticks_nesting counter was previously zero, + * we really have exited idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) +{ smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_cleanup_after_idle(smp_processor_id()); + trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); + if (!is_idle_task(current)) { + struct task_struct *idle = idle_task(smp_processor_id()); + + trace_rcu_dyntick("Error on exit: not idle task", + oldval, rdtp->dynticks_nesting); + ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } +} + +/** + * rcu_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to + * allow for the possibility of usermode upcalls messing up our count + * of interrupt nesting level during the busy period that is just + * now starting. + */ +void rcu_idle_exit(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE(oldval != 0); + rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; + rcu_idle_exit_common(rdtp, oldval); + local_irq_restore(flags); +} + +/** + * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle + * + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. + * + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to + * user mode! This code assumes that the idle loop never does upcalls to + * user mode. If your architecture does do upcalls from the idle loop (or + * does anything else that results in unbalanced calls to the irq_enter() + * and irq_exit() functions), RCU will give you what you deserve, good + * and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + rdtp->dynticks_nesting++; + WARN_ON_ONCE(rdtp->dynticks_nesting == 0); + if (oldval) + trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); + else + rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } @@ -437,27 +562,37 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } +#ifdef CONFIG_PROVE_RCU + /** - * rcu_irq_enter - inform RCU of entry to hard irq context + * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * - * If the CPU was idle with dynamic ticks active, this updates the - * rdtp->dynticks to let the RCU handling know that the CPU is active. + * If the current CPU is in its idle loop and is neither in an interrupt + * or NMI handler, return true. */ -void rcu_irq_enter(void) +int rcu_is_cpu_idle(void) { - rcu_exit_nohz(); + int ret; + + preempt_disable(); + ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + preempt_enable(); + return ret; } +EXPORT_SYMBOL(rcu_is_cpu_idle); + +#endif /* #ifdef CONFIG_PROVE_RCU */ /** - * rcu_irq_exit - inform RCU of exit from hard irq context + * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle * - * If the CPU was idle with dynamic ticks active, update the rdp->dynticks - * to put let the RCU handling be aware that the CPU is going back to idle - * with no ticks. + * If the current CPU is idle or running at a first-level (not nested) + * interrupt from idle, return true. The caller must have at least + * disabled preemption. */ -void rcu_irq_exit(void) +int rcu_is_cpu_rrupt_from_idle(void) { - rcu_enter_nohz(); + return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } #ifdef CONFIG_SMP @@ -470,7 +605,7 @@ void rcu_irq_exit(void) static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); - return 0; + return (rdp->dynticks_snap & 0x1) == 0; } /* @@ -481,11 +616,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { - unsigned long curr; - unsigned long snap; + unsigned int curr; + unsigned int snap; - curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); - snap = (unsigned long)rdp->dynticks_snap; + curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); + snap = (unsigned int)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -495,7 +630,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { + if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); rdp->dynticks_fqs++; return 1; } @@ -506,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -#else /* #ifdef CONFIG_NO_HZ */ - -#ifdef CONFIG_SMP - -static int dyntick_save_progress_counter(struct rcu_data *rdp) -{ - return 0; -} - -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) -{ - return rcu_implicit_offline_qs(rdp); -} - -#endif /* #ifdef CONFIG_SMP */ - -#endif /* #else #ifdef CONFIG_NO_HZ */ - -int rcu_cpu_stall_suppress __read_mostly; - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; @@ -537,6 +653,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) int cpu; long delta; unsigned long flags; + int ndetected; struct rcu_node *rnp = rcu_get_root(rsp); /* Only let one CPU complain about others per time interval. */ @@ -553,7 +670,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * Now rat on any tasks that got kicked up to the root rcu_node * due to CPU offlining. */ - rcu_print_task_stall(rnp); + ndetected = rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -565,17 +682,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rsp->name); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); - rcu_print_task_stall(rnp); + ndetected += rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (rnp->qsmask == 0) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) + if (rnp->qsmask & (1UL << cpu)) { printk(" %d", rnp->grplo + cpu); + ndetected++; + } } printk("} (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); - trigger_all_cpu_backtrace(); + if (ndetected == 0) + printk(KERN_ERR "INFO: Stall ended before state dump start\n"); + else if (!trigger_all_cpu_backtrace()) + dump_stack(); /* If so configured, complain about tasks blocking the grace period. */ @@ -596,7 +718,8 @@ static void print_cpu_stall(struct rcu_state *rsp) */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); - trigger_all_cpu_backtrace(); + if (!trigger_all_cpu_backtrace()) + dump_stack(); raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) @@ -678,9 +801,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct * go looking for one. */ rdp->gpnum = rnp->gpnum; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); if (rnp->qsmask & rdp->grpmask) { rdp->qs_pending = 1; - rdp->passed_quiesc = 0; + rdp->passed_quiesce = 0; } else rdp->qs_pending = 0; } @@ -741,6 +865,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); /* * If we were in an extended quiescent state, we may have @@ -826,33 +951,33 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); - if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { - if (cpu_needs_another_gp(rsp, rdp)) - rsp->fqs_need_gp = 1; - if (rnp->completed == rsp->completed) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (!rcu_scheduler_fully_active || + !cpu_needs_another_gp(rsp, rdp)) { + /* + * Either the scheduler hasn't yet spawned the first + * non-idle task or this CPU does not need another + * grace period. Either way, don't start a new grace + * period. + */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rsp->fqs_active) { /* - * Propagate new ->completed value to rcu_node structures - * so that other CPUs don't have to wait until the start - * of the next grace period to process their callbacks. + * This CPU needs a grace period, but force_quiescent_state() + * is running. Tell it to start one on this CPU's behalf. */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->completed = rsp->completed; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - local_irq_restore(flags); + rsp->fqs_need_gp = 1; + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } /* Advance to a new grace period and initialize state. */ rsp->gpnum++; - WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); - rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -862,9 +987,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; - rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -901,12 +1029,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } rnp = rcu_get_root(rsp); raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } @@ -922,6 +1053,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { unsigned long gp_duration; + struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); @@ -933,8 +1066,42 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; - rsp->completed = rsp->gpnum; - rsp->signaled = RCU_GP_IDLE; + + /* + * We know the grace period is complete, but to everyone else + * it appears to still be ongoing. But it is also the case + * that to everyone else it looks like there is nothing that + * they can do to advance the grace period. It is therefore + * safe for us to drop the lock in order to mark the grace + * period as completed in all of the rcu_node structures. + * + * But if this CPU needs another grace period, it will take + * care of this while initializing the next grace period. + * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL + * because the callbacks have not yet been advanced: Those + * callbacks are waiting on the grace period that just now + * completed. + */ + if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node structures + * so that other CPUs don't have to wait until the start + * of the next grace period to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->gpnum; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + } + + rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ + trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + rsp->fqs_state = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -962,6 +1129,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } rnp->qsmask &= ~mask; + trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, + mask, rnp->qsmask, rnp->level, + rnp->grplo, rnp->grphi, + !!rnp->gp_tasks); if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ @@ -1000,7 +1171,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * based on quiescent states detected in an earlier grace period! */ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) { unsigned long flags; unsigned long mask; @@ -1008,17 +1179,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); - if (lastcomp != rnp->completed) { + if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { /* - * Someone beat us to it for this grace period, so leave. - * The race with GP start is resolved by the fact that we - * hold the leaf rcu_node lock, so that the per-CPU bits - * cannot yet be initialized -- so we would simply find our - * CPU's bit already cleared in rcu_report_qs_rnp() if this - * race occurred. + * The grace period in which this quiescent state was + * recorded has ended, so don't report it upwards. + * We will instead need a new quiescent state that lies + * within the current grace period. */ - rdp->passed_quiesc = 0; /* try again later! */ + rdp->passed_quiesce = 0; /* need qs for new gp. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -1062,14 +1231,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesc) + if (!rdp->passed_quiesce) return; /* * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); + rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); } #ifdef CONFIG_HOTPLUG_CPU @@ -1130,11 +1299,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + else + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - + !!(rnp->qsmask & mask), + "cpuofl"); break; } - if (rnp == rdp->mynode) + if (rnp == rdp->mynode) { + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - + !!(rnp->qsmask & mask), + "cpuofl"); need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else + } else raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; rnp = rnp->parent; @@ -1153,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) else raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, true); rcu_node_kthread_setaffinity(rnp, -1); } @@ -1190,17 +1368,24 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int count; + int bl, count; /* If no callbacks are ready, just return.*/ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) + if (!cpu_has_callbacks_ready_to_invoke(rdp)) { + trace_rcu_batch_start(rsp->name, 0, 0); + trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), + need_resched(), is_idle_task(current), + rcu_is_callbacks_kthread()); return; + } /* * Extract the list of ready callbacks, disabling to prevent * races with call_rcu() from interrupt handlers. */ local_irq_save(flags); + bl = rdp->blimit; + trace_rcu_batch_start(rsp->name, rdp->qlen, bl); list = rdp->nxtlist; rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1216,13 +1401,19 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - __rcu_reclaim(list); + __rcu_reclaim(rsp->name, list); list = next; - if (++count >= rdp->blimit) + /* Stop only if limit reached and CPU has something to do. */ + if (++count >= bl && + (need_resched() || + (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; } local_irq_save(flags); + trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; @@ -1250,7 +1441,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); - /* Re-raise the RCU softirq if there are callbacks remaining. */ + /* Re-invoke RCU core processing if there are callbacks remaining. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) invoke_rcu_core(); } @@ -1258,17 +1449,16 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* * Check to see if this CPU is in a non-context-switch quiescent state * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule the RCU softirq handler. + * Also schedule RCU core processing. * - * This function must be called with hardirqs disabled. It is normally + * This function must be called from hardirq context. It is normally * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). */ void rcu_check_callbacks(int cpu, int user) { - if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + trace_rcu_utilization("Start scheduler-tick"); + if (user || rcu_is_cpu_rrupt_from_idle()) { /* * Get here if this CPU took its interrupt from user @@ -1299,6 +1489,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) invoke_rcu_core(); + trace_rcu_utilization("End scheduler-tick"); } #ifdef CONFIG_SMP @@ -1360,10 +1551,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - if (!rcu_gp_in_progress(rsp)) + trace_rcu_utilization("Start fqs"); + if (!rcu_gp_in_progress(rsp)) { + trace_rcu_utilization("End fqs"); return; /* No grace period in progress, nothing to force. */ + } if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ + trace_rcu_utilization("End fqs"); return; /* Someone else is already on the job. */ } if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) @@ -1377,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) goto unlock_fqs_ret; /* no GP in progress, time updated. */ } rsp->fqs_active = 1; - switch (rsp->signaled) { + switch (rsp->fqs_state) { case RCU_GP_IDLE: case RCU_GP_INIT: @@ -1393,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) force_qs_rnp(rsp, dyntick_save_progress_counter); raw_spin_lock(&rnp->lock); /* irqs already disabled */ if (rcu_gp_in_progress(rsp)) - rsp->signaled = RCU_FORCE_QS; + rsp->fqs_state = RCU_FORCE_QS; break; case RCU_FORCE_QS: @@ -1412,11 +1607,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ rsp->fqs_need_gp = 0; rcu_start_gp(rsp, flags); /* releases rnp->lock */ + trace_rcu_utilization("End fqs"); return; } raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ unlock_fqs_ret: raw_spin_unlock_irqrestore(&rsp->fqslock, flags); + trace_rcu_utilization("End fqs"); } #else /* #ifdef CONFIG_SMP */ @@ -1429,9 +1626,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) #endif /* #else #ifdef CONFIG_SMP */ /* - * This does the RCU processing work from softirq context for the - * specified rcu_state and rcu_data structures. This may be called - * only from the CPU to whom the rdp belongs. + * This does the RCU core processing work for the specified rcu_state + * and rcu_data structures. This may be called only from the CPU to + * whom the rdp belongs. */ static void __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) @@ -1468,24 +1665,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Do softirq processing for the current CPU. + * Do RCU core processing for the current CPU. */ static void rcu_process_callbacks(struct softirq_action *unused) { + trace_rcu_utilization("Start RCU core"); __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); - - /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ - rcu_needs_cpu_flush(); + trace_rcu_utilization("End RCU core"); } /* - * Wake up the current CPU's kthread. This replaces raise_softirq() - * in earlier versions of RCU. Note that because we are running on - * the current CPU with interrupts disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. + * Schedule RCU callback invocation. If the specified type of RCU + * does not support RCU priority boosting, just do a direct call, + * otherwise wake up the per-CPU kernel kthread. Note that because we + * are running on the current CPU with interrupts disabled, the + * rcu_cpu_kthread_task cannot disappear out from under us. */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { @@ -1530,6 +1727,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; + if (__is_kfree_rcu_offset((unsigned long)func)) + trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, + rdp->qlen); + else + trace_rcu_callback(rsp->name, head, rdp->qlen); + /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { local_irq_restore(flags); @@ -1613,18 +1816,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ void synchronize_sched(void) { - struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu_sched); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -1639,18 +1833,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { - struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); @@ -1671,7 +1856,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending && !rdp->passed_quiesc) { + if (rcu_scheduler_fully_active && + rdp->qs_pending && !rdp->passed_quiesce) { /* * If force_quiescent_state() coming soon and this CPU @@ -1683,7 +1869,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, jiffies)) set_need_resched(); - } else if (rdp->qs_pending && rdp->passed_quiesc) { + } else if (rdp->qs_pending && rdp->passed_quiesce) { rdp->n_rp_report_qs++; return 1; } @@ -1741,7 +1927,7 @@ static int rcu_pending(int cpu) * by the current CPU, even if none need be done immediately, returning * 1 if so. */ -static int rcu_needs_cpu_quick_check(int cpu) +static int rcu_cpu_has_callbacks(int cpu) { /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || @@ -1842,10 +2028,11 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rdp->qlen = 0; -#ifdef CONFIG_NO_HZ rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -#endif /* #ifdef CONFIG_NO_HZ */ + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); + WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; + rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1865,13 +2052,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->passed_quiesc = 0; /* We could be racing with new GP, */ - rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; + atomic_set(&rdp->dynticks->dynticks, + (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); + rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* @@ -1891,9 +2080,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rnp->qsmaskinit |= mask; mask = rnp->grpmask; if (rnp == rdp->mynode) { - rdp->gpnum = rnp->completed; /* if GP in progress... */ + /* + * If there is a grace period in progress, we will + * set up to wait for it next time we run the + * RCU core code. + */ + rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; - rdp->passed_quiesc_completed = rnp->completed - 1; + rdp->passed_quiesce = 0; + rdp->qs_pending = 0; + rdp->passed_quiesce_gpnum = rnp->gpnum - 1; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; @@ -1919,6 +2116,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + trace_rcu_utilization("Start CPU hotplug"); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -1944,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_send_cbs_to_online(&rcu_bh_state); rcu_send_cbs_to_online(&rcu_sched_state); rcu_preempt_send_cbs_to_online(); + rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1954,6 +2153,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, default: break; } + trace_rcu_utilization("End CPU hotplug"); return NOTIFY_OK; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 01b2ccda26fb..fddff92d6676 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,9 +84,10 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - int dynticks_nesting; /* Track irq/process nesting level. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ - atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ + long long dynticks_nesting; /* Track irq/process nesting level. */ + /* Process level is worth LLONG_MAX/2. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for idle, else odd. */ }; /* RCU's kthread states for tracing. */ @@ -230,9 +231,9 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ - unsigned long passed_quiesc_completed; - /* Value of completed at time of qs. */ - bool passed_quiesc; /* User-mode/idle loop etc. */ + unsigned long passed_quiesce_gpnum; + /* gpnum at time of quiescent state. */ + bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool preemptible; /* Preemptible RCU? */ @@ -274,16 +275,12 @@ struct rcu_data { /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ -#ifdef CONFIG_NO_HZ /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ -#endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ -#ifdef CONFIG_NO_HZ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ -#endif /* #ifdef CONFIG_NO_HZ */ unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. */ @@ -299,18 +296,15 @@ struct rcu_data { unsigned long n_rp_need_nothing; int cpu; + struct rcu_state *rsp; }; -/* Values for signaled field in struct rcu_state. */ +/* Values for fqs_state field in struct rcu_state. */ #define RCU_GP_IDLE 0 /* No grace period in progress. */ #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_FORCE_QS -#endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ @@ -360,7 +354,7 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 signaled ____cacheline_internodealigned_in_smp; + u8 fqs_state ____cacheline_internodealigned_in_smp; /* Force QS state. */ u8 fqs_active; /* force_quiescent_state() */ /* is running. */ @@ -417,6 +411,13 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DECLARE_PER_CPU(char, rcu_cpu_has_work); +#endif /* #ifdef CONFIG_RCU_BOOST */ + #ifndef RCU_TREE_NONCORE /* Forward declarations for rcutree_plugin.h */ @@ -430,7 +431,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); -static void rcu_print_task_stall(struct rcu_node *rnp); +static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_stall_reset(void); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU @@ -443,17 +444,18 @@ static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); -static void rcu_needs_cpu_flush(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); +static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, @@ -466,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); #endif /* #ifdef CONFIG_RCU_BOOST */ static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_for_idle_init(int cpu); +static void rcu_cleanup_after_idle(int cpu); +static void rcu_prepare_for_idle(int cpu); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8aafbb80b8b0..8bb35d73e1f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -27,6 +27,14 @@ #include <linux/delay.h> #include <linux/stop_machine.h> +#define RCU_KTHREAD_PRIO 1 + +#ifdef CONFIG_RCU_BOOST +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO +#else +#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO +#endif + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. If you like #ifdef, you @@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); +struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } @@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu) if (rnp->qsmask & rdp->grpmask) rnp->gp_tasks = &t->rcu_node_entry; } + trace_rcu_preempt_task(rdp->rsp->name, + t->pid, + (rnp->qsmask & rdp->grpmask) + ? rnp->gpnum + : rnp->gpnum + 1); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special) { @@ -297,8 +312,12 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; + int empty_exp_now; unsigned long flags; struct list_head *np; +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rbmp = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; int special; @@ -344,6 +363,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); + t->rcu_blocked_node = NULL; + trace_rcu_unlock_preempted_task("rcu_preempt", + rnp->gpnum, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) @@ -351,38 +373,44 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; - /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ - if (t->rcu_boosted) { - special |= RCU_READ_UNLOCK_BOOSTED; - t->rcu_boosted = 0; + /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ + if (t->rcu_boost_mutex) { + rbmp = t->rcu_boost_mutex; + t->rcu_boost_mutex = NULL; } #endif /* #ifdef CONFIG_RCU_BOOST */ - t->rcu_blocked_node = NULL; /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. - * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, + * so we must take a snapshot of the expedited state. */ - if (empty) - raw_spin_unlock_irqrestore(&rnp->lock, flags); - else + empty_exp_now = !rcu_preempted_readers_exp(rnp); + if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + trace_rcu_quiescent_state_report("preempt_rcu", + rnp->gpnum, + 0, rnp->qsmask, + rnp->level, + rnp->grplo, + rnp->grphi, + !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rnp, flags); + } else + raw_spin_unlock_irqrestore(&rnp->lock, flags); #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (special & RCU_READ_UNLOCK_BOOSTED) { - rt_mutex_unlock(t->rcu_boost_mutex); - t->rcu_boost_mutex = NULL; - } + if (rbmp) + rt_mutex_unlock(rbmp); #endif /* #ifdef CONFIG_RCU_BOOST */ /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ - if (!empty_exp && !rcu_preempted_readers_exp(rnp)) - rcu_report_exp_rnp(&rcu_preempt_state, rnp); + if (!empty_exp && empty_exp_now) + rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); } else { local_irq_restore(flags); } @@ -399,10 +427,10 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ if (t->rcu_read_lock_nesting != 1) --t->rcu_read_lock_nesting; else { + barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) @@ -466,16 +494,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ -static void rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp) { struct task_struct *t; + int ndetected = 0; if (!rcu_preempt_blocked_readers_cgp(rnp)) - return; + return 0; t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { printk(" P%d", t->pid); + ndetected++; + } + return ndetected; } /* @@ -656,18 +688,9 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { - struct rcu_synchronize rcu; - if (!rcu_scheduler_active) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -709,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * + * Most callers will set the "wake" flag, but the task initiating the + * expedited grace period need not wake itself. + * * Caller must hold sync_rcu_preempt_exp_mutex. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { unsigned long flags; unsigned long mask; @@ -724,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - wake_up(&sync_rcu_preempt_exp_wq); + if (wake) + wake_up(&sync_rcu_preempt_exp_wq); break; } mask = rnp->grpmask; @@ -757,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) must_wait = 1; } if (!must_wait) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } /* @@ -968,8 +996,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static void rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp) { + return 0; } /* @@ -1048,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { - return; } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -1199,12 +1228,12 @@ static int rcu_boost(struct rcu_node *rnp) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - t->rcu_boosted = 1; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; + return ACCESS_ONCE(rnp->exp_tasks) != NULL || + ACCESS_ONCE(rnp->boost_tasks) != NULL; } /* @@ -1228,9 +1257,12 @@ static int rcu_boost_kthread(void *arg) int spincnt = 0; int more2boost; + trace_rcu_utilization("Start boost kthread@init"); for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + trace_rcu_utilization("End boost kthread@rcu_wait"); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + trace_rcu_utilization("Start boost kthread@rcu_wait"); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) @@ -1238,11 +1270,14 @@ static int rcu_boost_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + trace_rcu_utilization("End boost kthread@rcu_yield"); rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + trace_rcu_utilization("Start boost kthread@rcu_yield"); spincnt = 0; } } /* NOTREACHED */ + trace_rcu_utilization("End boost kthread@notreached"); return 0; } @@ -1291,15 +1326,22 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_save(flags); __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { - local_irq_restore(flags); - return; - } - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && + current != __this_cpu_read(rcu_cpu_kthread_task)) + wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); local_irq_restore(flags); } /* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return __get_cpu_var(rcu_cpu_kthread_task) == current; +} + +/* * Set the affinity of the boost kthread. The CPU-hotplug locks are * held, so no one should be messing with the existence of the boost * kthread. @@ -1343,13 +1385,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (rnp->boost_kthread_task != NULL) return 0; t = kthread_create(rcu_boost_kthread, (void *)rnp, - "rcub%d", rnp_index); + "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = RCU_KTHREAD_PRIO; + sp.sched_priority = RCU_BOOST_PRIO; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ return 0; @@ -1444,6 +1486,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) { struct sched_param sp; struct timer_list yield_timer; + int prio = current->rt_priority; setup_timer_on_stack(&yield_timer, f, arg); mod_timer(&yield_timer, jiffies + 2); @@ -1451,7 +1494,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); set_user_nice(current, 19); schedule(); - sp.sched_priority = RCU_KTHREAD_PRIO; + set_user_nice(current, 0); + sp.sched_priority = prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); del_timer(&yield_timer); } @@ -1489,7 +1533,8 @@ static int rcu_cpu_kthread_should_stop(int cpu) /* * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * earlier RCU softirq. + * RCU softirq used in flavors and configurations of RCU that do not + * support RCU priority boosting. */ static int rcu_cpu_kthread(void *arg) { @@ -1500,9 +1545,12 @@ static int rcu_cpu_kthread(void *arg) char work; char *workp = &per_cpu(rcu_cpu_has_work, cpu); + trace_rcu_utilization("Start CPU kthread@init"); for (;;) { *statusp = RCU_KTHREAD_WAITING; + trace_rcu_utilization("End CPU kthread@rcu_wait"); rcu_wait(*workp != 0 || kthread_should_stop()); + trace_rcu_utilization("Start CPU kthread@rcu_wait"); local_bh_disable(); if (rcu_cpu_kthread_should_stop(cpu)) { local_bh_enable(); @@ -1523,11 +1571,14 @@ static int rcu_cpu_kthread(void *arg) spincnt = 0; if (spincnt > 10) { *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization("End CPU kthread@rcu_yield"); rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + trace_rcu_utilization("Start CPU kthread@rcu_yield"); spincnt = 0; } } *statusp = RCU_KTHREAD_STOPPED; + trace_rcu_utilization("End CPU kthread@term"); return 0; } @@ -1560,7 +1611,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) if (!rcu_scheduler_fully_active || per_cpu(rcu_cpu_kthread_task, cpu) != NULL) return 0; - t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + t = kthread_create_on_node(rcu_cpu_kthread, + (void *)(long)cpu, + cpu_to_node(cpu), + "rcuc/%d", cpu); if (IS_ERR(t)) return PTR_ERR(t); if (cpu_online(cpu)) @@ -1669,7 +1723,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, return 0; if (rnp->node_kthread_task == NULL) { t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun%d", rnp_index); + "rcun/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1731,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void) WARN_ON_ONCE(1); } +static bool rcu_is_callbacks_kthread(void) +{ + return false; +} + static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } @@ -1866,7 +1925,7 @@ void synchronize_sched_expedited(void) * grace period works for us. */ get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started) - 1; + snap = atomic_read(&sync_sched_expedited_started); smp_mb(); /* ensure read is before try_stop_cpus(). */ } @@ -1898,113 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); * 1 if so. This function is part of the RCU implementation; it is -not- * an exported member of the RCU API. * - * Because we have preemptible RCU, just check whether this CPU needs - * any flavor of RCU. Do not chew up lots of CPU cycles with preemption - * disabled in a most-likely vain attempt to cause RCU not to need this CPU. + * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs + * any flavor of RCU. */ int rcu_needs_cpu(int cpu) { - return rcu_needs_cpu_quick_check(cpu); + return rcu_cpu_has_callbacks(cpu); +} + +/* + * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. + */ +static void rcu_prepare_for_idle_init(int cpu) +{ } /* - * Check to see if we need to continue a callback-flush operations to - * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle - * entry is not configured, so we never do need to. + * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up + * after it. */ -static void rcu_needs_cpu_flush(void) +static void rcu_cleanup_after_idle(int cpu) +{ +} + +/* + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, + * is nothing. + */ +static void rcu_prepare_for_idle(int cpu) { } #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#define RCU_NEEDS_CPU_FLUSHES 5 +/* + * This code is invoked when a CPU goes idle, at which point we want + * to have the CPU do everything required for RCU so that it can enter + * the energy-efficient dyntick-idle mode. This is handled by a + * state machine implemented by rcu_prepare_for_idle() below. + * + * The following three proprocessor symbols control this state machine: + * + * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt + * to satisfy RCU. Beyond this point, it is better to incur a periodic + * scheduling-clock interrupt than to loop through the state machine + * at full power. + * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are + * optional if RCU does not need anything immediately from this + * CPU, even if this CPU still has RCU callbacks queued. The first + * times through the state machine are mandatory: we need to give + * the state machine a chance to communicate a quiescent state + * to the RCU core. + * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted + * to sleep in dyntick-idle mode with RCU callbacks pending. This + * is sized to be roughly one RCU grace period. Those energy-efficiency + * benchmarkers who might otherwise be tempted to set this to a large + * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your + * system. And if you are -that- concerned about energy efficiency, + * just power the system down and be done with it! + * + * The values below work well in practice. If future workloads require + * adjustment, they can be converted into kernel config parameters, though + * making the state machine smarter might be a better option. + */ +#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ +#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ +#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ + static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); +static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); +static ktime_t rcu_idle_gp_wait; /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + */ +int rcu_needs_cpu(int cpu) +{ + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) + return 0; + /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ + return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; +} + +/* + * Timer handler used to force CPU to start pushing its remaining RCU + * callbacks in the case where it entered dyntick-idle mode with callbacks + * pending. The hander doesn't really need to do anything because the + * real work is done upon re-entry to idle, or by the next scheduling-clock + * interrupt should idle not be re-entered. + */ +static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +{ + trace_rcu_prep_idle("Timer"); + return HRTIMER_NORESTART; +} + +/* + * Initialize the timer used to pull CPUs out of dyntick-idle mode. + */ +static void rcu_prepare_for_idle_init(int cpu) +{ + static int firsttime = 1; + struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + + hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtp->function = rcu_idle_gp_timer_func; + if (firsttime) { + unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); + + rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); + firsttime = 0; + } +} + +/* + * Clean up for exit from idle. Because we are exiting from idle, there + * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * do nothing if this timer is not active, so just cancel it unconditionally. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); +} + +/* + * Check to see if any RCU-related work can be done by the current CPU, + * and if so, schedule a softirq to get it done. This function is part + * of the RCU implementation; it is -not- an exported member of the RCU API. * - * Because we are not supporting preemptible RCU, attempt to accelerate - * any current grace periods so that RCU no longer needs this CPU, but - * only if all other CPUs are already in dynticks-idle mode. This will - * allow the CPU cores to be powered down immediately, as opposed to after - * waiting many milliseconds for grace periods to elapse. + * The idea is for the current CPU to clear out all work required by the + * RCU core for the current grace period, so that this CPU can be permitted + * to enter dyntick-idle mode. In some cases, it will need to be awakened + * at the end of the grace period by whatever CPU ends the grace period. + * This allows CPUs to go dyntick-idle more quickly, and to reduce the + * number of wakeups by a modest integer factor. * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu) +static void rcu_prepare_for_idle(int cpu) { - int c = 0; - int snap; - int thatcpu; - - /* Check for being in the holdoff period. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) - return rcu_needs_cpu_quick_check(cpu); - - /* Don't bother unless we are the last non-dyntick-idle CPU. */ - for_each_online_cpu(thatcpu) { - if (thatcpu == cpu) - continue; - snap = atomic_add_return(0, &per_cpu(rcu_dynticks, - thatcpu).dynticks); - smp_mb(); /* Order sampling of snap with end of grace period. */ - if ((snap & 0x1) != 0) { - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - return rcu_needs_cpu_quick_check(cpu); - } + unsigned long flags; + + local_irq_save(flags); + + /* + * If there are no callbacks on this CPU, enter dyntick-idle mode. + * Also reset state to avoid prejudicing later attempts. + */ + if (!rcu_cpu_has_callbacks(cpu)) { + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + per_cpu(rcu_dyntick_drain, cpu) = 0; + local_irq_restore(flags); + trace_rcu_prep_idle("No callbacks"); + return; + } + + /* + * If in holdoff mode, just return. We will presumably have + * refrained from disabling the scheduling-clock tick. + */ + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + local_irq_restore(flags); + trace_rcu_prep_idle("In holdoff"); + return; } /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; + per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; + } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + !rcu_pending(cpu)) { + /* Can we go dyntick-idle despite still having callbacks? */ + trace_rcu_prep_idle("Dyntick with callbacks"); + per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_wait, HRTIMER_MODE_REL); + return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - return rcu_needs_cpu_quick_check(cpu); + local_irq_restore(flags); + trace_rcu_prep_idle("Begin holdoff"); + invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + return; } - /* Do one step pushing remaining RCU callbacks through. */ + /* + * Do one step of pushing the remaining RCU callbacks through + * the RCU core state machine. + */ +#ifdef CONFIG_TREE_PREEMPT_RCU + if (per_cpu(rcu_preempt_data, cpu).nxtlist) { + local_irq_restore(flags); + rcu_preempt_qs(cpu); + force_quiescent_state(&rcu_preempt_state, 0); + local_irq_save(flags); + } +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { + local_irq_restore(flags); rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); - c = c || per_cpu(rcu_sched_data, cpu).nxtlist; + local_irq_save(flags); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { + local_irq_restore(flags); rcu_bh_qs(cpu); force_quiescent_state(&rcu_bh_state, 0); - c = c || per_cpu(rcu_bh_data, cpu).nxtlist; + local_irq_save(flags); } - /* If RCU callbacks are still pending, RCU still needs this CPU. */ - if (c) + /* + * If RCU callbacks are still pending, RCU still needs this CPU. + * So try forcing the callbacks through the grace period. + */ + if (rcu_cpu_has_callbacks(cpu)) { + local_irq_restore(flags); + trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - return c; -} - -/* - * Check to see if we need to continue a callback-flush operations to - * allow the last CPU to enter dyntick-idle mode. - */ -static void rcu_needs_cpu_flush(void) -{ - int cpu = smp_processor_id(); - unsigned long flags; - - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) - return; - local_irq_save(flags); - (void)rcu_needs_cpu(cpu); - local_irq_restore(flags); + } else { + local_irq_restore(flags); + trace_rcu_prep_idle("Callbacks drained"); + } } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3b0c0986afc0..654cfe67f0d1 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -48,11 +48,6 @@ #ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); - static char convert_kthread_status(unsigned int kthread_status) { if (kthread_status > RCU_KTHREAD_MAX) @@ -66,19 +61,17 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, - rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); -#ifdef CONFIG_NO_HZ - seq_printf(m, " dt=%d/%d/%d df=%lu", + seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, " ql=%ld qs=%c%c%c%c", rdp->qlen, @@ -144,15 +137,13 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, - rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); -#ifdef CONFIG_NO_HZ - seq_printf(m, ",%d,%d,%d,%lu", + seq_printf(m, ",%d,%llx,%d,%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != @@ -175,10 +166,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); -#ifdef CONFIG_NO_HZ + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); -#endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); @@ -283,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", - rsp->completed, gpnum, rsp->signaled, + rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, diff --git a/kernel/relay.c b/kernel/relay.c index 859ea5a9605f..4335e1d7ee2d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -15,7 +15,7 @@ #include <linux/errno.h> #include <linux/stddef.h> #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/string.h> #include <linux/relay.h> #include <linux/vmalloc.h> @@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, */ static struct dentry *create_buf_file_default_callback(const char *filename, struct dentry *parent, - int mode, + umode_t mode, struct rchan_buf *buf, int *is_global) { diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 34683efa2cce..6d269cce7aa1 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf, return 0; } - /* FIXME - make memparse() take const char* args */ - *res = memparse((char *)buf, &end); + *res = memparse(buf, &end); if (*end != '\0') return -EINVAL; diff --git a/kernel/resource.c b/kernel/resource.c index c8dc249da5ce..7640b3a947d0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -7,7 +7,7 @@ * Arbitrary resource management. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/errno.h> #include <linux/ioport.h> #include <linux/init.h> diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 3c7cbc2c33be..16502d3a71c8 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -18,7 +18,7 @@ */ #include <linux/sched.h> #include <linux/delay.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/spinlock.h> #include <linux/kallsyms.h> #include <linux/syscalls.h> @@ -29,61 +29,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (raw_spin_is_locked(¤t->pi_lock)) \ - raw_spin_unlock(¤t->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; rcu_read_lock(); @@ -149,10 +94,14 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) { + rcu_read_unlock(); + return; + } printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); + printk("%s\n", print_tainted()); printk( "--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", task->comm, task_pid_nr(task), @@ -180,7 +129,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -189,7 +137,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock) void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -199,7 +147,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -213,8 +161,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 5c9ccd380966..98ec49475460 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -6,11 +6,11 @@ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * */ +#include <linux/device.h> #include <linux/kthread.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/spinlock.h> -#include <linux/sysdev.h> #include <linux/timer.h> #include <linux/freezer.h> @@ -27,7 +27,7 @@ struct test_thread_data { int opdata; int mutexes[MAX_RT_TEST_MUTEXES]; int event; - struct sys_device sysdev; + struct device dev; }; static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; @@ -271,7 +271,7 @@ static int test_func(void *data) * * opcode:data */ -static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct sched_param schedpar; @@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut char cmdbuf[32]; int op, dat, tid, ret; - td = container_of(dev, struct test_thread_data, sysdev); - tid = td->sysdev.id; + td = container_of(dev, struct test_thread_data, dev); + tid = td->dev.id; /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(cmdbuf)) @@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut * @dev: thread to query * @buf: char buffer to be filled with thread status info */ -static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, char *buf) { struct test_thread_data *td; @@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute char *curr = buf; int i; - td = container_of(dev, struct test_thread_data, sysdev); - tsk = threads[td->sysdev.id]; + td = container_of(dev, struct test_thread_data, dev); + tsk = threads[td->dev.id]; spin_lock(&rttest_lock); @@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute spin_unlock(&rttest_lock); curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->sysdev.id].owner); + mutexes[td->dev.id].owner); return curr - buf; } -static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); -static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); +static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); -static struct sysdev_class rttest_sysclass = { +static struct bus_type rttest_subsys = { .name = "rttest", + .dev_name = "rttest", }; static int init_test_thread(int id) { - thread_data[id].sysdev.cls = &rttest_sysclass; - thread_data[id].sysdev.id = id; + thread_data[id].dev.bus = &rttest_subsys; + thread_data[id].dev.id = id; threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); if (IS_ERR(threads[id])) return PTR_ERR(threads[id]); - return sysdev_register(&thread_data[id].sysdev); + return device_register(&thread_data[id].dev); } static int init_rttest(void) @@ -393,7 +394,7 @@ static int init_rttest(void) for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) rt_mutex_init(&mutexes[i]); - ret = sysdev_class_register(&rttest_sysclass); + ret = subsys_system_register(&rttest_subsys, NULL); if (ret) return ret; @@ -401,10 +402,10 @@ static int init_rttest(void) ret = init_test_thread(i); if (ret) break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); + ret = device_create_file(&thread_data[i].dev, &dev_attr_status); if (ret) break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); + ret = device_create_file(&thread_data[i].dev, &dev_attr_command); if (ret) break; } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 255e1662acdb..a242e691c993 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -11,7 +11,7 @@ * See Documentation/rt-mutex-design.txt for details. */ #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/timer.h> diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 9f48f3d82e9b..b152f74f02de 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -7,7 +7,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/rwsem.h> #include <asm/system.h> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile new file mode 100644 index 000000000000..9a7dd35102a3 --- /dev/null +++ b/kernel/sched/Makefile @@ -0,0 +1,20 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_clock.o = -pg +endif + +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) +# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is +# needed for x86 only. Why this used to be enabled for all architectures is beyond +# me. I suspect most platforms don't need this, but until we know that for sure +# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k +# to get a correct value for the wait-channel (WCHAN in ps). --davidm +CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer +endif + +obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o +obj-$(CONFIG_SMP) += cpupri.o +obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o +obj-$(CONFIG_SCHEDSTATS) += stats.o +obj-$(CONFIG_SCHED_DEBUG) += debug.o + + diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c index 429242f3c484..e8a1f83ee0e7 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched/auto_group.c @@ -1,15 +1,19 @@ #ifdef CONFIG_SCHED_AUTOGROUP +#include "sched.h" + #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/utsname.h> +#include <linux/security.h> +#include <linux/export.h> unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; -static void __init autogroup_init(struct task_struct *init_task) +void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; kref_init(&autogroup_default.kref); @@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task) init_task->signal->autogroup = &autogroup_default; } -static inline void autogroup_free(struct task_group *tg) +void autogroup_free(struct task_group *tg) { kfree(tg->autogroup); } @@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) return ag; } -#ifdef CONFIG_RT_GROUP_SCHED -static void free_rt_sched_group(struct task_group *tg); -#endif - static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -108,8 +108,7 @@ out_fail: return autogroup_kref_get(&autogroup_default); } -static inline bool -task_wants_autogroup(struct task_struct *p, struct task_group *tg) +bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; @@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } -static inline bool task_group_is_autogroup(struct task_group *tg) -{ - return !!tg->autogroup; -} - -static inline struct task_group * -autogroup_task_group(struct task_struct *p, struct task_group *tg) -{ - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (enabled && task_wants_autogroup(p, tg)) - return p->signal->autogroup->tg; - - return tg; -} - static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -263,7 +246,7 @@ out: #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SCHED_DEBUG -static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +int autogroup_path(struct task_group *tg, char *buf, int buflen) { if (!task_group_is_autogroup(tg)) return 0; diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h index c2f0e7248dca..8bd047142816 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched/auto_group.h @@ -1,5 +1,8 @@ #ifdef CONFIG_SCHED_AUTOGROUP +#include <linux/kref.h> +#include <linux/rwsem.h> + struct autogroup { /* * reference doesn't mean how many thread attach to this @@ -13,9 +16,28 @@ struct autogroup { int nice; }; -static inline bool task_group_is_autogroup(struct task_group *tg); +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + static inline struct task_group * -autogroup_task_group(struct task_struct *p, struct task_group *tg); +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); #else /* !CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c index 9d8af0b3fb64..c685e31492df 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched/clock.c @@ -62,7 +62,7 @@ */ #include <linux/spinlock.h> #include <linux/hardirq.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/percpu.h> #include <linux/ktime.h> #include <linux/sched.h> diff --git a/kernel/sched.c b/kernel/sched/core.c index 5670028a9c16..df00cb09263e 100644 --- a/kernel/sched.c +++ b/kernel/sched/core.c @@ -1,5 +1,5 @@ /* - * kernel/sched.c + * kernel/sched/core.c * * Kernel scheduler and related syscalls * @@ -56,7 +56,6 @@ #include <linux/percpu.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> @@ -71,593 +70,46 @@ #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> +#include <linux/init_task.h> #include <asm/tlb.h> #include <asm/irq_regs.h> -#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif -#include "sched_cpupri.h" -#include "workqueue_sched.h" -#include "sched_autogroup.h" +#include "sched.h" +#include "../workqueue_sched.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* - * Helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT - -/* - * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define DEF_TIMESLICE (100 * HZ / 1000) - -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) - -static inline int rt_policy(int policy) -{ - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; -} - -static inline int task_has_rt_policy(struct task_struct *p) -{ - return rt_policy(p->policy); -} - -/* - * This is the priority-queue data structure of the RT scheduling class: - */ -struct rt_prio_array { - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; -}; - -struct rt_bandwidth { - /* nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; - ktime_t rt_period; - u64 rt_runtime; - struct hrtimer rt_period_timer; -}; - -static struct rt_bandwidth def_rt_bandwidth; - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) -{ - struct rt_bandwidth *rt_b = - container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - - if (!overrun) - break; - - idle = do_sched_rt_period_timer(rt_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -static -void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) -{ - rt_b->rt_period = ns_to_ktime(period); - rt_b->rt_runtime = runtime; - - raw_spin_lock_init(&rt_b->rt_runtime_lock); - - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.function = sched_rt_period_timer; -} - -static inline int rt_bandwidth_enabled(void) +void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { - return sysctl_sched_rt_runtime >= 0; -} - -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - ktime_t now; - - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; + unsigned long delta; + ktime_t soft, hard, now; - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - - raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { - unsigned long delta; - ktime_t soft, hard; - - if (hrtimer_active(&rt_b->rt_period_timer)) + if (hrtimer_active(period_timer)) break; - now = hrtimer_cb_get_time(&rt_b->rt_period_timer); - hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + now = hrtimer_cb_get_time(period_timer); + hrtimer_forward(period_timer, now, period); - soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); - hard = hrtimer_get_expires(&rt_b->rt_period_timer); + soft = hrtimer_get_softexpires(period_timer); + hard = hrtimer_get_expires(period_timer); delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); + __hrtimer_start_range_ns(period_timer, soft, delta, + HRTIMER_MODE_ABS_PINNED, 0); } - raw_spin_unlock(&rt_b->rt_runtime_lock); } -#ifdef CONFIG_RT_GROUP_SCHED -static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - hrtimer_cancel(&rt_b->rt_period_timer); -} -#endif - -/* - * sched_domains_mutex serializes calls to init_sched_domains, - * detach_destroy_domains and partition_sched_domains. - */ -static DEFINE_MUTEX(sched_domains_mutex); - -#ifdef CONFIG_CGROUP_SCHED - -#include <linux/cgroup.h> - -struct cfs_rq; - -static LIST_HEAD(task_groups); - -/* task group related information */ -struct task_group { - struct cgroup_subsys_state css; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; - - atomic_t load_weight; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - struct rt_bandwidth rt_bandwidth; -#endif - - struct rcu_head rcu; - struct list_head list; - - struct task_group *parent; - struct list_head siblings; - struct list_head children; - -#ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; -#endif -}; - -/* task_group_lock serializes the addition/removal of task groups */ -static DEFINE_SPINLOCK(task_group_lock); - -#ifdef CONFIG_FAIR_GROUP_SCHED - -# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD - -/* - * A weight of 0 or 1 can cause arithmetics problems. - * A weight of a cfs_rq is the sum of weights of which entities - * are queued on this cfs_rq, so a weight of a entity should not be - * too large, so as the shares value of a task group. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ -#define MIN_SHARES (1UL << 1) -#define MAX_SHARES (1UL << 18) - -static int root_task_group_load = ROOT_TASK_GROUP_LOAD; -#endif - -/* Default task group. - * Every task in system belong to this group at bootup. - */ -struct task_group root_task_group; - -#endif /* CONFIG_CGROUP_SCHED */ - -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running; - - u64 exec_clock; - u64 min_vruntime; -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; -#endif - - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; - - struct list_head tasks; - struct list_head *balance_iterator; - - /* - * 'curr' points to currently running entity on this cfs_rq. - * It is set to NULL otherwise (i.e when none are currently running). - */ - struct sched_entity *curr, *next, *last, *skip; - -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. - */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - - /* - * h_load = weight * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long h_load; - - /* - * Maintaining per-cpu shares distribution for group scheduling - * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time - */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; - - unsigned long load_contribution; -#endif -#endif -}; - -/* Real-Time classes' related field in a runqueue: */ -struct rt_rq { - struct rt_prio_array active; - unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - struct { - int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP - int next; /* next highest */ -#endif - } highest_prio; -#endif -#ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; - int overloaded; - struct plist_head pushable_tasks; -#endif - int rt_throttled; - u64 rt_time; - u64 rt_runtime; - /* Nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; - -#ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; - - struct rq *rq; - struct list_head leaf_rt_rq_list; - struct task_group *tg; -#endif -}; - -#ifdef CONFIG_SMP - -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. - * - */ -struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_var_t rto_mask; - struct cpupri cpupri; -}; - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -#endif /* CONFIG_SMP */ - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. - */ -struct rq { - /* runqueue lock: */ - raw_spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ - u64 nohz_stamp; - unsigned char nohz_balance_kick; -#endif - int skip_clock_update; - - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; - - struct cfs_rq cfs; - struct rt_rq rt; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; -#endif -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif - - /* - * This is part of a global counter where only the total sum - * over all CPUs matters. A task can increase this counter on - * one CPU and if it got migrated afterwards it may decrease - * it on another CPU. Always updated under the runqueue lock: - */ - unsigned long nr_uninterruptible; - - struct task_struct *curr, *idle, *stop; - unsigned long next_balance; - struct mm_struct *prev_mm; - - u64 clock; - u64 clock_task; - - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; - - unsigned long cpu_power; - - unsigned char idle_at_tick; - /* For active balancing */ - int post_schedule; - int active_balance; - int push_cpu; - struct cpu_stop_work active_balance_work; - /* cpu of this runqueue: */ - int cpu; - int online; - - unsigned long avg_load_per_task; - - u64 rt_avg; - u64 age_stamp; - u64 idle_stamp; - u64 avg_idle; -#endif - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif -#ifdef CONFIG_PARAVIRT - u64 prev_steal_time; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; -#endif - - /* calc_load related fields */ - unsigned long calc_load_update; - long calc_load_active; - -#ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP - int hrtick_csd_pending; - struct call_single_data hrtick_csd; -#endif - struct hrtimer hrtick_timer; -#endif - -#ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ - - /* sys_sched_yield() stats */ - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; -#endif - -#ifdef CONFIG_SMP - struct task_struct *wake_list; -#endif -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - - -static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - -static inline int cpu_of(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq->cpu; -#else - return 0; -#endif -} - -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. - * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. - */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) - -#ifdef CONFIG_CGROUP_SCHED - -/* - * Return the group to which this tasks belongs. - * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. - */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif -} - -#else /* CONFIG_CGROUP_SCHED */ - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - -#endif /* CONFIG_CGROUP_SCHED */ +DEFINE_MUTEX(sched_domains_mutex); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static void update_rq_clock_task(struct rq *rq, s64 delta); -static void update_rq_clock(struct rq *rq) +void update_rq_clock(struct rq *rq) { s64 delta; @@ -670,44 +122,14 @@ static void update_rq_clock(struct rq *rq) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: - */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug static const -#endif - -/** - * runqueue_is_locked - Returns true if the current cpu runqueue is locked - * @cpu: the processor in question. - * - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. - */ -int runqueue_is_locked(int cpu) -{ - return raw_spin_is_locked(&cpu_rq(cpu)->lock); -} - -/* * Debugging: various feature bits */ #define SCHED_FEAT(name, enabled) \ - __SCHED_FEAT_##name , - -enum { -#include "sched_features.h" -}; - -#undef SCHED_FEAT - -#define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | const_debug unsigned int sysctl_sched_features = -#include "sched_features.h" +#include "features.h" 0; #undef SCHED_FEAT @@ -717,7 +139,7 @@ const_debug unsigned int sysctl_sched_features = #name , static __read_mostly char *sched_feat_names[] = { -#include "sched_features.h" +#include "features.h" NULL }; @@ -727,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v) { int i; - for (i = 0; sched_feat_names[i]; i++) { + for (i = 0; i < __SCHED_FEAT_NR; i++) { if (!(sysctl_sched_features & (1UL << i))) seq_puts(m, "NO_"); seq_printf(m, "%s ", sched_feat_names[i]); @@ -737,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v) return 0; } +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true jump_label_key_enabled +#define jump_label_key__false jump_label_key_disabled + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + if (jump_label_enabled(&sched_feat_keys[i])) + jump_label_dec(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + if (!jump_label_enabled(&sched_feat_keys[i])) + jump_label_inc(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + static ssize_t sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -760,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf, cmp += 3; } - for (i = 0; sched_feat_names[i]; i++) { + for (i = 0; i < __SCHED_FEAT_NR; i++) { if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) + if (neg) { sysctl_sched_features &= ~(1UL << i); - else + sched_feat_disable(i); + } else { sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } break; } } - if (!sched_feat_names[i]) + if (i == __SCHED_FEAT_NR) return -EINVAL; *ppos += cnt; @@ -799,10 +254,7 @@ static __init int sched_init_debug(void) return 0; } late_initcall(sched_init_debug); - -#endif - -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. @@ -824,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; */ unsigned int sysctl_sched_rt_period = 1000000; -static __read_mostly int scheduler_running; +__read_mostly int scheduler_running; /* * part of the period that we allow rt tasks to run in us. @@ -832,112 +284,7 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -static inline u64 global_rt_period(void) -{ - return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; -} - -static inline u64 global_rt_runtime(void) -{ - if (sysctl_sched_rt_runtime < 0) - return RUNTIME_INF; - - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; -} - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif - -static inline int task_current(struct rq *rq, struct task_struct *p) -{ - return rq->curr == p; -} - -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->on_cpu; -#else - return task_current(rq, p); -#endif -} - -#ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else - raw_spin_unlock(&rq->lock); -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * __task_rq_lock - lock the rq @p resides on. @@ -1020,20 +367,6 @@ static struct rq *this_rq_lock(void) * rq->lock. */ -/* - * Use hrtick when: - * - enabled by features - * - hrtimer is actually high res - */ -static inline int hrtick_enabled(struct rq *rq) -{ - if (!sched_feat(HRTICK)) - return 0; - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); -} - static void hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) @@ -1077,7 +410,7 @@ static void __hrtick_start(void *arg) * * called with rq->lock held and irqs disabled */ -static void hrtick_start(struct rq *rq, u64 delay) +void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = ktime_add_ns(timer->base->get_time(), delay); @@ -1121,7 +454,7 @@ static __init void init_hrtick(void) * * called with rq->lock held and irqs disabled */ -static void hrtick_start(struct rq *rq, u64 delay) +void hrtick_start(struct rq *rq, u64 delay) { __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); @@ -1172,7 +505,7 @@ static inline void init_hrtick(void) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(struct task_struct *p) +void resched_task(struct task_struct *p) { int cpu; @@ -1193,7 +526,7 @@ static void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } -static void resched_cpu(int cpu) +void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -1272,14 +605,22 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -#endif /* CONFIG_NO_HZ */ +static inline bool got_nohz_idle_kick(void) +{ + int cpu = smp_processor_id(); + return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); +} -static u64 sched_avg_period(void) +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) { - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; + return false; } -static void sched_avg_update(struct rq *rq) +#endif /* CONFIG_NO_HZ */ + +void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); @@ -1295,200 +636,34 @@ static void sched_avg_update(struct rq *rq) } } -static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ - rq->rt_avg += rt_delta; - sched_avg_update(rq); -} - #else /* !CONFIG_SMP */ -static void resched_task(struct task_struct *p) +void resched_task(struct task_struct *p) { assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } - -static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ -} - -static void sched_avg_update(struct rq *rq) -{ -} #endif /* CONFIG_SMP */ -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) - -/* - * delta *= weight / lw - */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) -{ - u64 tmp; - - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; - - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; - } - - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); -} - -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} - +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ + (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) /* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. - */ - -#define WEIGHT_IDLEPRIO 3 -#define WMULT_IDLEPRIO 1431655765 - -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; - -/* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. - * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: - */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; - -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - -#ifdef CONFIG_CGROUP_CPUACCT -static void cpuacct_charge(struct task_struct *tsk, u64 cputime); -static void cpuacct_update_stats(struct task_struct *tsk, - enum cpuacct_stat_index idx, cputime_t val); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -static inline void cpuacct_update_stats(struct task_struct *tsk, - enum cpuacct_stat_index idx, cputime_t val) {} -#endif - -static inline void inc_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_add(&rq->load, load); -} - -static inline void dec_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_sub(&rq->load, load); -} - -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) -typedef int (*tg_visitor)(struct task_group *, void *); - -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. + * Caller must hold rcu_lock or sufficient equivalent. */ -static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; int ret; - rcu_read_lock(); - parent = &root_task_group; + parent = from; + down: ret = (*down)(parent, data); if (ret) - goto out_unlock; + goto out; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1497,273 +672,24 @@ up: continue; } ret = (*up)(parent, data); - if (ret) - goto out_unlock; + if (ret || parent == from) + goto out; child = parent; parent = parent->parent; if (parent) goto up; -out_unlock: - rcu_read_unlock(); - +out: return ret; } -static int tg_nop(struct task_group *tg, void *data) +int tg_nop(struct task_group *tg, void *data) { return 0; } #endif -#ifdef CONFIG_SMP -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -static unsigned long power_of(int cpu) -{ - return cpu_rq(cpu)->cpu_power; -} - -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); - - if (nr_running) - rq->avg_load_per_task = rq->load.weight / nr_running; - else - rq->avg_load_per_task = 0; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_PREEMPT - -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - -/* - * fair double_lock_balance: Safely acquires both rq->locks in a fair - * way at the expense of forcing extra atomic operations in all - * invocations. This assures that the double_lock is acquired using the - * same underlying policy as the spinlock_t on this architecture, which - * reduces latency compared to the unfair variant below. However, it - * also adds more overhead and therefore may reduce throughput. - */ -static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - raw_spin_unlock(&this_rq->lock); - double_rq_lock(this_rq, busiest); - - return 1; -} - -#else -/* - * Unfair double_lock_balance: Optimizes throughput at the expense of - * latency by eliminating extra atomic operations when the locks are - * already in proper order on entry. This favors lower cpu-ids and will - * grant the double lock to lower cpus over higher ids under contention, - * regardless of entry order into the function. - */ -static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!raw_spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, - SINGLE_DEPTH_NESTING); - ret = 1; - } else - raw_spin_lock_nested(&busiest->lock, - SINGLE_DEPTH_NESTING); - } - return ret; -} - -#endif /* CONFIG_PREEMPT */ - -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) -{ - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } - - return _double_lock_balance(this_rq, busiest); -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); - else - __release(rq2->lock); -} - -#else /* CONFIG_SMP */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - BUG_ON(rq1 != rq2); - raw_spin_unlock(&rq1->lock); - __release(rq2->lock); -} - -#endif - -static void calc_load_account_idle(struct rq *this_rq); -static void update_sysctl(void); -static int get_update_sysctl_factor(void); -static void update_cpu_load(struct rq *this_rq); - -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - -static const struct sched_class rt_sched_class; - -#define sched_class_highest (&stop_sched_class) -#define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) - -#include "sched_stats.h" - -static void inc_nr_running(struct rq *rq) -{ - rq->nr_running++; -} - -static void dec_nr_running(struct rq *rq) -{ - rq->nr_running--; -} +void update_cpu_load(struct rq *this_rq); static void set_load_weight(struct task_struct *p) { @@ -1800,25 +726,23 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) /* * activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int flags) +void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); - inc_nr_running(rq); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible++; dequeue_task(rq, p, flags); - dec_nr_running(rq); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -2004,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int irqtime_account_hi_update(void) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; u64 latest_ns; int ret = 0; local_irq_save(flags); latest_ns = this_cpu_read(cpu_hardirq_time); - if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) ret = 1; local_irq_restore(flags); return ret; @@ -2019,14 +943,14 @@ static int irqtime_account_hi_update(void) static int irqtime_account_si_update(void) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; u64 latest_ns; int ret = 0; local_irq_save(flags); latest_ns = this_cpu_read(cpu_softirq_time); - if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) ret = 1; local_irq_restore(flags); return ret; @@ -2038,15 +962,6 @@ static int irqtime_account_si_update(void) #endif -#include "sched_idletask.c" -#include "sched_fair.c" -#include "sched_rt.c" -#include "sched_autogroup.c" -#include "sched_stoptask.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -2144,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { const struct sched_class *class; @@ -2170,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP -/* - * Is this task likely cache-hot: - */ -static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - if (p->sched_class != &fair_sched_class) - return 0; - - if (unlikely(p->policy == SCHED_IDLE)) - return 0; - - /* - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) - return 1; - - if (sysctl_sched_migration_cost == -1) - return 1; - if (sysctl_sched_migration_cost == 0) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -2390,11 +1273,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); if (dest_cpu < nr_cpu_ids) return dest_cpu; @@ -2431,7 +1314,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); @@ -2556,42 +1439,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_do_pending(struct task_struct *list) +static void sched_ttwu_pending(void) { struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; raw_spin_lock(&rq->lock); - while (list) { - struct task_struct *p = list; - list = list->wake_entry; + while (llist) { + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); ttwu_do_activate(rq, p, 0); } raw_spin_unlock(&rq->lock); } -#ifdef CONFIG_HOTPLUG_CPU - -static void sched_ttwu_pending(void) -{ - struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) - return; - - sched_ttwu_do_pending(list); -} - -#endif /* CONFIG_HOTPLUG_CPU */ - void scheduler_ipi(void) { - struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) return; /* @@ -2608,25 +1475,21 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); - sched_ttwu_do_pending(list); + sched_ttwu_pending(); + + /* + * Check if someone kicked us for doing the nohz idle load balance. + */ + if (unlikely(got_nohz_idle_kick() && !need_resched())) { + this_rq()->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu) { - struct rq *rq = cpu_rq(cpu); - struct task_struct *next = rq->wake_list; - - for (;;) { - struct task_struct *old = next; - - p->wake_entry = next; - next = cmpxchg(&rq->wake_list, old, p); - if (next == old) - break; - } - - if (!next) + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) smp_send_reschedule(cpu); } @@ -2648,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ + +static inline int ttwu_share_cache(int this_cpu, int that_cpu) +{ + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); +} #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu) @@ -2655,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -2848,19 +1716,23 @@ void sched_fork(struct task_struct *p) p->state = TASK_RUNNING; /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + + /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + if (task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; - p->normal_prio = p->static_prio; - } - - if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); - p->normal_prio = p->static_prio; - set_load_weight(p); - } + p->rt_priority = 0; + } else if (PRIO_TO_NICE(p->static_prio) < 0) + p->static_prio = NICE_TO_PRIO(0); + + p->prio = p->normal_prio = __normal_prio(p); + set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has @@ -2869,11 +1741,6 @@ void sched_fork(struct task_struct *p) p->sched_reset_on_fork = 0; } - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; @@ -3070,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); + trace_sched_stat_sleeptime(current, rq->clock); fire_sched_in_preempt_notifiers(current); if (mm) @@ -3305,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) */ static atomic_long_t calc_load_tasks_idle; -static void calc_load_account_idle(struct rq *this_rq) +void calc_load_account_idle(struct rq *this_rq) { long delta; @@ -3449,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks) */ } #else -static void calc_load_account_idle(struct rq *this_rq) +void calc_load_account_idle(struct rq *this_rq) { } @@ -3592,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -static void update_cpu_load(struct rq *this_rq) +void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; unsigned long curr_jiffies = jiffies; @@ -3670,8 +2538,10 @@ unlock: #endif DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* * Return any ns on the sched_clock that have not yet been accounted in @@ -3724,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CGROUP_CPUACCT +struct cgroup_subsys cpuacct_subsys; +struct cpuacct root_cpuacct; +#endif + +static inline void task_group_account_field(struct task_struct *p, int index, + u64 tmp) +{ +#ifdef CONFIG_CGROUP_CPUACCT + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; +#endif + /* + * Since all updates are sure to touch the root cgroup, we + * get ourselves ahead and touch it first. If the root cgroup + * is the only cgroup, then nothing else should be necessary. + * + */ + __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + +#ifdef CONFIG_CGROUP_CPUACCT + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(p); + while (ca && (ca != &root_cpuacct)) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += tmp; + ca = parent_ca(ca); + } + rcu_read_unlock(); +#endif +} + + /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -3733,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p) void account_user_time(struct task_struct *p, cputime_t cputime, cputime_t cputime_scaled) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; + int index; /* Add user time to process. */ - p->utime = cputime_add(p->utime, cputime); - p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + p->utime += cputime; + p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); + index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + /* Add user time to cpustat. */ - tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) - cpustat->nice = cputime64_add(cpustat->nice, tmp); - else - cpustat->user = cputime64_add(cpustat->user, tmp); + task_group_account_field(p, index, (__force u64) cputime); - cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ acct_update_integrals(p); } @@ -3762,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime, static void account_guest_time(struct task_struct *p, cputime_t cputime, cputime_t cputime_scaled) { - cputime64_t tmp; - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - - tmp = cputime_to_cputime64(cputime); + u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ - p->utime = cputime_add(p->utime, cputime); - p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + p->utime += cputime; + p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); - p->gtime = cputime_add(p->gtime, cputime); + p->gtime += cputime; /* Add guest time to cpustat. */ if (TASK_NICE(p) > 0) { - cpustat->nice = cputime64_add(cpustat->nice, tmp); - cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + cpustat[CPUTIME_NICE] += (__force u64) cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; } else { - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + cpustat[CPUTIME_USER] += (__force u64) cputime; + cpustat[CPUTIME_GUEST] += (__force u64) cputime; } } @@ -3792,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, */ static inline void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, cputime64_t *target_cputime64) + cputime_t cputime_scaled, int index) { - cputime64_t tmp = cputime_to_cputime64(cputime); - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + p->stime += cputime; + p->stimescaled += cputime_scaled; account_group_system_time(p, cputime); /* Add system time to cpustat. */ - *target_cputime64 = cputime64_add(*target_cputime64, tmp); - cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + task_group_account_field(p, index, (__force u64) cputime); /* Account for system time used */ acct_update_integrals(p); @@ -3819,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t *target_cputime64; + int index; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); @@ -3828,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } if (hardirq_count() - hardirq_offset) - target_cputime64 = &cpustat->irq; + index = CPUTIME_IRQ; else if (in_serving_softirq()) - target_cputime64 = &cpustat->softirq; + index = CPUTIME_SOFTIRQ; else - target_cputime64 = &cpustat->system; + index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, cputime_scaled, target_cputime64); + __account_system_time(p, cputime, cputime_scaled, index); } /* @@ -3843,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, */ void account_steal_time(cputime_t cputime) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); + u64 *cpustat = kcpustat_this_cpu->cpustat; - cpustat->steal = cputime64_add(cpustat->steal, cputime64); + cpustat[CPUTIME_STEAL] += (__force u64) cputime; } /* @@ -3855,14 +2749,13 @@ void account_steal_time(cputime_t cputime) */ void account_idle_time(cputime_t cputime) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); + u64 *cpustat = kcpustat_this_cpu->cpustat; struct rq *rq = this_rq(); if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; else - cpustat->idle = cputime64_add(cpustat->idle, cputime64); + cpustat[CPUTIME_IDLE] += (__force u64) cputime; } static __always_inline bool steal_account_process_tick(void) @@ -3912,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) { cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + u64 *cpustat = kcpustat_this_cpu->cpustat; if (steal_account_process_tick()) return; if (irqtime_account_hi_update()) { - cpustat->irq = cputime64_add(cpustat->irq, tmp); + cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; } else if (irqtime_account_si_update()) { - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; } else if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. @@ -3929,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * Also, p->stime needs to be updated for ksoftirqd. */ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - &cpustat->softirq); + CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); } else if (p == rq->idle) { @@ -3938,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); } else { __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - &cpustat->system); + CPUTIME_SYSTEM); } } @@ -4037,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); + cputime_t rtime, utime = p->utime, total = utime + p->stime; /* * Use CFS's precise accounting: @@ -4045,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - u64 temp = rtime; + u64 temp = (__force u64) rtime; - temp *= utime; - do_div(temp, total); - utime = (cputime_t)temp; + temp *= (__force u64) utime; + do_div(temp, (__force u32) total); + utime = (__force cputime_t) temp; } else utime = rtime; @@ -4057,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) * Compare with previous values, to keep monotonicity: */ p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); + p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); *ut = p->prev_utime; *st = p->prev_stime; @@ -4074,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) thread_group_cputime(p, &cputime); - total = cputime_add(cputime.utime, cputime.stime); + total = cputime.utime + cputime.stime; rtime = nsecs_to_cputime(cputime.sum_exec_runtime); if (total) { - u64 temp = rtime; + u64 temp = (__force u64) rtime; - temp *= cputime.utime; - do_div(temp, total); - utime = (cputime_t)temp; + temp *= (__force u64) cputime.utime; + do_div(temp, (__force u32) total); + utime = (__force cputime_t) temp; } else utime = rtime; sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, - cputime_sub(rtime, sig->prev_utime)); + sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); *ut = sig->prev_utime; *st = sig->prev_stime; @@ -4116,7 +3007,7 @@ void scheduler_tick(void) perf_event_task_tick(); #ifdef CONFIG_SMP - rq->idle_at_tick = idle_cpu(cpu); + rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif } @@ -4187,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); + if (oops_in_progress) + return; + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", prev->comm, prev->pid, preempt_count()); @@ -4213,6 +3107,7 @@ static inline void schedule_debug(struct task_struct *prev) */ if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) __schedule_bug(prev); + rcu_sleep_check(); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -4239,7 +3134,7 @@ pick_next_task(struct rq *rq) * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; @@ -4676,6 +3571,9 @@ EXPORT_SYMBOL(wait_for_completion); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not * interruptible. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) @@ -4690,6 +3588,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits for completion of a specific task to be signaled. It is * interruptible. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_interruptible(struct completion *x) { @@ -4707,6 +3607,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_interruptible_timeout(struct completion *x, @@ -4722,6 +3625,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); * * This waits to be signaled for completion of a specific task. It can be * interrupted by a kill signal. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_killable(struct completion *x) { @@ -4740,6 +3645,9 @@ EXPORT_SYMBOL(wait_for_completion_killable); * This waits for either a completion of a specific task to be * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_killable_timeout(struct completion *x, @@ -5025,7 +3933,20 @@ EXPORT_SYMBOL(task_nice); */ int idle_cpu(int cpu) { - return cpu_curr(cpu) == cpu_rq(cpu)->idle; + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; } /** @@ -5691,6 +4612,13 @@ again: */ if (preempt && rq != p_rq) resched_task(p_rq->curr); + } else { + /* + * We might have set it in task_yield_fair(), but are + * not going to schedule(), so don't want to skip + * the next update. + */ + rq->skip_clock_update = 0; } out: @@ -5858,7 +4786,7 @@ void sched_show_task(struct task_struct *p) free = stack_not_used(p); #endif printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent), + task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); @@ -5875,7 +4803,7 @@ void show_state_filter(unsigned long state_filter) printk(KERN_INFO " task PC stack pid father\n"); #endif - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -5891,7 +4819,7 @@ void show_state_filter(unsigned long state_filter) #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: */ @@ -5952,62 +4880,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); -} - -/* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_BITS_NONE. - */ -cpumask_var_t nohz_cpu_mask; - -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static int get_update_sysctl_factor(void) -{ - unsigned int cpus = min_t(int, num_online_cpus(), 8); - unsigned int factor; - - switch (sysctl_sched_tunable_scaling) { - case SCHED_TUNABLESCALING_NONE: - factor = 1; - break; - case SCHED_TUNABLESCALING_LINEAR: - factor = cpus; - break; - case SCHED_TUNABLESCALING_LOG: - default: - factor = 1 + ilog2(cpus); - break; - } - - return factor; -} - -static void update_sysctl(void) -{ - unsigned int factor = get_update_sysctl_factor(); - -#define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); -#undef SET_SYSCTL -} - -static inline void sched_init_granularity(void) -{ - update_sysctl(); +#if defined(CONFIG_SMP) + sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif } #ifdef CONFIG_SMP @@ -6015,10 +4890,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); - } + + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -6116,7 +4990,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; /* @@ -6222,6 +5096,9 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); + for ( ; ; ) { /* * There's this thread running, bail when that's the only @@ -6299,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - mode_t mode, proc_handler *proc_handler) + umode_t mode, proc_handler *proc_handler) { entry->procname = procname; entry->data = data; @@ -6799,6 +5676,12 @@ out: return -ENOMEM; } +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +struct root_domain def_root_domain; + static void init_defrootdomain(void) { init_rootdomain(&def_root_domain); @@ -6870,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) } /* + * Keep a special pointer to the highest sched_domain that has + * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this + * allows us to avoid some pointer chasing select_idle_sibling(). + * + * Also keep a unique ID per domain (we use the first cpu number in + * the cpumask of the domain), this allows us to quickly tell if + * two cpus are in the same cache domain, see ttwu_share_cache(). + */ +DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_id); + +static void update_top_cache_domain(int cpu) +{ + struct sched_domain *sd; + int id = cpu; + + sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_id, cpu) = id; +} + +/* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ @@ -6908,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); destroy_sched_domains(tmp, cpu); + + update_top_cache_domain(cpu); } /* cpus with isolated domains */ @@ -6923,8 +5833,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -#define SD_NODES_PER_DOMAIN 16 - #ifdef CONFIG_NUMA /** @@ -7069,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(i)); + GFP_KERNEL, cpu_to_node(cpu)); if (!sg) goto fail; @@ -7207,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) return; update_group_power(sd, cpu); + atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; } /* @@ -7761,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *page) +static ssize_t sched_mc_power_savings_show(struct device *dev, + struct device_attribute *attr, + char *buf) { - return sprintf(page, "%u\n", sched_mc_power_savings); + return sprintf(buf, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, - struct sysdev_class_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); +static DEVICE_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, - struct sysdev_class_attribute *attr, - char *page) +static ssize_t sched_smt_power_savings_show(struct device *dev, + struct device_attribute *attr, + char *buf) { - return sprintf(page, "%u\n", sched_smt_power_savings); + return sprintf(buf, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, - struct sysdev_class_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, +static DEVICE_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, sched_smt_power_savings_store); #endif -int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct device *dev) { int err = 0; #ifdef CONFIG_SCHED_SMT if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); + err = device_create_file(dev, &dev_attr_sched_smt_power_savings); #endif #ifdef CONFIG_SCHED_MC if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); + err = device_create_file(dev, &dev_attr_sched_mc_power_savings); #endif return err; } @@ -7844,29 +6756,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, } } -static int update_runtime(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -7915,103 +6804,11 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif -} - -static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, - struct sched_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - cfs_rq->tg = tg; - cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; +#ifdef CONFIG_CGROUP_SCHED +struct task_group root_task_group; #endif - tg->cfs_rq[cpu] = cfs_rq; - tg->se[cpu] = se; - - /* se could be NULL for root_task_group */ - if (!se) - return; - - if (!parent) - se->cfs_rq = &rq->cfs; - else - se->cfs_rq = parent->my_q; - - se->my_q = cfs_rq; - update_load_set(&se->load, 0); - se->parent = parent; -} -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, - struct sched_rt_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; - rt_rq->tg = tg; - - tg->rt_rq[cpu] = rt_rq; - tg->rt_se[cpu] = rt_se; - - if (!rt_se) - return; - - if (!parent) - rt_se->rt_rq = &rq->rt; - else - rt_se->rt_rq = parent->my_q; - - rt_se->my_q = rt_rq; - rt_se->parent = parent; - INIT_LIST_HEAD(&rt_se->run_list); -} -#endif +DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); void __init sched_init(void) { @@ -8069,9 +6866,17 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); + INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); + #endif /* CONFIG_CGROUP_SCHED */ +#ifdef CONFIG_CGROUP_CPUACCT + root_cpuacct.cpustat = &kernel_cpustat; + root_cpuacct.cpuusage = alloc_percpu(u64); + /* Too early, not expected to fail */ + BUG_ON(!root_cpuacct.cpuusage); +#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8083,7 +6888,7 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = root_task_group_load; + root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); /* * How much cpu bandwidth does root_task_group get? @@ -8104,6 +6909,7 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8132,8 +6938,7 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ - rq->nohz_balance_kick = 0; - init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); + rq->nohz_flags = 0; #endif #endif init_rq_hrtick(rq); @@ -8146,10 +6951,6 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif -#ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#endif - #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters); #endif @@ -8175,21 +6976,13 @@ void __init sched_init(void) */ current->sched_class = &fair_sched_class; - /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); -#ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); - atomic_set(&nohz.load_balancer, nr_cpu_ids); - atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); - atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); -#endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); -#endif /* SMP */ +#endif + init_sched_fair_class(); scheduler_running = 1; } @@ -8206,6 +6999,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || system_state != SYSTEM_RUNNING || oops_in_progress) return; @@ -8340,165 +7134,10 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void free_fair_sched_group(struct task_group *tg) -{ - int i; - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - - kfree(tg->cfs_rq); - kfree(tg->se); -} - -static -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se; - int i; - - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->se) - goto err; - - tg->shares = NICE_0_LOAD; - - for_each_possible_cpu(i) { - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!cfs_rq) - goto err; - - se = kzalloc_node(sizeof(struct sched_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!se) - goto err_free_rq; - - init_cfs_rq(cfs_rq); - init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); - } - - return 1; - -err_free_rq: - kfree(cfs_rq); -err: - return 0; -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; - - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} -#else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline void free_fair_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static void free_rt_sched_group(struct task_group *tg) -{ - int i; - - if (tg->rt_se) - destroy_rt_bandwidth(&tg->rt_bandwidth); - - for_each_possible_cpu(i) { - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); - } - - kfree(tg->rt_rq); - kfree(tg->rt_se); -} - -static -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; - int i; - - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_se) - goto err; - - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); - - for_each_possible_cpu(i) { - rt_rq = kzalloc_node(sizeof(struct rt_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_rq) - goto err; - - rt_se = kzalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_se) - goto err_free_rq; - - init_rt_rq(rt_rq, cpu_rq(i)); - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); - } - - return 1; - -err_free_rq: - kfree(rt_rq); -err: - return 0; -} -#else /* !CONFIG_RT_GROUP_SCHED */ -static inline void free_rt_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_CGROUP_SCHED +/* task_group_lock serializes the addition/removal of task groups */ +static DEFINE_SPINLOCK(task_group_lock); + static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -8603,47 +7242,13 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED -static DEFINE_MUTEX(shares_mutex); - -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) +static unsigned long to_ratio(u64 period, u64 runtime) { - int i; - unsigned long flags; - - /* - * We can't change the weight of the root cgroup. - */ - if (!tg->se[0]) - return -EINVAL; - - shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - - mutex_lock(&shares_mutex); - if (tg->shares == shares) - goto done; - - tg->shares = shares; - for_each_possible_cpu(i) { - struct rq *rq = cpu_rq(i); - struct sched_entity *se; - - se = tg->se[i]; - /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - -done: - mutex_unlock(&shares_mutex); - return 0; -} + if (runtime == RUNTIME_INF) + return 1ULL << 20; -unsigned long sched_group_shares(struct task_group *tg) -{ - return tg->shares; + return div64_u64(runtime << 20, period); } #endif @@ -8653,21 +7258,13 @@ unsigned long sched_group_shares(struct task_group *tg) */ static DEFINE_MUTEX(rt_constraints_mutex); -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} - /* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + if (rt_task(p) && task_rq(p)->rt.tg == tg) return 1; } while_each_thread(g, p); @@ -8680,7 +7277,7 @@ struct rt_schedulable_data { u64 rt_runtime; }; -static int tg_schedulable(struct task_group *tg, void *data) +static int tg_rt_schedulable(struct task_group *tg, void *data) { struct rt_schedulable_data *d = data; struct task_group *child; @@ -8738,16 +7335,22 @@ static int tg_schedulable(struct task_group *tg, void *data) static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { + int ret; + struct rt_schedulable_data data = { .tg = tg, .rt_period = period, .rt_runtime = runtime, }; - return walk_tg_tree(tg_schedulable, tg_nop, &data); + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; } -static int tg_set_bandwidth(struct task_group *tg, +static int tg_set_rt_bandwidth(struct task_group *tg, u64 rt_period, u64 rt_runtime) { int i, err = 0; @@ -8786,7 +7389,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; - return tg_set_bandwidth(tg, rt_period, rt_runtime); + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } long sched_group_rt_runtime(struct task_group *tg) @@ -8811,7 +7414,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) if (rt_period == 0) return -EINVAL; - return tg_set_bandwidth(tg, rt_period, rt_runtime); + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } long sched_group_rt_period(struct task_group *tg) @@ -8953,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) sched_destroy_group(tg); } -static int -cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) - return -EINVAL; + if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + return -EINVAL; #else - /* We don't support RT-tasks being in separate groups */ - if (tsk->sched_class != &fair_sched_class) - return -EINVAL; + /* We don't support RT-tasks being in separate groups */ + if (task->sched_class != &fair_sched_class) + return -EINVAL; #endif + } return 0; } -static void -cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - sched_move_task(tsk); + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) + sched_move_task(task); } static void @@ -9001,6 +7611,237 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) return (u64) scale_load_down(tg->shares); } + +#ifdef CONFIG_CFS_BANDWIDTH +static DEFINE_MUTEX(cfs_constraints_mutex); + +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); + +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +{ + int i, ret = 0, runtime_enabled, runtime_was_enabled; + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + if (tg == &root_task_group) + return -EINVAL; + + /* + * Ensure we have at some amount of bandwidth every period. This is + * to prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota < min_cfs_quota_period || period < min_cfs_quota_period) + return -EINVAL; + + /* + * Likewise, bound things on the otherside by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period > max_cfs_quota_period) + return -EINVAL; + + mutex_lock(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); + if (ret) + goto out_unlock; + + runtime_enabled = quota != RUNTIME_INF; + runtime_was_enabled = cfs_b->quota != RUNTIME_INF; + account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + raw_spin_lock_irq(&cfs_b->lock); + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + + __refill_cfs_bandwidth_runtime(cfs_b); + /* restart the period timer (if active) to handle new period expiry */ + if (runtime_enabled && cfs_b->timer_active) { + /* force a reprogram */ + cfs_b->timer_active = 0; + __start_cfs_bandwidth(cfs_b); + } + raw_spin_unlock_irq(&cfs_b->lock); + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct rq *rq = cfs_rq->rq; + + raw_spin_lock_irq(&rq->lock); + cfs_rq->runtime_enabled = runtime_enabled; + cfs_rq->runtime_remaining = 0; + + if (cfs_rq->throttled) + unthrottle_cfs_rq(cfs_rq); + raw_spin_unlock_irq(&rq->lock); + } +out_unlock: + mutex_unlock(&cfs_constraints_mutex); + + return ret; +} + +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +{ + u64 quota, period; + + period = ktime_to_ns(tg->cfs_bandwidth.period); + if (cfs_quota_us < 0) + quota = RUNTIME_INF; + else + quota = (u64)cfs_quota_us * NSEC_PER_USEC; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_quota(struct task_group *tg) +{ + u64 quota_us; + + if (tg->cfs_bandwidth.quota == RUNTIME_INF) + return -1; + + quota_us = tg->cfs_bandwidth.quota; + do_div(quota_us, NSEC_PER_USEC); + + return quota_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 quota, period; + + period = (u64)cfs_period_us * NSEC_PER_USEC; + quota = tg->cfs_bandwidth.quota; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + do_div(cfs_period_us, NSEC_PER_USEC); + + return cfs_period_us; +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_quota(cgroup_tg(cgrp)); +} + +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_quota_us) +{ + return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +struct cfs_schedulable_data { + struct task_group *tg; + u64 period, quota; +}; + +/* + * normalize group quota/period to be quota/max_period + * note: units are usecs + */ +static u64 normalize_cfs_quota(struct task_group *tg, + struct cfs_schedulable_data *d) +{ + u64 quota, period; + + if (tg == d->tg) { + period = d->period; + quota = d->quota; + } else { + period = tg_get_cfs_period(tg); + quota = tg_get_cfs_quota(tg); + } + + /* note: these should typically be equivalent */ + if (quota == RUNTIME_INF || quota == -1) + return RUNTIME_INF; + + return to_ratio(period, quota); +} + +static int tg_cfs_schedulable_down(struct task_group *tg, void *data) +{ + struct cfs_schedulable_data *d = data; + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + s64 quota = 0, parent_quota = -1; + + if (!tg->parent) { + quota = RUNTIME_INF; + } else { + struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; + + quota = normalize_cfs_quota(tg, d); + parent_quota = parent_b->hierarchal_quota; + + /* + * ensure max(child_quota) <= parent_quota, inherit when no + * limit is set + */ + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } + cfs_b->hierarchal_quota = quota; + + return 0; +} + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) +{ + int ret; + struct cfs_schedulable_data data = { + .tg = tg, + .period = period, + .quota = quota, + }; + + if (quota != RUNTIME_INF) { + do_div(data.period, NSEC_PER_USEC); + do_div(data.quota, NSEC_PER_USEC); + } + + rcu_read_lock(); + ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); + rcu_read_unlock(); + + return ret; +} + +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + cb->fill(cb, "nr_periods", cfs_b->nr_periods); + cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); + cb->fill(cb, "throttled_time", cfs_b->throttled_time); + + return 0; +} +#endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9035,6 +7876,22 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_shares_write_u64, }, #endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "cfs_quota_us", + .read_s64 = cpu_cfs_quota_read_s64, + .write_s64 = cpu_cfs_quota_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, + { + .name = "stat", + .read_map = cpu_stats_show, + }, +#endif #ifdef CONFIG_RT_GROUP_SCHED { .name = "rt_runtime_us", @@ -9058,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, .destroy = cpu_cgroup_destroy, - .can_attach_task = cpu_cgroup_can_attach_task, - .attach_task = cpu_cgroup_attach_task, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, @@ -9077,38 +7934,16 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; - struct cpuacct *parent; -}; - -struct cgroup_subsys cpuacct_subsys; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create( struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - int i; + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) goto out; @@ -9116,18 +7951,13 @@ static struct cgroup_subsys_state *cpuacct_create( if (!ca->cpuusage) goto out_free_ca; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - if (percpu_counter_init(&ca->cpustat[i], 0)) - goto out_free_counters; - - if (cgrp->parent) - ca->parent = cgroup_ca(cgrp->parent); + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; return &ca->css; -out_free_counters: - while (--i >= 0) - percpu_counter_destroy(&ca->cpustat[i]); +out_free_cpuusage: free_percpu(ca->cpuusage); out_free_ca: kfree(ca); @@ -9140,10 +7970,8 @@ static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); - int i; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpustat); free_percpu(ca->cpuusage); kfree(ca); } @@ -9236,16 +8064,31 @@ static const char *cpuacct_stat_desc[] = { }; static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) + struct cgroup_map_cb *cb) { struct cpuacct *ca = cgroup_ca(cgrp); - int i; + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { - s64 val = percpu_counter_read(&ca->cpustat[i]); - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[i], val); + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + return 0; } @@ -9275,7 +8118,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) * * called with rq->lock held. */ -static void cpuacct_charge(struct task_struct *tsk, u64 cputime) +void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int cpu; @@ -9289,7 +8132,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); - for (; ca; ca = ca->parent) { + for (; ca; ca = parent_ca(ca)) { u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } @@ -9297,45 +8140,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } -/* - * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large - * in cputime_t units. As a result, cpuacct_update_stats calls - * percpu_counter_add with values large enough to always overflow the - * per cpu batch limit causing bad SMP scalability. - * - * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we - * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled - * and enabled. We cap it at INT_MAX which is the largest allowed batch value. - */ -#ifdef CONFIG_SMP -#define CPUACCT_BATCH \ - min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) -#else -#define CPUACCT_BATCH 0 -#endif - -/* - * Charge the system/user time to the task's accounting group. - */ -static void cpuacct_update_stats(struct task_struct *tsk, - enum cpuacct_stat_index idx, cputime_t val) -{ - struct cpuacct *ca; - int batch = CPUACCT_BATCH; - - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(tsk); - - do { - __percpu_counter_add(&ca->cpustat[idx], val, batch); - ca = ca->parent; - } while (ca); - rcu_read_unlock(); -} - struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .create = cpuacct_create, @@ -9344,4 +8148,3 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ - diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c index 2722dc1b4138..b0d798eaf130 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched/cpupri.c @@ -1,5 +1,5 @@ /* - * kernel/sched_cpupri.c + * kernel/sched/cpupri.c * * CPU priority management * @@ -28,7 +28,7 @@ */ #include <linux/gfp.h> -#include "sched_cpupri.h" +#include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ static int convert_prio(int prio) @@ -47,9 +47,6 @@ static int convert_prio(int prio) return cpupri; } -#define for_each_cpupri_active(array, idx) \ - for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) - /** * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context @@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, int idx = 0; int task_pri = convert_prio(p->prio); - for_each_cpupri_active(cp->pri_active, idx) { - struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + if (task_pri >= MAX_RT_PRIO) + return 0; - if (idx >= task_pri) - break; + for (idx = 0; idx < task_pri; idx++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + int skip = 0; + + if (!atomic_read(&(vec)->count)) + skip = 1; + /* + * When looking at the vector, we need to read the counter, + * do a memory barrier, then read the mask. + * + * Note: This is still all racey, but we can deal with it. + * Ideally, we only want to look at masks that are set. + * + * If a mask is not set, then the only thing wrong is that we + * did a little more work than necessary. + * + * If we read a zero count but the mask is set, because of the + * memory barriers, that can only happen when the highest prio + * task for a run queue has left the run queue, in which case, + * it will be followed by a pull. If the task we are processing + * fails to find a proper place to go, that pull request will + * pull this task if the run queue is running at a lower + * priority. + */ + smp_rmb(); + + /* Need to do the rmb for every iteration */ + if (skip) + continue; if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; @@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) { int *currpri = &cp->cpu_to_pri[cpu]; int oldpri = *currpri; - unsigned long flags; + int do_mb = 0; newpri = convert_prio(newpri); @@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * If the cpu was currently mapped to a different value, we * need to map it to the new value then remove the old value. * Note, we must add the new value first, otherwise we risk the - * cpu being cleared from pri_active, and this cpu could be - * missed for a push or pull. + * cpu being missed by the priority loop in cpupri_find. */ if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - raw_spin_lock_irqsave(&vec->lock, flags); - cpumask_set_cpu(cpu, vec->mask); - vec->count++; - if (vec->count == 1) - set_bit(newpri, cp->pri_active); - - raw_spin_unlock_irqrestore(&vec->lock, flags); + /* + * When adding a new vector, we update the mask first, + * do a write memory barrier, and then update the count, to + * make sure the vector is visible when count is set. + */ + smp_mb__before_atomic_inc(); + atomic_inc(&(vec)->count); + do_mb = 1; } if (likely(oldpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - raw_spin_lock_irqsave(&vec->lock, flags); - - vec->count--; - if (!vec->count) - clear_bit(oldpri, cp->pri_active); + /* + * Because the order of modification of the vec->count + * is important, we must make sure that the update + * of the new prio is seen before we decrement the + * old prio. This makes sure that the loop sees + * one or the other when we raise the priority of + * the run queue. We don't care about when we lower the + * priority, as that will trigger an rt pull anyway. + * + * We only need to do a memory barrier if we updated + * the new priority vec. + */ + if (do_mb) + smp_mb__after_atomic_inc(); + + /* + * When removing from the vector, we decrement the counter first + * do a memory barrier and then clear the mask. + */ + atomic_dec(&(vec)->count); + smp_mb__after_atomic_inc(); cpumask_clear_cpu(cpu, vec->mask); - - raw_spin_unlock_irqrestore(&vec->lock, flags); } *currpri = newpri; @@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - raw_spin_lock_init(&vec->lock); - vec->count = 0; + atomic_set(&vec->count, 0); if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h index 9fc7d386fea4..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched/cpupri.h @@ -4,7 +4,6 @@ #include <linux/sched.h> #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) -#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) #define CPUPRI_INVALID -1 #define CPUPRI_IDLE 0 @@ -12,14 +11,12 @@ /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - raw_spinlock_t lock; - int count; - cpumask_var_t mask; + atomic_t count; + cpumask_var_t mask; }; struct cpupri { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - long pri_active[CPUPRI_NR_PRI_WORDS]; int cpu_to_pri[NR_CPUS]; }; diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c index a6710a112b4f..2a075e10004b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched/debug.c @@ -1,5 +1,5 @@ /* - * kernel/time/sched_debug.c + * kernel/sched/debug.c * * Print the CFS rbtree * @@ -16,6 +16,8 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> +#include "sched.h" + static DEFINE_SPINLOCK(sched_debug_lock); /* @@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v) return 0; } -static void sysrq_sched_debug_show(void) +void sysrq_sched_debug_show(void) { sched_debug_show(NULL, NULL); } diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c index bc8ee9993814..84adb2d66cbd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,13 @@ #include <linux/latencytop.h> #include <linux/sched.h> #include <linux/cpumask.h> +#include <linux/slab.h> +#include <linux/profile.h> +#include <linux/interrupt.h> + +#include <trace/events/sched.h> + +#include "sched.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -89,7 +96,124 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; */ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; -static const struct sched_class fair_sched_class; +#ifdef CONFIG_CFS_BANDWIDTH +/* + * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool + * each time a cfs_rq requests quota. + * + * Note: in the case that the slice exceeds the runtime remaining (either due + * to consumption or the quota being specified to be smaller than the slice) + * we will always only issue the remaining available time. + * + * default: 5 msec, units: microseconds + */ +unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static int get_update_sysctl_factor(void) +{ + unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } + + return factor; +} + +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); + +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); +#undef SET_SYSCTL +} + +void sched_init_granularity(void) +{ + update_sysctl(); +} + +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 + +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +/* + * delta *= weight / lw + */ +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) +{ + u64 tmp; + + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. + */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; + + if (!lw->inv_weight) { + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; + } + + /* + * Check whether we'd overflow the 64-bit multiplication: + */ + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); +} + + +const struct sched_class fair_sched_class; /************************************************************** * CFS operations on generic schedulable entities: @@ -292,6 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -397,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -418,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) } #ifdef CONFIG_SCHED_DEBUG -static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -583,6 +709,8 @@ static void update_curr(struct cfs_rq *cfs_rq) cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } + + account_cfs_rq_runtime(cfs_rq, delta_exec); } static inline void @@ -666,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) - inc_cpu_load(rq_of(cfs_rq), se->load.weight); + update_load_add(&rq_of(cfs_rq)->load, se->load.weight); if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); list_add(&se->group_node, &cfs_rq->tasks); @@ -679,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) - dec_cpu_load(rq_of(cfs_rq), se->load.weight); + update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); list_del_init(&se->group_node); @@ -688,6 +816,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED +/* we need this in update_cfs_load and load-balance functions below */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) @@ -710,7 +840,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) u64 now, delta; unsigned long load = cfs_rq->load.weight; - if (cfs_rq->tg == &root_task_group) + if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) return; now = rq_of(cfs_rq)->clock_task; @@ -752,19 +882,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +{ + long tg_weight; + + /* + * Use this CPU's actual weight instead of the last load_contribution + * to gain a more accurate current total weight. See + * update_cfs_rq_load_contribution(). + */ + tg_weight = atomic_read(&tg->load_weight); + tg_weight -= cfs_rq->load_contribution; + tg_weight += cfs_rq->load.weight; + + return tg_weight; +} + static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long load_weight, load, shares; + long tg_weight, load, shares; + tg_weight = calc_tg_weight(tg, cfs_rq); load = cfs_rq->load.weight; - load_weight = atomic_read(&tg->load_weight); - load_weight += load; - load_weight -= cfs_rq->load_contribution; - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; + if (tg_weight) + shares /= tg_weight; if (shares < MIN_SHARES) shares = MIN_SHARES; @@ -819,7 +962,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) tg = cfs_rq->tg; se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se) + if (!se || throttled_hierarchy(cfs_rq)) return; #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) @@ -860,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.sleep_max)) se->statistics.sleep_max = delta; - se->statistics.sleep_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -877,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.block_max)) se->statistics.block_max = delta; - se->statistics.block_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -887,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) trace_sched_stat_iowait(tsk, delta); } + trace_sched_stat_blocked(tsk, delta); + /* * Blocking time is in units of nanosecs, so shift by * 20 to get a milliseconds-range estimation of the @@ -950,6 +1093,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } +static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -979,8 +1124,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_running == 1) { list_add_leaf_cfs_rq(cfs_rq); + check_enqueue_throttle(cfs_rq); + } } static void __clear_buddies_last(struct sched_entity *se) @@ -1028,6 +1175,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -1066,6 +1215,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); } @@ -1077,6 +1229,8 @@ static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; + s64 delta; ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; @@ -1095,22 +1249,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * narrow margin doesn't have to wait for a full slice. * This also mitigates buddy induced latencies under load. */ - if (!sched_feat(WAKEUP_PREEMPT)) - return; - if (delta_exec < sysctl_sched_min_granularity) return; - if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - s64 delta = curr->vruntime - se->vruntime; + se = __pick_first_entity(cfs_rq); + delta = curr->vruntime - se->vruntime; - if (delta < 0) - return; + if (delta < 0) + return; - if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); - } + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } static void @@ -1185,6 +1334,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); + static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -1194,6 +1345,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_curr(cfs_rq); + /* throttle cfs_rqs exceeding runtime */ + check_cfs_rq_runtime(cfs_rq); + check_spread(cfs_rq, prev); if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); @@ -1233,10 +1387,742 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) return; #endif - if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) + if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); } + +/************************************************** + * CFS bandwidth control machinery + */ + +#ifdef CONFIG_CFS_BANDWIDTH + +#ifdef HAVE_JUMP_LABEL +static struct jump_label_key __cfs_bandwidth_used; + +static inline bool cfs_bandwidth_used(void) +{ + return static_branch(&__cfs_bandwidth_used); +} + +void account_cfs_bandwidth_used(int enabled, int was_enabled) +{ + /* only need to count groups transitioning between enabled/!enabled */ + if (enabled && !was_enabled) + jump_label_inc(&__cfs_bandwidth_used); + else if (!enabled && was_enabled) + jump_label_dec(&__cfs_bandwidth_used); +} +#else /* HAVE_JUMP_LABEL */ +static bool cfs_bandwidth_used(void) +{ + return true; +} + +void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +#endif /* HAVE_JUMP_LABEL */ + +/* + * default period for cfs group bandwidth. + * default: 0.1s, units: nanoseconds + */ +static inline u64 default_cfs_period(void) +{ + return 100000000ULL; +} + +static inline u64 sched_cfs_bandwidth_slice(void) +{ + return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; +} + +/* + * Replenish runtime according to assigned quota and update expiration time. + * We use sched_clock_cpu directly instead of rq->clock to avoid adding + * additional synchronization around rq->lock. + * + * requires cfs_b->lock + */ +void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) +{ + u64 now; + + if (cfs_b->quota == RUNTIME_INF) + return; + + now = sched_clock_cpu(smp_processor_id()); + cfs_b->runtime = cfs_b->quota; + cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +} + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return &tg->cfs_bandwidth; +} + +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + u64 amount = 0, min_amount, expires; + + /* note: this is a positive sum as runtime_remaining <= 0 */ + min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota == RUNTIME_INF) + amount = min_amount; + else { + /* + * If the bandwidth pool has become inactive, then at least one + * period must have elapsed since the last consumption. + * Refresh the global state and ensure bandwidth timer becomes + * active. + */ + if (!cfs_b->timer_active) { + __refill_cfs_bandwidth_runtime(cfs_b); + __start_cfs_bandwidth(cfs_b); + } + + if (cfs_b->runtime > 0) { + amount = min(cfs_b->runtime, min_amount); + cfs_b->runtime -= amount; + cfs_b->idle = 0; + } + } + expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + cfs_rq->runtime_remaining += amount; + /* + * we may have advanced our local expiration to account for allowed + * spread between our sched_clock and the one on which runtime was + * issued. + */ + if ((s64)(expires - cfs_rq->runtime_expires) > 0) + cfs_rq->runtime_expires = expires; + + return cfs_rq->runtime_remaining > 0; +} + +/* + * Note: This depends on the synchronization provided by sched_clock and the + * fact that rq->clock snapshots this value. + */ +static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct rq *rq = rq_of(cfs_rq); + + /* if the deadline is ahead of our clock, nothing to do */ + if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + return; + + if (cfs_rq->runtime_remaining < 0) + return; + + /* + * If the local deadline has passed we have to consider the + * possibility that our sched_clock is 'fast' and the global deadline + * has not truly expired. + * + * Fortunately we can check determine whether this the case by checking + * whether the global deadline has advanced. + */ + + if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + /* extend local deadline, drift is bounded above by 2 ticks */ + cfs_rq->runtime_expires += TICK_NSEC; + } else { + /* global deadline is ahead, expiration has passed */ + cfs_rq->runtime_remaining = 0; + } +} + +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) +{ + /* dock delta_exec before expiring quota (as it could span periods) */ + cfs_rq->runtime_remaining -= delta_exec; + expire_cfs_rq_runtime(cfs_rq); + + if (likely(cfs_rq->runtime_remaining > 0)) + return; + + /* + * if we're unable to extend our runtime we resched so that the active + * hierarchy can be throttled + */ + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) + resched_task(rq_of(cfs_rq)->curr); +} + +static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) +{ + if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + return; + + __account_cfs_rq_runtime(cfs_rq, delta_exec); +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->throttled; +} + +/* check whether cfs_rq, or any parent, is throttled */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->throttle_count; +} + +/* + * Ensure that neither of the group entities corresponding to src_cpu or + * dest_cpu are members of a throttled hierarchy when performing group + * load-balance operations. + */ +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + + src_cfs_rq = tg->cfs_rq[src_cpu]; + dest_cfs_rq = tg->cfs_rq[dest_cpu]; + + return throttled_hierarchy(src_cfs_rq) || + throttled_hierarchy(dest_cfs_rq); +} + +/* updated child weight may affect parent so we have to do this bottom up */ +static int tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; +#ifdef CONFIG_SMP + if (!cfs_rq->throttle_count) { + u64 delta = rq->clock_task - cfs_rq->load_stamp; + + /* leaving throttled state, advance shares averaging windows */ + cfs_rq->load_stamp += delta; + cfs_rq->load_last += delta; + + /* update entity weight now that we are on_rq again */ + update_cfs_shares(cfs_rq); + } +#endif + + return 0; +} + +static int tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + /* group is entering throttled state, record last load */ + if (!cfs_rq->throttle_count) + update_cfs_load(cfs_rq, 0); + cfs_rq->throttle_count++; + + return 0; +} + +static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, dequeue = 1; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* account load preceding throttle */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + break; + + if (dequeue) + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + qcfs_rq->h_nr_running -= task_delta; + + if (qcfs_rq->load.weight) + dequeue = 0; + } + + if (!se) + rq->nr_running -= task_delta; + + cfs_rq->throttled = 1; + cfs_rq->throttled_timestamp = rq->clock; + raw_spin_lock(&cfs_b->lock); + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + raw_spin_unlock(&cfs_b->lock); +} + +void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + long task_delta; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + cfs_rq->throttled = 0; + raw_spin_lock(&cfs_b->lock); + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + list_del_rcu(&cfs_rq->throttled_list); + raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled_timestamp = 0; + + update_rq_clock(rq); + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + if (!se) + rq->nr_running += task_delta; + + /* determine whether we need to wake up potentially idle cpu */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_task(rq->curr); +} + +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, + u64 remaining, u64 expires) +{ + struct cfs_rq *cfs_rq; + u64 runtime = remaining; + + rcu_read_lock(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, + throttled_list) { + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock(&rq->lock); + if (!cfs_rq_throttled(cfs_rq)) + goto next; + + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > remaining) + runtime = remaining; + remaining -= runtime; + + cfs_rq->runtime_remaining += runtime; + cfs_rq->runtime_expires = expires; + + /* we check whether we're throttled above */ + if (cfs_rq->runtime_remaining > 0) + unthrottle_cfs_rq(cfs_rq); + +next: + raw_spin_unlock(&rq->lock); + + if (!remaining) + break; + } + rcu_read_unlock(); + + return remaining; +} + +/* + * Responsible for refilling a task_group's bandwidth and unthrottling its + * cfs_rqs as appropriate. If there has been no activity within the last + * period the timer is deactivated until scheduling resumes; cfs_b->idle is + * used to track this state. + */ +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +{ + u64 runtime, runtime_expires; + int idle = 1, throttled; + + raw_spin_lock(&cfs_b->lock); + /* no need to continue the timer with no bandwidth constraint */ + if (cfs_b->quota == RUNTIME_INF) + goto out_unlock; + + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + /* idle depends on !throttled (for the case of a large deficit) */ + idle = cfs_b->idle && !throttled; + cfs_b->nr_periods += overrun; + + /* if we're going inactive then everything else can be deferred */ + if (idle) + goto out_unlock; + + __refill_cfs_bandwidth_runtime(cfs_b); + + if (!throttled) { + /* mark as potentially idle for the upcoming period */ + cfs_b->idle = 1; + goto out_unlock; + } + + /* account preceding periods in which throttling occurred */ + cfs_b->nr_throttled += overrun; + + /* + * There are throttled entities so we must first use the new bandwidth + * to unthrottle them before making it generally available. This + * ensures that all existing debts will be paid before a new cfs_rq is + * allowed to run. + */ + runtime = cfs_b->runtime; + runtime_expires = cfs_b->runtime_expires; + cfs_b->runtime = 0; + + /* + * This check is repeated as we are holding onto the new bandwidth + * while we unthrottle. This can potentially race with an unthrottled + * group trying to acquire new bandwidth from the global pool. + */ + while (throttled && runtime > 0) { + raw_spin_unlock(&cfs_b->lock); + /* we can't nest cfs_b->lock while distributing bandwidth */ + runtime = distribute_cfs_runtime(cfs_b, runtime, + runtime_expires); + raw_spin_lock(&cfs_b->lock); + + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + } + + /* return (any) remaining runtime */ + cfs_b->runtime = runtime; + /* + * While we are ensured activity in the period following an + * unthrottle, this also covers the case in which the new bandwidth is + * insufficient to cover the existing bandwidth deficit. (Forcing the + * timer to remain active while there are any throttled entities.) + */ + cfs_b->idle = 0; +out_unlock: + if (idle) + cfs_b->timer_active = 0; + raw_spin_unlock(&cfs_b->lock); + + return idle; +} + +/* a cfs_rq won't donate quota below this amount */ +static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; +/* minimum remaining period time to redistribute slack quota */ +static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; +/* how long we wait to gather additional slack before distributing */ +static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; + +/* are we near the end of the current quota period? */ +static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) +{ + struct hrtimer *refresh_timer = &cfs_b->period_timer; + u64 remaining; + + /* if the call-back is running a quota refresh is already occurring */ + if (hrtimer_callback_running(refresh_timer)) + return 1; + + /* is a quota refresh about to occur? */ + remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); + if (remaining < min_expire) + return 1; + + return 0; +} + +static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) +{ + u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; + + /* if there's a quota refresh soon don't bother with slack */ + if (runtime_refresh_within(cfs_b, min_left)) + return; + + start_bandwidth_timer(&cfs_b->slack_timer, + ns_to_ktime(cfs_bandwidth_slack_period)); +} + +/* we know any runtime found here is valid as update_curr() precedes return */ +static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; + + if (slack_runtime <= 0) + return; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF && + cfs_rq->runtime_expires == cfs_b->runtime_expires) { + cfs_b->runtime += slack_runtime; + + /* we are under rq->lock, defer unthrottling using a timer */ + if (cfs_b->runtime > sched_cfs_bandwidth_slice() && + !list_empty(&cfs_b->throttled_cfs_rq)) + start_cfs_slack_bandwidth(cfs_b); + } + raw_spin_unlock(&cfs_b->lock); + + /* even if it's not valid for return we don't want to try again */ + cfs_rq->runtime_remaining -= slack_runtime; +} + +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + if (!cfs_bandwidth_used()) + return; + + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + return; + + __return_cfs_rq_runtime(cfs_rq); +} + +/* + * This is done with a timer (instead of inline with bandwidth return) since + * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. + */ +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) +{ + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + u64 expires; + + /* confirm we're still not at a refresh boundary */ + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + return; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + runtime = cfs_b->runtime; + cfs_b->runtime = 0; + } + expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + if (!runtime) + return; + + runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + + raw_spin_lock(&cfs_b->lock); + if (expires == cfs_b->runtime_expires) + cfs_b->runtime = runtime; + raw_spin_unlock(&cfs_b->lock); +} + +/* + * When a group wakes up we want to make sure that its quota is not already + * expired/exceeded, otherwise it may be allowed to steal additional ticks of + * runtime as update_curr() throttling can not not trigger until it's on-rq. + */ +static void check_enqueue_throttle(struct cfs_rq *cfs_rq) +{ + if (!cfs_bandwidth_used()) + return; + + /* an active group must be handled by the update_curr()->put() path */ + if (!cfs_rq->runtime_enabled || cfs_rq->curr) + return; + + /* ensure the group is not already throttled */ + if (cfs_rq_throttled(cfs_rq)) + return; + + /* update runtime allocation */ + account_cfs_rq_runtime(cfs_rq, 0); + if (cfs_rq->runtime_remaining <= 0) + throttle_cfs_rq(cfs_rq); +} + +/* conditionally throttle active cfs_rq's from put_prev_entity() */ +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + if (!cfs_bandwidth_used()) + return; + + if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + return; + + /* + * it's possible for a throttled entity to be forced into a running + * state (e.g. set_curr_task), in this case we're finished. + */ + if (cfs_rq_throttled(cfs_rq)) + return; + + throttle_cfs_rq(cfs_rq); +} + +static inline u64 default_cfs_period(void); +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); + +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); + + return HRTIMER_NORESTART; +} + +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, cfs_b->period); + + if (!overrun) + break; + + idle = do_sched_cfs_period_timer(cfs_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + raw_spin_lock_init(&cfs_b->lock); + cfs_b->runtime = 0; + cfs_b->quota = RUNTIME_INF; + cfs_b->period = ns_to_ktime(default_cfs_period()); + + INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->slack_timer.function = sched_cfs_slack_timer; +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); +} + +/* requires cfs_b->lock, may release to reprogram timer */ +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + /* + * The timer may be active because we're trying to set a new bandwidth + * period or because we're racing with the tear-down path + * (timer_active==0 becomes visible before the hrtimer call-back + * terminates). In either case we ensure that it's re-programmed + */ + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + raw_spin_unlock(&cfs_b->lock); + /* ensure cfs_b->lock is available while we wait */ + hrtimer_cancel(&cfs_b->period_timer); + + raw_spin_lock(&cfs_b->lock); + /* if someone else restarted the timer then we're done */ + if (cfs_b->timer_active) + return; + } + + cfs_b->timer_active = 1; + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); +} + +void unthrottle_offline_cfs_rqs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + + if (!cfs_rq->runtime_enabled) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = cfs_b->quota; + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + } +} + +#else /* CONFIG_CFS_BANDWIDTH */ +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + return 0; +} + +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +#endif + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return NULL; +} +static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +void unthrottle_offline_cfs_rqs(struct rq *rq) {} + +#endif /* CONFIG_CFS_BANDWIDTH */ + /************************************************** * CFS operations on tasks: */ @@ -1249,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { + if (cfs_rq->nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -1280,7 +2166,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (curr->sched_class != &fair_sched_class) + if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -1313,16 +2199,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); + + /* + * end evaluation on encountering a throttled cfs_rq + * + * note: in the case of encountering a throttled cfs_rq we will + * post the final h_nr_running increment below. + */ + if (cfs_rq_throttled(cfs_rq)) + break; + cfs_rq->h_nr_running++; + flags = ENQUEUE_WAKEUP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_running++; + + if (cfs_rq_throttled(cfs_rq)) + break; update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } + if (!se) + inc_nr_running(rq); hrtick_update(rq); } @@ -1343,6 +2246,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* + * end evaluation on encountering a throttled cfs_rq + * + * note: in the case of encountering a throttled cfs_rq we will + * post the final h_nr_running decrement below. + */ + if (cfs_rq_throttled(cfs_rq)) + break; + cfs_rq->h_nr_running--; + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* @@ -1361,15 +2274,76 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_running--; + + if (cfs_rq_throttled(cfs_rq)) + break; update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } + if (!se) + dec_nr_running(rq); hrtick_update(rq); } #ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +static unsigned long power_of(int cpu) +{ + return cpu_rq(cpu)->cpu_power; +} + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + + if (nr_running) + return rq->load.weight / nr_running; + + return 0; +} + static void task_waking_fair(struct task_struct *p) { @@ -1399,42 +2373,105 @@ static void task_waking_fair(struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. + * + * Calculate the effective load difference if @wl is added (subtracted) to @tg + * on this @cpu and results in a total addition (subtraction) of @wg to the + * total group weight. + * + * Given a runqueue weight distribution (rw_i) we can compute a shares + * distribution (s_i) using: + * + * s_i = rw_i / \Sum rw_j (1) + * + * Suppose we have 4 CPUs and our @tg is a direct child of the root group and + * has 7 equal weight tasks, distributed as below (rw_i), with the resulting + * shares distribution (s_i): + * + * rw_i = { 2, 4, 1, 0 } + * s_i = { 2/7, 4/7, 1/7, 0 } + * + * As per wake_affine() we're interested in the load of two CPUs (the CPU the + * task used to run on and the CPU the waker is running on), we need to + * compute the effect of waking a task on either CPU and, in case of a sync + * wakeup, compute the effect of the current task going to sleep. + * + * So for a change of @wl to the local @cpu with an overall group weight change + * of @wl we can compute the new shares distribution (s'_i) using: + * + * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) + * + * Suppose we're interested in CPUs 0 and 1, and want to compute the load + * differences in waking a task to CPU 0. The additional task changes the + * weight and shares distributions like: + * + * rw'_i = { 3, 4, 1, 0 } + * s'_i = { 3/8, 4/8, 1/8, 0 } + * + * We can then compute the difference in effective weight by using: + * + * dw_i = S * (s'_i - s_i) (3) + * + * Where 'S' is the group weight as seen by its parent. + * + * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) + * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - + * 4/7) times the weight of the group. */ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - if (!tg->parent) + if (!tg->parent) /* the trivial, non-cgroup case */ return wl; for_each_sched_entity(se) { - long lw, w; + long w, W; tg = se->my_q->tg; - w = se->my_q->load.weight; - /* use this cpu's instantaneous contribution */ - lw = atomic_read(&tg->load_weight); - lw -= se->my_q->load_contribution; - lw += w + wg; + /* + * W = @wg + \Sum rw_j + */ + W = wg + calc_tg_weight(tg, se->my_q); - wl += w; + /* + * w = rw_i + @wl + */ + w = se->my_q->load.weight + wl; - if (lw > 0 && wl < lw) - wl = (wl * tg->shares) / lw; + /* + * wl = S * s'_i; see (2) + */ + if (W > 0 && w < W) + wl = (w * tg->shares) / W; else wl = tg->shares; - /* zero point is MIN_SHARES */ + /* + * Per the above, wl is the new se->load.weight value; since + * those are clipped to [MIN_SHARES, ...) do so now. See + * calc_cfs_shares(). + */ if (wl < MIN_SHARES) wl = MIN_SHARES; + + /* + * wl = dw_i = S * (s'_i - s_i); see (3) + */ wl -= se->load.weight; + + /* + * Recursively apply this logic to all parent groups to compute + * the final effective load change on the root group. Since + * only the @tg group gets extra weight, all parent groups can + * only redistribute existing shares. @wl is the shift in shares + * resulting from this level per the above. + */ wg = 0; } return wl; } - #else static inline unsigned long effective_load(struct task_group *tg, int cpu, @@ -1547,7 +2584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) + tsk_cpus_allowed(p))) continue; local_group = cpumask_test_cpu(this_cpu, @@ -1593,7 +2630,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int i; /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1613,6 +2650,7 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; + struct sched_group *sg; int i; /* @@ -1633,25 +2671,28 @@ static int select_idle_sibling(struct task_struct *p, int target) * Otherwise, iterate the domains and find an elegible idle cpu. */ rcu_read_lock(); - for_each_domain(target, sd) { - if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) - break; - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (idle_cpu(i)) { - target = i; - break; + sd = rcu_dereference(per_cpu(sd_llc, target)); + for_each_lower_domain(sd) { + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; + + for_each_cpu(i, sched_group_cpus(sg)) { + if (!idle_cpu(i)) + goto next; } - } - /* - * Lets stop looking for an idle sibling when we reached - * the domain that spans the current cpu and prev_cpu. - */ - if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && - cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) - break; + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; +next: + sg = sg->next; + } while (sg != sd->groups); } +done: rcu_read_unlock(); return target; @@ -1679,8 +2720,11 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; + if (p->rt.nr_cpus_allowed == 1) + return prev_cpu; + if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) want_affine = 1; new_cpu = prev_cpu; } @@ -1875,6 +2919,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; + /* + * This is possible from callers such as pull_task(), in which we + * unconditionally check_prempt_curr() after an enqueue (which may have + * lead to a throttle). This both saves work and prevents false + * next-buddy nomination below. + */ + if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + return; + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { set_next_buddy(pse); next_buddy_marked = 1; @@ -1883,6 +2936,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. + * + * Note: this also catches the edge-case of curr being in a throttled + * group (e.g. via set_curr_task), since update_curr() (in the + * enqueue of curr) will have resulted in resched being set. This + * prevents us from potentially nominating it as a false LAST_BUDDY + * below. */ if (test_tsk_need_resched(curr)) return; @@ -1899,10 +2958,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(p->policy != SCHED_NORMAL)) return; - - if (!sched_feat(WAKEUP_PREEMPT)) - return; - find_matching_se(&se, &pse); update_curr(cfs_rq_of(se)); BUG_ON(!pse); @@ -1952,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) } while (cfs_rq); p = task_of(se); - hrtick_start_fair(rq, p); + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); return p; } @@ -1996,6 +3052,12 @@ static void yield_task_fair(struct rq *rq) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq->skip_clock_update = 1; } set_skip_buddy(se); @@ -2005,7 +3067,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp { struct sched_entity *se = &p->se; - if (!se->on_rq) + /* throttled hierarchies are not runnable */ + if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; /* Tell the scheduler that we'd really like pse to run next. */ @@ -2035,12 +3098,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, } /* + * Is this task likely cache-hot: + */ +static int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +{ + s64 delta; + + if (p->sched_class != &fair_sched_class) + return 0; + + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + + /* + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; + + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { int tsk_cache_hot = 0; /* @@ -2049,11 +3150,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - *all_pinned = 0; + *lb_flags &= ~LBF_ALL_PINNED; if (task_running(rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); @@ -2102,6 +3203,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, for_each_leaf_cfs_rq(busiest, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), + busiest->cpu, this_cpu)) + break; if (!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) @@ -2124,7 +3228,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, + enum cpu_idle_type idle, int *lb_flags, struct cfs_rq *busiest_cfs_rq) { int loops = 0, pulled = 0; @@ -2135,12 +3239,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, goto out; list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) + if (loops++ > sysctl_sched_nr_migrate) { + *lb_flags |= LBF_NEED_BREAK; break; + } if ((p->se.load.weight >> 1) > rem_load_move || !can_migrate_task(p, busiest, this_cpu, sd, idle, - all_pinned)) + lb_flags)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2153,8 +3259,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE) + if (idle == CPU_NEWLY_IDLE) { + *lb_flags |= LBF_ABORT; break; + } #endif /* @@ -2217,8 +3325,13 @@ static void update_shares(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) + for_each_leaf_cfs_rq(rq, cfs_rq) { + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + update_shares_cpu(cfs_rq->tg, cpu); + } rcu_read_unlock(); } @@ -2254,7 +3367,7 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { long rem_load_move = max_load_move; struct cfs_rq *busiest_cfs_rq; @@ -2267,17 +3380,21 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; + if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + break; + /* - * empty group + * empty group or part of a throttled hierarchy */ - if (!busiest_cfs_rq->task_weight) + if (!busiest_cfs_rq->task_weight || + throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, all_pinned, + rem_load, sd, idle, lb_flags, busiest_cfs_rq); if (!moved_load) @@ -2303,10 +3420,10 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, + max_load_move, sd, idle, lb_flags, &busiest->cfs); } #endif @@ -2321,29 +3438,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { unsigned long total_load_moved = 0, load_moved; do { load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, - sd, idle, all_pinned); + sd, idle, lb_flags); total_load_moved += load_moved; + if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + break; + #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) - break; - - if (raw_spin_is_contended(&this_rq->lock) || - raw_spin_is_contended(&busiest->lock)) + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { + *lb_flags |= LBF_ABORT; break; + } #endif } while (load_moved && max_load_move > total_load_moved); @@ -2405,15 +3523,6 @@ struct sg_lb_stats { }; /** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_cpus(group)); -} - -/** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. * @idle: The Idle status of the CPU for whose sd load_icx is obtained. @@ -2662,7 +3771,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) sdg->sgp->power = power; } -static void update_group_power(struct sched_domain *sd, int cpu) +void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; @@ -2854,7 +3963,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, } /** - * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu @@ -2928,11 +4037,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, } while (sg != sd->groups); } -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} - /** * check_asym_packing - Check to see if the group is packed into the * sched doman. @@ -3296,7 +4400,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) @@ -3347,7 +4451,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0; + int ld_moved, lb_flags = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -3388,11 +4492,11 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - all_pinned = 1; + lb_flags |= LBF_ALL_PINNED; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &all_pinned); + imbalance, sd, idle, &lb_flags); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -3402,8 +4506,18 @@ redo: if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); + if (lb_flags & LBF_ABORT) + goto out_balanced; + + if (lb_flags & LBF_NEED_BREAK) { + lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (lb_flags & LBF_ABORT) + goto out_balanced; + goto redo; + } + /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) { + if (unlikely(lb_flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -3430,10 +4544,10 @@ redo: * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, - &busiest->curr->cpus_allowed)) { + tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - all_pinned = 1; + lb_flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -3486,7 +4600,8 @@ out_balanced: out_one_pinned: /* tune up the balancing interval */ - if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + if (((lb_flags & LBF_ALL_PINNED) && + sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -3499,7 +4614,7 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static void idle_balance(int this_cpu, struct rq *this_rq) +void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; @@ -3612,46 +4727,18 @@ out_unlock: } #ifdef CONFIG_NO_HZ - -static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); - -static void trigger_sched_softirq(void *data) -{ - raise_softirq_irqoff(SCHED_SOFTIRQ); -} - -static inline void init_sched_softirq_csd(struct call_single_data *csd) -{ - csd->func = trigger_sched_softirq; - csd->info = NULL; - csd->flags = 0; - csd->priv = 0; -} - /* * idle load balancing details - * - One of the idle CPUs nominates itself as idle load_balancer, while - * entering idle. - * - This idle load balancer CPU will also go into tickless mode when - * it is idle, just like all other idle CPUs * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ static struct { - atomic_t load_balancer; - atomic_t first_pick_cpu; - atomic_t second_pick_cpu; cpumask_var_t idle_cpus_mask; - cpumask_var_t grp_idle_mask; + atomic_t nr_cpus; unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -int get_nohz_load_balancer(void) -{ - return atomic_read(&nohz.load_balancer); -} - #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) /** * lowest_flag_domain - Return lowest sched_domain containing flag. @@ -3667,7 +4754,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) struct sched_domain *sd; for_each_domain(cpu, sd) - if (sd && (sd->flags & flag)) + if (sd->flags & flag) break; return sd; @@ -3688,33 +4775,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) (sd && (sd->flags & flag)); sd = sd->parent) /** - * is_semi_idle_group - Checks if the given sched_group is semi-idle. - * @ilb_group: group to be checked for semi-idleness - * - * Returns: 1 if the group is semi-idle. 0 otherwise. - * - * We define a sched_group to be semi idle if it has atleast one idle-CPU - * and atleast one non-idle CPU. This helper function checks if the given - * sched_group is semi-idle or not. - */ -static inline int is_semi_idle_group(struct sched_group *ilb_group) -{ - cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, - sched_group_cpus(ilb_group)); - - /* - * A sched_group is semi-idle when it has atleast one busy cpu - * and atleast one idle cpu. - */ - if (cpumask_empty(nohz.grp_idle_mask)) - return 0; - - if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) - return 0; - - return 1; -} -/** * find_new_ilb - Finds the optimum idle load balancer for nomination. * @cpu: The cpu which is nominating a new idle_load_balancer. * @@ -3728,9 +4788,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group) */ static int find_new_ilb(int cpu) { + int ilb = cpumask_first(nohz.idle_cpus_mask); + struct sched_group *ilbg; struct sched_domain *sd; - struct sched_group *ilb_group; - int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -3748,23 +4808,28 @@ static int find_new_ilb(int cpu) rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { - ilb_group = sd->groups; + ilbg = sd->groups; do { - if (is_semi_idle_group(ilb_group)) { - ilb = cpumask_first(nohz.grp_idle_mask); + if (ilbg->group_weight != + atomic_read(&ilbg->sgp->nr_busy_cpus)) { + ilb = cpumask_first_and(nohz.idle_cpus_mask, + sched_group_cpus(ilbg)); goto unlock; } - ilb_group = ilb_group->next; + ilbg = ilbg->next; - } while (ilb_group != sd->groups); + } while (ilbg != sd->groups); } unlock: rcu_read_unlock(); out_done: - return ilb; + if (ilb < nr_cpu_ids && idle_cpu(ilb)) + return ilb; + + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -3784,94 +4849,68 @@ static void nohz_balancer_kick(int cpu) nohz.next_balance++; - ilb_cpu = get_nohz_load_balancer(); - - if (ilb_cpu >= nr_cpu_ids) { - ilb_cpu = cpumask_first(nohz.idle_cpus_mask); - if (ilb_cpu >= nr_cpu_ids) - return; - } + ilb_cpu = find_new_ilb(cpu); - if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { - struct call_single_data *cp; + if (ilb_cpu >= nr_cpu_ids) + return; - cpu_rq(ilb_cpu)->nohz_balance_kick = 1; - cp = &per_cpu(remote_sched_softirq_cb, cpu); - __smp_call_function_single(ilb_cpu, cp, 0); - } + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) + return; + /* + * Use smp_send_reschedule() instead of resched_cpu(). + * This way we generate a sched IPI on the target cpu which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. + */ + smp_send_reschedule(ilb_cpu); return; } -/* - * This routine will try to nominate the ilb (idle load balancing) - * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. - * - * When the ilb owner becomes busy, we will not have new ilb owner until some - * idle CPU wakes up and goes back to idle or some busy CPU tries to kick - * idle load balancing by kicking one of the idle CPUs. - * - * Ticks are stopped for the ilb owner as well, with busy CPU kicking this - * ilb owner CPU in future (when there is a need for idle load balancing on - * behalf of all idle CPUs). - */ -void select_nohz_load_balancer(int stop_tick) +static inline void set_cpu_sd_state_busy(void) { + struct sched_domain *sd; int cpu = smp_processor_id(); - if (stop_tick) { - if (!cpu_active(cpu)) { - if (atomic_read(&nohz.load_balancer) != cpu) - return; - - /* - * If we are going offline and still the leader, - * give up! - */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, - nr_cpu_ids) != cpu) - BUG(); + if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) + return; + clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - return; - } + rcu_read_lock(); + for_each_domain(cpu, sd) + atomic_inc(&sd->groups->sgp->nr_busy_cpus); + rcu_read_unlock(); +} - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); +void set_cpu_sd_state_idle(void) +{ + struct sched_domain *sd; + int cpu = smp_processor_id(); - if (atomic_read(&nohz.first_pick_cpu) == cpu) - atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.second_pick_cpu) == cpu) - atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) + return; + set_bit(NOHZ_IDLE, nohz_flags(cpu)); - if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { - int new_ilb; + rcu_read_lock(); + for_each_domain(cpu, sd) + atomic_dec(&sd->groups->sgp->nr_busy_cpus); + rcu_read_unlock(); +} - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, - cpu) != nr_cpu_ids) - return; +/* + * This routine will record that this cpu is going idle with tick stopped. + * This info will be used in performing idle load balancing in the future. + */ +void select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); - /* - * Check to see if there is a more power-efficient - * ilb. - */ - new_ilb = find_new_ilb(cpu); - if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, nr_cpu_ids); - resched_cpu(new_ilb); - return; - } - return; - } - } else { - if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + if (stop_tick) { + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - - if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, - nr_cpu_ids) != cpu) - BUG(); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } return; } @@ -3885,7 +4924,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ -static void update_max_interval(void) +void update_max_interval(void) { max_load_balance_interval = HZ*num_online_cpus()/10; } @@ -3977,11 +5016,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) struct rq *rq; int balance_cpu; - if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) - return; + if (idle != CPU_IDLE || + !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) + goto end; for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu) + if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; /* @@ -3989,10 +5029,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) * work being done for other cpus. Next load * balancing owner will pick it up. */ - if (need_resched()) { - this_rq->nohz_balance_kick = 0; + if (need_resched()) break; - } raw_spin_lock_irq(&this_rq->lock); update_rq_clock(this_rq); @@ -4006,53 +5044,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) this_rq->next_balance = rq->next_balance; } nohz.next_balance = this_rq->next_balance; - this_rq->nohz_balance_kick = 0; +end: + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } /* - * Current heuristic for kicking the idle load balancer - * - first_pick_cpu is the one of the busy CPUs. It will kick - * idle load balancer when it has more than one process active. This - * eliminates the need for idle load balancing altogether when we have - * only one running process in the system (common case). - * - If there are more than one busy CPU, idle load balancer may have - * to run for active_load_balance to happen (i.e., two busy CPUs are - * SMT or core siblings and can run better if they move to different - * physical CPUs). So, second_pick_cpu is the second of the busy CPUs - * which will kick idle load balancer as soon as it has any load. + * Current heuristic for kicking the idle load balancer in the presence + * of an idle cpu is the system. + * - This rq has more than one task. + * - At any scheduler domain level, this cpu's scheduler group has multiple + * busy cpu's exceeding the group's power. + * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler + * domain span are idle. */ static inline int nohz_kick_needed(struct rq *rq, int cpu) { unsigned long now = jiffies; - int ret; - int first_pick_cpu, second_pick_cpu; + struct sched_domain *sd; - if (time_before(now, nohz.next_balance)) + if (unlikely(idle_cpu(cpu))) return 0; - if (rq->idle_at_tick) - return 0; + /* + * We may be recently in ticked or tickless idle mode. At the first + * busy tick after returning from idle, we will update the busy stats. + */ + set_cpu_sd_state_busy(); + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } - first_pick_cpu = atomic_read(&nohz.first_pick_cpu); - second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. + */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return 0; - if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && - second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + if (time_before(now, nohz.next_balance)) return 0; - ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); - if (ret == nr_cpu_ids || ret == cpu) { - atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (rq->nr_running > 1) - return 1; - } else { - ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); - if (ret == nr_cpu_ids || ret == cpu) { - if (rq->nr_running) - return 1; - } + if (rq->nr_running >= 2) + goto need_kick; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + struct sched_group *sg = sd->groups; + struct sched_group_power *sgp = sg->sgp; + int nr_busy = atomic_read(&sgp->nr_busy_cpus); + + if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) + goto need_kick_unlock; + + if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight + && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + + if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) + break; } + rcu_read_unlock(); return 0; + +need_kick_unlock: + rcu_read_unlock(); +need_kick: + return 1; } #else static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } @@ -4066,7 +5126,7 @@ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_at_tick ? + enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; rebalance_domains(this_cpu, idle); @@ -4087,14 +5147,14 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -static inline void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq, int cpu) { /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ - else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif } @@ -4109,15 +5169,6 @@ static void rq_offline_fair(struct rq *rq) update_sysctl(); } -#else /* CONFIG_SMP */ - -/* - * on UP we do not need to balance between CPUs: - */ -static inline void idle_balance(int cpu, struct rq *rq) -{ -} - #endif /* CONFIG_SMP */ /* @@ -4141,8 +5192,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq = task_cfs_rq(current); - struct sched_entity *se = &p->se, *curr = cfs_rq->curr; + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se, *curr; int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); unsigned long flags; @@ -4151,6 +5202,9 @@ static void task_fork_fair(struct task_struct *p) update_rq_clock(rq); + cfs_rq = task_cfs_rq(current); + curr = cfs_rq->curr; + if (unlikely(task_cpu(p) != this_cpu)) { rcu_read_lock(); __set_task_cpu(p, this_cpu); @@ -4251,8 +5305,23 @@ static void set_curr_task_fair(struct rq *rq) { struct sched_entity *se = &rq->curr->se; - for_each_sched_entity(se) - set_next_entity(cfs_rq_of(se), se); + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + set_next_entity(cfs_rq, se); + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } +} + +void init_cfs_rq(struct cfs_rq *cfs_rq) +{ + cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -4271,13 +5340,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * to another cgroup's rq. This does somewhat interfere with the * fair sleeper stuff for the first placement, but who cares. */ + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: + * + * - Moving a forked child which is waiting for being woken up by + * wake_up_new_task(). + * - Moving a task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). + * + * To prevent boost or penalty in the new cfs_rq caused by delta + * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. + */ + if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + on_rq = 1; + if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; set_task_rq(p, task_cpu(p)); if (!on_rq) p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } + +void free_fair_sched_group(struct task_group *tg) +{ + int i; + + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); +} + +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->se) + goto err; + + tg->shares = NICE_0_LOAD; + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(i) { + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!se) + goto err_free_rq; + + init_cfs_rq(cfs_rq); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + } + + return 1; + +err_free_rq: + kfree(cfs_rq); +err: + return 0; +} + +void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; #endif + init_cfs_rq_runtime(cfs_rq); + + tg->cfs_rq[cpu] = cfs_rq; + tg->se[cpu] = se; + + /* se could be NULL for root_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + + se->my_q = cfs_rq; + update_load_set(&se->load, 0); + se->parent = parent; +} + +static DEFINE_MUTEX(shares_mutex); + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + unsigned long flags; + + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); + + mutex_lock(&shares_mutex); + if (tg->shares == shares) + goto done; + + tg->shares = shares; + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se)); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +done: + mutex_unlock(&shares_mutex); + return 0; +} +#else /* CONFIG_FAIR_GROUP_SCHED */ + +void free_fair_sched_group(struct task_group *tg) { } + +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +void unregister_fair_sched_group(struct task_group *tg, int cpu) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { @@ -4297,7 +5535,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -static const struct sched_class fair_sched_class = { +const struct sched_class fair_sched_class = { .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, @@ -4334,7 +5572,7 @@ static const struct sched_class fair_sched_class = { }; #ifdef CONFIG_SCHED_DEBUG -static void print_cfs_stats(struct seq_file *m, int cpu) +void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; @@ -4344,3 +5582,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif + +__init void init_sched_fair_class(void) +{ +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + +#ifdef CONFIG_NO_HZ + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); +#endif +#endif /* SMP */ + +} diff --git a/kernel/sched_features.h b/kernel/sched/features.h index 2e74677cb040..e61fd73913d0 100644 --- a/kernel/sched_features.h +++ b/kernel/sched/features.h @@ -3,18 +3,13 @@ * them to run sooner, but does not allow tons of sleepers to * rip the spread apart. */ -SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) /* * Place new tasks ahead so that they do not starve already running * tasks */ -SCHED_FEAT(START_DEBIT, 1) - -/* - * Should wakeups try to preempt running tasks. - */ -SCHED_FEAT(WAKEUP_PREEMPT, 1) +SCHED_FEAT(START_DEBIT, true) /* * Based on load and program behaviour, see if it makes sense to place @@ -22,53 +17,54 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1) * improve cache locality. Typically used with SYNC wakeups as * generated by pipes and the like, see also SYNC_WAKEUPS. */ -SCHED_FEAT(AFFINE_WAKEUPS, 1) +SCHED_FEAT(AFFINE_WAKEUPS, true) /* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, 0) +SCHED_FEAT(NEXT_BUDDY, false) /* * Prefer to schedule the task that ran last (when we did * wake-preempt) as that likely will touch the same data, increases * cache locality. */ -SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(LAST_BUDDY, true) /* * Consider buddies to be cache hot, decreases the likelyness of a * cache buddy being migrated away, increases cache locality. */ -SCHED_FEAT(CACHE_HOT_BUDDY, 1) +SCHED_FEAT(CACHE_HOT_BUDDY, true) /* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, 0) +SCHED_FEAT(ARCH_POWER, false) -SCHED_FEAT(HRTICK, 0) -SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(LB_BIAS, 1) +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) +SCHED_FEAT(LB_BIAS, true) /* * Spin-wait on mutex acquisition when the mutex owner is running on * another cpu -- assumes that when the owner is running, it will soon * release the lock. Decreases scheduling overhead. */ -SCHED_FEAT(OWNER_SPIN, 1) +SCHED_FEAT(OWNER_SPIN, true) /* * Decrement CPU power based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, 1) +SCHED_FEAT(NONTASK_POWER, true) /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ -SCHED_FEAT(TTWU_QUEUE, 1) +SCHED_FEAT(TTWU_QUEUE, true) -SCHED_FEAT(FORCE_SD_OVERLAP, 0) +SCHED_FEAT(FORCE_SD_OVERLAP, false) +SCHED_FEAT(RT_RUNTIME_SHARE, true) diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c index 0a51882534ea..91b4c957f289 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched/idle_task.c @@ -1,3 +1,5 @@ +#include "sched.h" + /* * idle-task scheduling class. * @@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task /* * Simple, special scheduling class for the per-CPU idle tasks: */ -static const struct sched_class idle_sched_class = { +const struct sched_class idle_sched_class = { /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c index af1177858be3..3640ebbb466b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched/rt.c @@ -3,7 +3,92 @@ * policies) */ +#include "sched.h" + +#include <linux/slab.h> + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +struct rt_bandwidth def_rt_bandwidth; + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + raw_spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + raw_spin_lock(&rt_b->rt_runtime_lock); + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + raw_spin_unlock(&rt_b->rt_runtime_lock); +} + +void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.next = MAX_RT_PRIO; + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); +} + #ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) @@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return rt_se->rt_rq; } +void free_rt_sched_group(struct task_group *tg) +{ + int i; + + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; + rt_rq->tg = tg; + + tg->rt_rq[cpu] = rt_rq; + tg->rt_se[cpu] = rt_se; + + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + + rt_se->my_q = rt_rq; + rt_se->parent = parent; + INIT_LIST_HEAD(&rt_se->run_list); +} + +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_rq) + goto err; + + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_se) + goto err_free_rq; + + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); + } + + return 1; + +err_free_rq: + kfree(rt_rq); +err: + return 0; +} + #else /* CONFIG_RT_GROUP_SCHED */ #define rt_entity_is_task(rt_se) (1) @@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return &rq->rt; } +void free_rt_sched_group(struct task_group *tg) { } + +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP @@ -124,21 +300,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rt_rq); } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); plist_node_init(&p->pushable_tasks, p->prio); plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); + + /* Update the highest prio pushable task */ + if (p->prio < rq->rt.highest_prio.next) + rq->rt.highest_prio.next = p->prio; } static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); -} -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); + /* Update the new highest prio pushable task */ + if (has_pushable_tasks(rq)) { + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + rq->rt.highest_prio.next = p->prio; + } else + rq->rt.highest_prio.next = MAX_RT_PRIO; } #else @@ -544,10 +732,35 @@ static void enable_runtime(struct rq *rq) raw_spin_unlock_irqrestore(&rq->lock, flags); } +int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; + if (!sched_feat(RT_RUNTIME_SHARE)) + return more; + if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); @@ -633,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); - if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) + if (runtime >= sched_rt_period(rt_rq)) return 0; balance_runtime(rt_rq); @@ -643,6 +856,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (rt_rq->rt_time > runtime) { rt_rq->rt_throttled = 1; + printk_once(KERN_WARNING "sched: RT throttling activated\n"); if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -698,47 +912,13 @@ static void update_curr_rt(struct rq *rq) #if defined CONFIG_SMP -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); - -static inline int next_prio(struct rq *rq) -{ - struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); - - if (next && rt_prio(next->prio)) - return next->prio; - else - return MAX_RT_PRIO; -} - static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - if (prio < prev_prio) { - - /* - * If the new task is higher in priority than anything on the - * run-queue, we know that the previous high becomes our - * next-highest. - */ - rt_rq->highest_prio.next = prev_prio; - - if (rq->online) - cpupri_set(&rq->rd->cpupri, rq->cpu, prio); - - } else if (prio == rt_rq->highest_prio.curr) - /* - * If the next task is equal in priority to the highest on - * the run-queue, then we implicitly know that the next highest - * task cannot be any lower than current - */ - rt_rq->highest_prio.next = prio; - else if (prio < rt_rq->highest_prio.next) - /* - * Otherwise, we need to recompute next-highest - */ - rt_rq->highest_prio.next = next_prio(rq); + if (rq->online && prio < prev_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } static void @@ -746,9 +926,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) - rt_rq->highest_prio.next = next_prio(rq); - if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } @@ -961,6 +1138,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + inc_nr_running(rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -971,11 +1150,13 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); + + dec_nr_running(rq); } /* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. + * Put task to the head or the end of the run list without the overhead of + * dequeue followed by enqueue. */ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) @@ -1017,10 +1198,15 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) struct rq *rq; int cpu; - if (sd_flag != SD_BALANCE_WAKE) - return smp_processor_id(); - cpu = task_cpu(p); + + if (p->rt.nr_cpus_allowed == 1) + goto out; + + /* For anything but wake ups, just return the task_cpu */ + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + rq = cpu_rq(cpu); rcu_read_lock(); @@ -1059,6 +1245,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) } rcu_read_unlock(); +out: return cpu; } @@ -1178,7 +1365,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); - p->se.exec_start = 0; /* * The previous task needs to be made eligible for pushing @@ -1193,12 +1379,10 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); - static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@ -1343,7 +1527,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) */ if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || !task->on_rq)) { @@ -1394,6 +1578,7 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; + int ret = 0; if (!rq->rt.overloaded) return 0; @@ -1426,7 +1611,7 @@ retry: if (!lowest_rq) { struct task_struct *task; /* - * find lock_lowest_rq releases rq->lock + * find_lock_lowest_rq releases rq->lock * so it is possible that next_task has migrated. * * We need to make sure that the task is still on the same @@ -1436,12 +1621,11 @@ retry: task = pick_next_pushable_task(rq); if (task_cpu(next_task) == rq->cpu && task == next_task) { /* - * If we get here, the task hasn't moved at all, but - * it has failed to push. We will not try again, - * since the other cpus will pull from us when they - * are ready. + * The task hasn't migrated, and is still the next + * eligible task, but we failed to find a run-queue + * to push it to. Do not retry in this case, since + * other cpus will pull from us when ready. */ - dequeue_pushable_task(rq, next_task); goto out; } @@ -1460,6 +1644,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); + ret = 1; resched_task(lowest_rq->curr); @@ -1468,7 +1653,7 @@ retry: out: put_task_struct(next_task); - return 1; + return ret; } static void push_rt_tasks(struct rq *rq) @@ -1626,9 +1811,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, update_rt_migration(&rq->rt); } - - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = weight; } /* Assumes rq->lock is held */ @@ -1670,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) pull_rt_task(rq); } -static inline void init_sched_rt_class(void) +void init_sched_rt_class(void) { unsigned int i; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); + } } #endif /* CONFIG_SMP */ @@ -1817,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -static const struct sched_class rt_sched_class = { +const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -1852,7 +2035,7 @@ static const struct sched_class rt_sched_class = { #ifdef CONFIG_SCHED_DEBUG extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); -static void print_rt_stats(struct seq_file *m, int cpu) +void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; struct rt_rq *rt_rq; @@ -1863,4 +2046,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ - diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h new file mode 100644 index 000000000000..98c0c2623db8 --- /dev/null +++ b/kernel/sched/sched.h @@ -0,0 +1,1166 @@ + +#include <linux/sched.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/stop_machine.h> + +#include "cpupri.h" + +extern __read_mostly int scheduler_running; + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + +/* + * These are the 'tuning knobs' of the scheduler: + * + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define DEF_TIMESLICE (100 * HZ / 1000) + +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + +static inline int rt_policy(int policy) +{ + if (policy == SCHED_FIFO || policy == SCHED_RR) + return 1; + return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +extern struct mutex sched_domains_mutex; + +#ifdef CONFIG_CGROUP_SCHED + +#include <linux/cgroup.h> + +struct cfs_rq; +struct rt_rq; + +static LIST_HEAD(task_groups); + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchal_quota; + u64 runtime_expires; + + int idle, timer_active; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; +#endif +}; + +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + + atomic_t load_weight; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + + struct cfs_bandwidth cfs_bandwidth; +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif + +/* Default task group. + * Every task in system belong to this group at bootup. + */ +extern struct task_group root_task_group; + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + +extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running, h_nr_running; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last, *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + /* + * the part of load.weight contributed by tasks + */ + unsigned long task_weight; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + + /* + * Maintaining per-cpu shares distribution for group scheduling + * + * load_stamp is the last time we updated the load average + * load_last is the last time we updated the load average and saw load + * load_unacc_exec_time is currently unaccounted execution time + */ + u64 load_avg; + u64 load_period; + u64 load_stamp, load_last, load_unacc_exec_time; + + unsigned long load_contribution; +#endif /* CONFIG_SMP */ +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_timestamp; + int throttled, throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +}; + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; +#endif + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; +#endif +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; +}; + +extern struct root_domain def_root_domain; + +#endif /* CONFIG_SMP */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; +#ifdef CONFIG_NO_HZ + u64 nohz_stamp; + unsigned long nohz_flags; +#endif + int skip_clock_update; + + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED + struct list_head leaf_rt_rq_list; +#endif + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle, *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; + + u64 clock; + u64 clock_task; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + unsigned long cpu_power; + + unsigned char idle_balance; + /* For active balancing */ + int post_schedule; + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + /* cpu of this runqueue: */ + int cpu; + int online; + + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif +}; + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +DECLARE_PER_CPU(struct rq, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) + +#ifdef CONFIG_SMP + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) + +#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The cpu whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given cpu. + * + * Returns the highest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + for_each_domain(cpu, sd) { + if (!(sd->flags & flag)) + break; + hsd = sd; + } + + return hsd; +} + +DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_id); + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "auto_group.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct task_group *tg; + struct cgroup_subsys_state *css; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock)); + tg = container_of(css, struct task_group, css); + + return autogroup_task_group(p, tg); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include <linux/jump_label.h> +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +extern const_debug unsigned int sysctl_sched_features; + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +static __always_inline bool static_branch__true(struct jump_label_key *key) +{ + return likely(static_branch(key)); /* Not out of line branch. */ +} + +static __always_inline bool static_branch__false(struct jump_label_key *key) +{ + return unlikely(static_branch(key)); /* Out of line branch. */ +} + +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +{ \ + return static_branch__##enabled(key); \ +} + +#include "features.h" + +#undef SCHED_FEAT + +extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) +#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + + + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + raw_spin_unlock_irq(&rq->lock); +#else + raw_spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + + +#define sched_class_highest (&stop_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + + +#ifdef CONFIG_SMP + +extern void trigger_load_balance(struct rq *rq, int cpu); +extern void idle_balance(int this_cpu, struct rq *this_rq); + +#else /* CONFIG_SMP */ + +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + +#endif + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); +extern void update_group_power(struct sched_domain *sd, int cpu); +extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); + +extern void resched_task(struct task_struct *p); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +extern void update_cpu_load(struct rq *this_rq); + +#ifdef CONFIG_CGROUP_CPUACCT +#include <linux/cgroup.h> +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca || !ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +#endif + +static inline void inc_nr_running(struct rq *rq) +{ + rq->nr_running++; +} + +static inline void dec_nr_running(struct rq *rq) +{ + rq->nr_running--; +} + +extern void update_rq_clock(struct rq *rq); + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_time_avg; +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +static inline u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +void calc_load_account_idle(struct rq *this_rq); + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} +#else +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } +static inline void sched_avg_update(struct rq *rq) { } +#endif + +extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); + +#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!raw_spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); + ret = 1; + } else + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); + } + return ret; +} + +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + raw_spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + raw_spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + +#endif + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void unthrottle_offline_cfs_rqs(struct rq *rq); + +extern void account_cfs_bandwidth_used(int enabled, int was_enabled); + +#ifdef CONFIG_NO_HZ +enum rq_nohz_flag_bits { + NOHZ_TICK_STOPPED, + NOHZ_BALANCE_KICK, + NOHZ_IDLE, +}; + +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c new file mode 100644 index 000000000000..2a581ba8e190 --- /dev/null +++ b/kernel/sched/stats.c @@ -0,0 +1,111 @@ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "sched.h" + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 15 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; + char *mask_str = kmalloc(mask_len, GFP_KERNEL); + + if (mask_str == NULL) + return -ENOMEM; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcount = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %u %u %u %u %u %u %llu %llu %lu", + cpu, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, + rq->rq_cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { + enum cpu_idle_type itype; + + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); + seq_printf(seq, "domain%d %s", dcount++, mask_str); + for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %u %u %u %u %u %u %u %u", + sd->lb_count[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, + " %u %u %u %u %u %u %u %u %u %u %u %u\n", + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); + } + rcu_read_unlock(); +#endif + } + kfree(mask_str); + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +static const struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_schedstat_init(void) +{ + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; +} +module_init(proc_schedstat_init); diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h index 331e01bcd026..2ef90a51ec5e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched/stats.h @@ -1,108 +1,5 @@ #ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 15 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; - char *mask_str = kmalloc(mask_len, GFP_KERNEL); - - if (mask_str == NULL) - return -ENOMEM; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcount = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", - cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, - rq->ttwu_count, rq->ttwu_local, - rq->rq_cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { - enum cpu_idle_type itype; - - cpumask_scnprintf(mask_str, mask_len, - sched_domain_span(sd)); - seq_printf(seq, "domain%d %s", dcount++, mask_str); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", - sd->lb_count[itype], - sd->lb_balanced[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, - " %u %u %u %u %u %u %u %u %u %u %u %u\n", - sd->alb_count, sd->alb_failed, sd->alb_pushed, - sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - rcu_read_unlock(); -#endif - } - kfree(mask_str); - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -static const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init proc_schedstat_init(void) -{ - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); - return 0; -} -module_init(proc_schedstat_init); /* * Expects runqueue lock to be held for atomicity of update @@ -282,10 +179,9 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); - cputimer->cputime.utime = - cputime_add(cputimer->cputime.utime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); + cputimer->cputime.utime += cputime; + raw_spin_unlock(&cputimer->lock); } /** @@ -306,10 +202,9 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); - cputimer->cputime.stime = - cputime_add(cputimer->cputime.stime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); + cputimer->cputime.stime += cputime; + raw_spin_unlock(&cputimer->lock); } /** @@ -330,7 +225,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.sum_exec_runtime += ns; - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c index 6f437632afab..7b386e86fd23 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched/stop_task.c @@ -1,3 +1,5 @@ +#include "sched.h" + /* * stop-task scheduling class. * @@ -34,11 +36,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { + inc_nr_running(rq); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { + dec_nr_running(rq); } static void yield_task_stop(struct rq *rq) @@ -78,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -static const struct sched_class stop_sched_class = { +const struct sched_class stop_sched_class = { .next = &rt_sched_class, .enqueue_task = enqueue_task_stop, diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 94a62c0d4ade..60636a4e25c3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -27,7 +27,7 @@ #include <linux/compiler.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/semaphore.h> #include <linux/spinlock.h> @@ -54,12 +54,12 @@ void down(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(down); @@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_interruptible(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_killable(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem) unsigned long flags; int count; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) sem->count = count; - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); } @@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_timeout(sem, jiffies); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -179,12 +179,12 @@ void up(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(up); @@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state, if (timeout <= 0) goto timed_out; __set_task_state(task, state); - spin_unlock_irq(&sem->lock); + raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); - spin_lock_irq(&sem->lock); + raw_spin_lock_irq(&sem->lock); if (waiter.up) return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 291c9700be75..c73c4284160e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -11,7 +11,7 @@ */ #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/fs.h> @@ -28,6 +28,7 @@ #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> +#include <linux/user_namespace.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } +/* + * map the uid in struct cred into user namespace *ns + */ +static inline uid_t map_cred_ns(const struct cred *cred, + struct user_namespace *ns) +{ + return user_ns_map_uid(ns, cred, cred->uid); +} + +#ifdef CONFIG_USER_NS +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + if (current_user_ns() == task_cred_xxx(t, user_ns)) + return; + + if (SI_FROMKERNEL(info)) + return; + + info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), + current_cred(), info->si_uid); +} +#else +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + return; +} +#endif + static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, int group, int from_ancestor_ns) { @@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_pid = 0; break; } + + userns_fixup_signal_uid(&q->info, t); + } else if (!is_si_special(info)) { if (sig >= SIGRTMIN && info->si_code != SI_USER) { /* @@ -1344,13 +1376,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } +static int kill_as_cred_perm(const struct cred *cred, + struct task_struct *target) +{ + const struct cred *pcred = __task_cred(target); + if (cred->user_ns != pcred->user_ns) + return 0; + if (cred->euid != pcred->suid && cred->euid != pcred->uid && + cred->uid != pcred->suid && cred->uid != pcred->uid) + return 0; + return 1; +} + /* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, - uid_t uid, uid_t euid, u32 secid) +int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, + const struct cred *cred, u32 secid) { int ret = -EINVAL; struct task_struct *p; - const struct cred *pcred; unsigned long flags; if (!valid_signal(sig)) @@ -1362,10 +1405,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - pcred = __task_cred(p); - if (si_fromuser(info) && - euid != pcred->suid && euid != pcred->uid && - uid != pcred->suid && uid != pcred->uid) { + if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } @@ -1384,7 +1424,7 @@ out_unlock: rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); +EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -1618,13 +1658,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(tsk->parent, user_ns)); rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, - tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, - tsk->signal->stime)); + info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); + info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) @@ -1703,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -1986,8 +2026,6 @@ static bool do_signal_stop(int signr) */ if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; - else - WARN_ON_ONCE(!current->ptrace); sig->group_stop_count = 0; @@ -2121,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; + rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = task_uid(current->parent); + info->si_uid = map_cred_ns(__task_cred(current->parent), + current_user_ns()); + rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ @@ -2314,6 +2355,27 @@ relock: return signr; } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has succesfully been + * delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ + sigset_t blocked; + + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, signr); + set_current_blocked(&blocked); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take @@ -2351,8 +2413,15 @@ void exit_signals(struct task_struct *tsk) int group_stop = 0; sigset_t unblocked; + /* + * @tsk is about to have PF_EXITING set - lock out users which + * expect stable threadgroup. + */ + threadgroup_change_begin(tsk); + if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; + threadgroup_change_end(tsk); return; } @@ -2362,6 +2431,9 @@ void exit_signals(struct task_struct *tsk) * see wants_signal(), do_signal_stop(). */ tsk->flags |= PF_EXITING; + + threadgroup_change_end(tsk); + if (!signal_pending(tsk)) goto out; diff --git a/kernel/smp.c b/kernel/smp.c index fb67dfa8394e..db197d60489b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -6,7 +6,7 @@ #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/gfp.h> diff --git a/kernel/softirq.c b/kernel/softirq.c index fca82c32042b..4eb3a0fa351e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -10,7 +10,7 @@ * Remote softirq infrastructure is by Jens Axboe. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel_stat.h> #include <linux/interrupt.h> #include <linux/init.h> @@ -347,12 +347,12 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); - rcu_irq_exit(); #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_stop_sched_tick(0); + tick_nohz_irq_exit(); #endif + rcu_irq_exit(); preempt_enable_no_resched(); } diff --git a/kernel/spinlock.c b/kernel/spinlock.c index be6517fb9c14..84c7d96918bf 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -19,7 +19,7 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/debug_locks.h> -#include <linux/module.h> +#include <linux/export.h> /* * If lockdep is enabled then we use the non-preemption spin-ops diff --git a/kernel/srcu.c b/kernel/srcu.c index 73ce23feaea9..0febf61e1aa3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -24,7 +24,7 @@ * */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/preempt.h> diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index d20c6983aad9..00fe55cc5a82 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -7,7 +7,7 @@ */ #include <linux/sched.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ba5070ce5765..2f194e965715 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -12,7 +12,7 @@ #include <linux/cpu.h> #include <linux/init.h> #include <linux/kthread.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/stop_machine.h> @@ -41,6 +41,7 @@ struct cpu_stopper { }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); +static bool stop_machine_initialized = false; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { @@ -386,6 +387,8 @@ static int __init cpu_stop_init(void) cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); register_cpu_notifier(&cpu_stop_cpu_notifier); + stop_machine_initialized = true; + return 0; } early_initcall(cpu_stop_init); @@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) .num_threads = num_online_cpus(), .active_cpus = cpus }; + if (!stop_machine_initialized) { + /* + * Handle the case where stop_machine() is called + * early in boot before stop_machine() has been + * initialized. + */ + unsigned long flags; + int ret; + + WARN_ON_ONCE(smdata.num_threads != 1); + + local_irq_save(flags); + hard_irq_disable(); + ret = (*fn)(data); + local_irq_restore(flags); + + return ret; + } + /* Set the initial state and stop all online cpus. */ set_state(&smdata, STOPMACHINE_PREPARE); return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); diff --git a/kernel/sys.c b/kernel/sys.c index 1dbbe695a5ef..40701538fbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -4,7 +4,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> @@ -12,6 +12,7 @@ #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> +#include <linux/kmod.h> #include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> @@ -1286,6 +1287,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } + uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1336,6 +1338,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } + uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } @@ -1602,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); - utime = stime = cputime_zero; + utime = stime = 0; if (who == RUSAGE_THREAD) { task_times(current, &utime, &stime); @@ -1632,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) case RUSAGE_SELF: thread_group_times(p, &tgutime, &tgstime); - utime = cputime_add(utime, tgutime); - stime = cputime_add(stime, tgstime); + utime += tgutime; + stime += tgstime; r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1689,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + unsigned long vm_req_flags; + unsigned long vm_bad_flags; + struct vm_area_struct *vma; + int error = 0; + struct mm_struct *mm = current->mm; + + if (arg4 | arg5) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (addr >= TASK_SIZE) + return -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { + /* It must be existing VMA */ + if (!vma || vma->vm_start > addr) + goto out; + } + + error = -EINVAL; + switch (opt) { + case PR_SET_MM_START_CODE: + case PR_SET_MM_END_CODE: + vm_req_flags = VM_READ | VM_EXEC; + vm_bad_flags = VM_WRITE | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_CODE) + mm->start_code = addr; + else + mm->end_code = addr; + break; + + case PR_SET_MM_START_DATA: + case PR_SET_MM_END_DATA: + vm_req_flags = VM_READ | VM_WRITE; + vm_bad_flags = VM_EXEC | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_DATA) + mm->start_data = addr; + else + mm->end_data = addr; + break; + + case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif + if ((vma->vm_flags & vm_req_flags) != vm_req_flags) + goto out; + + mm->start_stack = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + default: + error = -EINVAL; + goto out; + } + + error = 0; + +out: + up_read(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + return -EINVAL; +} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1759,6 +1880,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, sizeof(me->comm) - 1) < 0) return -EFAULT; set_task_comm(me, comm); + proc_comm_connector(me); return 0; case PR_GET_NAME: get_task_comm(comm, me); @@ -1837,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a9a5de07c4f1..47bfa16430d7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -145,6 +145,10 @@ cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); cond_syscall(sys_syslog); +cond_syscall(sys_process_vm_readv); +cond_syscall(sys_process_vm_writev); +cond_syscall(compat_sys_process_vm_readv); +cond_syscall(compat_sys_process_vm_writev); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d65b531e50..f487f257e05e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -57,6 +57,7 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/kmod.h> +#include <linux/capability.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -134,6 +135,7 @@ static int minolduid; static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; +static const int cap_last_cap = CAP_LAST_CAP; #ifdef CONFIG_INOTIFY_USER #include <linux/inotify.h> @@ -151,14 +153,6 @@ extern int pwrsw_enabled; extern int unaligned_enabled; #endif -#ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU -extern int sysctl_ieee_emulation_warnings; -#endif -extern int sysctl_userprocess_debug; -extern int spin_retry; -#endif - #ifdef CONFIG_IA64 extern int no_unaligned_warning; extern int unaligned_dump_stack; @@ -379,6 +373,16 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -730,6 +734,13 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, + { + .procname = "cap_last_cap", + .data = (void *)&cap_last_cap, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", @@ -792,6 +803,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "bootloader_type", .data = &bootloader_type, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8bffbe2ba4b..a650694883a1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = { { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, + /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, @@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, fput(file); out_putname: - putname(pathname); + __putname(pathname); out: return result; } diff --git a/kernel/time.c b/kernel/time.c index 8e8dc6d705c9..73e416db0a1e 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -27,7 +27,7 @@ * with nanosecond accuracy */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/timex.h> #include <linux/capability.h> #include <linux/clocksource.h> @@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval); /* * Convert jiffies/jiffies_64 to clock_t and back. */ -clock_t jiffies_to_clock_t(long x) +clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f06a8a365648..2cf9cc7aa103 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -25,5 +25,7 @@ config HIGH_RES_TIMERS config GENERIC_CLOCKEVENTS_BUILD bool default y - depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR + depends on GENERIC_CLOCKEVENTS +config GENERIC_CLOCKEVENTS_MIN_ADJUST + bool diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ea5e1a928d5b..8a46f5d64504 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -53,27 +53,6 @@ static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); /** - * has_wakealarm - check rtc device has wakealarm ability - * @dev: current device - * @name_ptr: name to be returned - * - * This helper function checks to see if the rtc device can wake - * from suspend. - */ -static int has_wakealarm(struct device *dev, void *name_ptr) -{ - struct rtc_device *candidate = to_rtc_device(dev); - - if (!candidate->ops->set_alarm) - return 0; - if (!device_may_wakeup(candidate->dev.parent)) - return 0; - - *(const char **)name_ptr = dev_name(dev); - return 1; -} - -/** * alarmtimer_get_rtcdev - Return selected rtcdevice * * This function returns the rtc device to use for wakealarms. @@ -82,37 +61,64 @@ static int has_wakealarm(struct device *dev, void *name_ptr) */ static struct rtc_device *alarmtimer_get_rtcdev(void) { - struct device *dev; - char *str; unsigned long flags; struct rtc_device *ret; spin_lock_irqsave(&rtcdev_lock, flags); - if (!rtcdev) { - /* Find an rtc device and init the rtc_timer */ - dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); - /* If we have a device then str is valid. See has_wakealarm() */ - if (dev) { - rtcdev = rtc_class_open(str); - /* - * Drop the reference we got in class_find_device, - * rtc_open takes its own. - */ - put_device(dev); - rtc_timer_init(&rtctimer, NULL, NULL); - } - } ret = rtcdev; spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } + + +static int alarmtimer_rtc_add_device(struct device *dev, + struct class_interface *class_intf) +{ + unsigned long flags; + struct rtc_device *rtc = to_rtc_device(dev); + + if (rtcdev) + return -EBUSY; + + if (!rtc->ops->set_alarm) + return -1; + if (!device_may_wakeup(rtc->dev.parent)) + return -1; + + spin_lock_irqsave(&rtcdev_lock, flags); + if (!rtcdev) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + } + spin_unlock_irqrestore(&rtcdev_lock, flags); + return 0; +} + +static struct class_interface alarmtimer_rtc_interface = { + .add_dev = &alarmtimer_rtc_add_device, +}; + +static int alarmtimer_rtc_interface_setup(void) +{ + alarmtimer_rtc_interface.class = rtc_class; + return class_interface_register(&alarmtimer_rtc_interface); +} +static void alarmtimer_rtc_interface_remove(void) +{ + class_interface_unregister(&alarmtimer_rtc_interface); +} #else -#define alarmtimer_get_rtcdev() (0) -#define rtcdev (0) +static inline struct rtc_device *alarmtimer_get_rtcdev(void) +{ + return NULL; +} +#define rtcdev (NULL) +static inline int alarmtimer_rtc_interface_setup(void) { return 0; } +static inline void alarmtimer_rtc_interface_remove(void) { } #endif - /** * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue * @base: pointer to the base where the timer is being run @@ -126,6 +132,8 @@ static struct rtc_device *alarmtimer_get_rtcdev(void) static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) { timerqueue_add(&base->timerqueue, &alarm->node); + alarm->state |= ALARMTIMER_STATE_ENQUEUED; + if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { hrtimer_try_to_cancel(&base->timer); hrtimer_start(&base->timer, alarm->node.expires, @@ -147,7 +155,12 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) { struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); + if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) + return; + timerqueue_del(&base->timerqueue, &alarm->node); + alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; + if (next == &alarm->node) { hrtimer_try_to_cancel(&base->timer); next = timerqueue_getnext(&base->timerqueue); @@ -174,6 +187,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) unsigned long flags; ktime_t now; int ret = HRTIMER_NORESTART; + int restart = ALARMTIMER_NORESTART; spin_lock_irqsave(&base->lock, flags); now = base->gettime(); @@ -181,23 +195,25 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) struct alarm *alarm; ktime_t expired = next->expires; - if (expired.tv64 >= now.tv64) + if (expired.tv64 > now.tv64) break; alarm = container_of(next, struct alarm, node); timerqueue_del(&base->timerqueue, &alarm->node); - alarm->enabled = 0; - /* Re-add periodic timers */ - if (alarm->period.tv64) { - alarm->node.expires = ktime_add(expired, alarm->period); - timerqueue_add(&base->timerqueue, &alarm->node); - alarm->enabled = 1; - } + alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; + + alarm->state |= ALARMTIMER_STATE_CALLBACK; spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) - alarm->function(alarm); + restart = alarm->function(alarm, now); spin_lock_irqsave(&base->lock, flags); + alarm->state &= ~ALARMTIMER_STATE_CALLBACK; + + if (restart != ALARMTIMER_NORESTART) { + timerqueue_add(&base->timerqueue, &alarm->node); + alarm->state |= ALARMTIMER_STATE_ENQUEUED; + } } if (next) { @@ -234,7 +250,7 @@ static int alarmtimer_suspend(struct device *dev) freezer_delta = ktime_set(0, 0); spin_unlock_irqrestore(&freezer_delta_lock, flags); - rtc = rtcdev; + rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ if (!rtc) return 0; @@ -299,53 +315,111 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) * @function: callback that is run when the alarm fires */ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - void (*function)(struct alarm *)) + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); - alarm->period = ktime_set(0, 0); alarm->function = function; alarm->type = type; - alarm->enabled = 0; + alarm->state = ALARMTIMER_STATE_INACTIVE; } /** * alarm_start - Sets an alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm - * @period: period at which the alarm will recur */ -void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) +void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; spin_lock_irqsave(&base->lock, flags); - if (alarm->enabled) + if (alarmtimer_active(alarm)) alarmtimer_remove(base, alarm); alarm->node.expires = start; - alarm->period = period; alarmtimer_enqueue(base, alarm); - alarm->enabled = 1; spin_unlock_irqrestore(&base->lock, flags); } /** - * alarm_cancel - Tries to cancel an alarm timer + * alarm_try_to_cancel - Tries to cancel an alarm timer * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not running, + * and -1 if the callback was running */ -void alarm_cancel(struct alarm *alarm) +int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - + int ret = -1; spin_lock_irqsave(&base->lock, flags); - if (alarm->enabled) + + if (alarmtimer_callback_running(alarm)) + goto out; + + if (alarmtimer_is_queued(alarm)) { alarmtimer_remove(base, alarm); - alarm->enabled = 0; + ret = 1; + } else + ret = 0; +out: spin_unlock_irqrestore(&base->lock, flags); + return ret; +} + + +/** + * alarm_cancel - Spins trying to cancel an alarm timer until it is done + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not active. + */ +int alarm_cancel(struct alarm *alarm) +{ + for (;;) { + int ret = alarm_try_to_cancel(alarm); + if (ret >= 0) + return ret; + cpu_relax(); + } +} + + +u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) +{ + u64 overrun = 1; + ktime_t delta; + + delta = ktime_sub(now, alarm->node.expires); + + if (delta.tv64 < 0) + return 0; + + if (unlikely(delta.tv64 >= interval.tv64)) { + s64 incr = ktime_to_ns(interval); + + overrun = ktime_divns(delta, incr); + + alarm->node.expires = ktime_add_ns(alarm->node.expires, + incr*overrun); + + if (alarm->node.expires.tv64 > now.tv64) + return overrun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + overrun++; + } + + alarm->node.expires = ktime_add(alarm->node.expires, interval); + return overrun; } + + /** * clock2alarm - helper that converts from clockid to alarmtypes * @clockid: clockid. @@ -365,12 +439,21 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * * Posix timer callback for expired alarm timers. */ -static void alarm_handle_timer(struct alarm *alarm) +static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, + ktime_t now) { struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarmtimer); + it.alarm.alarmtimer); if (posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; + + /* Re-add periodic timers */ + if (ptr->it.alarm.interval.tv64) { + ptr->it_overrun += alarm_forward(alarm, now, + ptr->it.alarm.interval); + return ALARMTIMER_RESTART; + } + return ALARMTIMER_NORESTART; } /** @@ -427,7 +510,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) type = clock2alarm(new_timer->it_clock); base = &alarm_bases[type]; - alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); + alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } @@ -444,9 +527,9 @@ static void alarm_timer_get(struct k_itimer *timr, memset(cur_setting, 0, sizeof(struct itimerspec)); cur_setting->it_interval = - ktime_to_timespec(timr->it.alarmtimer.period); + ktime_to_timespec(timr->it.alarm.interval); cur_setting->it_value = - ktime_to_timespec(timr->it.alarmtimer.node.expires); + ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); return; } @@ -461,7 +544,9 @@ static int alarm_timer_del(struct k_itimer *timr) if (!rtcdev) return -ENOTSUPP; - alarm_cancel(&timr->it.alarmtimer); + if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) + return TIMER_RETRY; + return 0; } @@ -481,25 +566,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, if (!rtcdev) return -ENOTSUPP; - /* - * XXX HACK! Currently we can DOS a system if the interval - * period on alarmtimers is too small. Cap the interval here - * to 100us and solve this properly in a future patch! -jstultz - */ - if ((new_setting->it_interval.tv_sec == 0) && - (new_setting->it_interval.tv_nsec < 100000)) - new_setting->it_interval.tv_nsec = 100000; - if (old_setting) alarm_timer_get(timr, old_setting); /* If the timer was already set, cancel it */ - alarm_cancel(&timr->it.alarmtimer); + if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) + return TIMER_RETRY; /* start the timer */ - alarm_start(&timr->it.alarmtimer, - timespec_to_ktime(new_setting->it_value), - timespec_to_ktime(new_setting->it_interval)); + timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); + alarm_start(&timr->it.alarm.alarmtimer, + timespec_to_ktime(new_setting->it_value)); return 0; } @@ -509,13 +586,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, * * Wakes up the task that set the alarmtimer */ -static void alarmtimer_nsleep_wakeup(struct alarm *alarm) +static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, + ktime_t now) { struct task_struct *task = (struct task_struct *)alarm->data; alarm->data = NULL; if (task) wake_up_process(task); + return ALARMTIMER_NORESTART; } /** @@ -530,7 +609,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); - alarm_start(alarm, absexp, ktime_set(0, 0)); + alarm_start(alarm, absexp); if (likely(alarm->data)) schedule(); @@ -691,6 +770,7 @@ static struct platform_driver alarmtimer_driver = { */ static int __init alarmtimer_init(void) { + struct platform_device *pdev; int error = 0; int i; struct k_clock alarm_clock = { @@ -719,10 +799,26 @@ static int __init alarmtimer_init(void) HRTIMER_MODE_ABS); alarm_bases[i].timer.function = alarmtimer_fired; } + + error = alarmtimer_rtc_interface_setup(); + if (error) + return error; + error = platform_driver_register(&alarmtimer_driver); - platform_device_register_simple("alarmtimer", -1, NULL, 0); + if (error) + goto out_if; + pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); + if (IS_ERR(pdev)) { + error = PTR_ERR(pdev); + goto out_drv; + } + return 0; + +out_drv: + platform_driver_unregister(&alarmtimer_driver); +out_if: + alarmtimer_rtc_interface_remove(); return error; } device_initcall(alarmtimer_init); - diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index e4c699dfa4e8..9cd928f7a7c6 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -17,7 +17,6 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/smp.h> -#include <linux/sysdev.h> #include "tick-internal.h" @@ -94,42 +93,143 @@ void clockevents_shutdown(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +/** + * clockevents_increase_min_delta - raise minimum delta of a clock event device + * @dev: device to increase the minimum delta + * + * Returns 0 on success, -ETIME when the minimum delta reached the limit. + */ +static int clockevents_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { + printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + int i; + + for (i = 0;;) { + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + + if (++i > 2) { + /* + * We tried 3 times to program the device with the + * given min_delta_ns. Try to increase the minimum + * delta, if that fails as well get out of here. + */ + if (clockevents_increase_min_delta(dev)) + return -ETIME; + i = 0; + } + } +} + +#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + return dev->set_next_event((unsigned long) clc, dev); +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + /** * clockevents_program_event - Reprogram the clock event device. + * @dev: device to program * @expires: absolute expiry time (monotonic clock) + * @force: program minimum delay if expires can not be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - ktime_t now) + bool force) { unsigned long long clc; int64_t delta; + int rc; if (unlikely(expires.tv64 < 0)) { WARN_ON_ONCE(1); return -ETIME; } - delta = ktime_to_ns(ktime_sub(expires, now)); - - if (delta <= 0) - return -ETIME; - dev->next_event = expires; if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) return 0; - if (delta > dev->max_delta_ns) - delta = dev->max_delta_ns; - if (delta < dev->min_delta_ns) - delta = dev->min_delta_ns; + /* Shortcut for clockevent devices that can deal with ktime. */ + if (dev->features & CLOCK_EVT_FEAT_KTIME) + return dev->set_next_ktime(expires, dev); + + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); + if (delta <= 0) + return force ? clockevents_program_min_delta(dev) : -ETIME; - clc = delta * dev->mult; - clc >>= dev->shift; + delta = min(delta, (int64_t) dev->max_delta_ns); + delta = max(delta, (int64_t) dev->min_delta_ns); - return dev->set_next_event((unsigned long) clc, dev); + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) clc, dev); + + return (rc && force) ? clockevents_program_min_delta(dev) : rc; } /** @@ -258,7 +358,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) if (dev->mode != CLOCK_EVT_MODE_ONESHOT) return 0; - return clockevents_program_event(dev, dev->next_event, ktime_get()); + return clockevents_program_event(dev, dev->next_event, false); } /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0980f0d9a0a..a45ca167ab24 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,8 +23,8 @@ * o Allow clocksource drivers to be unregistered */ +#include <linux/device.h> #include <linux/clocksource.h> -#include <linux/sysdev.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ @@ -186,6 +186,7 @@ static struct timer_list watchdog_timer; static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; +static atomic_t watchdog_reset_pending; static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); @@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigned long data) struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int next_cpu; + int next_cpu, reset_pending; spin_lock(&watchdog_lock); if (!watchdog_running) goto out; + reset_pending = atomic_read(&watchdog_reset_pending); + list_for_each_entry(cs, &watchdog_list, wd_list) { /* Clocksource already marked unstable? */ @@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigned long data) local_irq_enable(); /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || + atomic_read(&watchdog_reset_pending)) { cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = wdnow; cs->cs_last = csnow; @@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigned long data) cs->cs_last = csnow; cs->wd_last = wdnow; + if (atomic_read(&watchdog_reset_pending)) + continue; + /* Check the deviation from the watchdog clocksource. */ - if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; } @@ -303,6 +310,13 @@ static void clocksource_watchdog(unsigned long data) } /* + * We only clear the watchdog_reset_pending, when we did a + * full cycle through all clocksources. + */ + if (reset_pending) + atomic_dec(&watchdog_reset_pending); + + /* * Cycle through CPUs to check if the CPUs stay synchronized * to each other. */ @@ -344,23 +358,7 @@ static inline void clocksource_reset_watchdog(void) static void clocksource_resume_watchdog(void) { - unsigned long flags; - - /* - * We use trylock here to avoid a potential dead lock when - * kgdb calls this code after the kernel has been stopped with - * watchdog_lock held. When watchdog_lock is held we just - * return and accept, that the watchdog might trigger and mark - * the monitored clock source (usually TSC) unstable. - * - * This does not affect the other caller clocksource_resume() - * because at this point the kernel is UP, interrupts are - * disabled and nothing can hold watchdog_lock. - */ - if (!spin_trylock_irqsave(&watchdog_lock, flags)) - return; - clocksource_reset_watchdog(); - spin_unlock_irqrestore(&watchdog_lock, flags); + atomic_inc(&watchdog_reset_pending); } static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -494,6 +492,22 @@ void clocksource_touch_watchdog(void) } /** + * clocksource_max_adjustment- Returns max adjustment amount + * @cs: Pointer to clocksource + * + */ +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ + u64 ret; + /* + * We won't try to correct for more then 11% adjustments (110,000 ppm), + */ + ret = (u64)cs->mult * 11; + do_div(ret,100); + return (u32)ret; +} + +/** * clocksource_max_deferment - Returns max time the clocksource can be deferred * @cs: Pointer to clocksource * @@ -505,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) /* * Calculate the maximum number of cycles that we can pass to the * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/cs->mult which - * is equivalent to the below. - * max_cycles < (2^63)/cs->mult - * max_cycles < 2^(log2((2^63)/cs->mult)) - * max_cycles < 2^(log2(2^63) - log2(cs->mult)) - * max_cycles < 2^(63 - log2(cs->mult)) - * max_cycles < 1 << (63 - log2(cs->mult)) + * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) + * which is equivalent to the below. + * max_cycles < (2^63)/(cs->mult + cs->maxadj) + * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) + * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) + * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) + * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) * Please note that we add 1 to the result of the log2 to account for * any rounding errors, ensure the above inequality is satisfied and * no overflow will occur. */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); /* * The actual maximum number of cycles we can defer the clocksource is * determined by the minimum of max_cycles and cs->mask. + * Note: Here we subtract the maxadj to make sure we don't sleep for + * too long if there's a large negative adjustment. */ max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, + cs->shift); /* * To ensure that the clocksource does not wrap whilst we are idle, @@ -531,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) * note a margin of 12.5% is used because this can be computed with * a shift, versus say 10% which would require division. */ - return max_nsecs - (max_nsecs >> 5); + return max_nsecs - (max_nsecs >> 3); } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -630,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs) /** * __clocksource_updatefreq_scale - Used update clocksource with new freq - * @t: clocksource to be registered + * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * @@ -642,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs) void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; - /* * Calc the maximum number of seconds which we can run before * wrapping around. For clocksources which have a mask > 32bit @@ -653,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) * ~ 0.06ppm granularity for NTP. We apply the same 12.5% * margin as we do in clocksource_max_deferment() */ - sec = (cs->mask - (cs->mask >> 5)); + sec = (cs->mask - (cs->mask >> 3)); do_div(sec, freq); do_div(sec, scale); if (!sec) @@ -663,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); + + /* + * for clocksources that have large mults, to avoid overflow. + * Since mult may be adjusted by ntp, add an safety extra margin + * + */ + cs->maxadj = clocksource_max_adjustment(cs); + while ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult)) { + cs->mult >>= 1; + cs->shift--; + cs->maxadj = clocksource_max_adjustment(cs); + } + cs->max_idle_ns = clocksource_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); /** * __clocksource_register_scale - Used to install new clocksources - * @t: clocksource to be registered + * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * @@ -697,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); /** * clocksource_register - Used to install new clocksources - * @t: clocksource to be registered + * @cs: clocksource to be registered * * Returns -EBUSY if registration fails, zero otherwise. */ int clocksource_register(struct clocksource *cs) { + /* calculate max adjustment for given mult/shift */ + cs->maxadj = clocksource_max_adjustment(cs); + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + /* calculate max idle time permitted for this clocksource */ cs->max_idle_ns = clocksource_max_deferment(cs); @@ -725,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) /** * clocksource_change_rating - Change the rating of a registered clocksource + * @cs: clocksource to be changed + * @rating: new rating */ void clocksource_change_rating(struct clocksource *cs, int rating) { @@ -736,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating); /** * clocksource_unregister - remove a registered clocksource + * @cs: clocksource to be unregistered */ void clocksource_unregister(struct clocksource *cs) { @@ -751,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister); /** * sysfs_show_current_clocksources - sysfs interface for current clocksource * @dev: unused + * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing current clocksource. */ static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +sysfs_show_current_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) { ssize_t count = 0; @@ -771,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev, /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused + * @attr: unused * @buf: name of override clocksource * @count: length of buffer * * Takes input from sysfs interface for manually overriding the default * clocksource selection. */ -static ssize_t sysfs_override_clocksource(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sysfs_override_clocksource(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { size_t ret = count; @@ -806,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, /** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused + * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing registered clocksources */ static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, +sysfs_show_available_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) { struct clocksource *src; @@ -841,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev, /* * Sysfs setup bits: */ -static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, +static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); -static SYSDEV_ATTR(available_clocksource, 0444, +static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); -static struct sysdev_class clocksource_sysclass = { +static struct bus_type clocksource_subsys = { .name = "clocksource", + .dev_name = "clocksource", }; -static struct sys_device device_clocksource = { +static struct device device_clocksource = { .id = 0, - .cls = &clocksource_sysclass, + .bus = &clocksource_subsys, }; static int __init init_clocksource_sysfs(void) { - int error = sysdev_class_register(&clocksource_sysclass); + int error = subsys_system_register(&clocksource_subsys, NULL); if (!error) - error = sysdev_register(&device_clocksource); + error = device_register(&device_clocksource); if (!error) - error = sysdev_create_file( + error = device_create_file( &device_clocksource, - &attr_current_clocksource); + &dev_attr_current_clocksource); if (!error) - error = sysdev_create_file( + error = device_create_file( &device_clocksource, - &attr_available_clocksource); + &dev_attr_available_clocksource); return error; } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index c340ca658f37..ce033c7aa2e8 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -18,6 +18,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/device.h> +#include <linux/export.h> #include <linux/file.h> #include <linux/posix-clock.h> #include <linux/slab.h> diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c7218d132738..fd4a7b1625a2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; - clockevents_exchange_device(NULL, dev); + clockevents_exchange_device(tick_broadcast_device.evtdev, dev); tick_broadcast_device.evtdev = dev; if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(dev); @@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) for (next = dev->next_event; ;) { next = ktime_add(next, tick_period); - if (!clockevents_program_event(dev, next, ktime_get())) + if (!clockevents_program_event(dev, next, false)) return; tick_do_periodic_broadcast(); } @@ -373,7 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; - return tick_dev_program_event(bc, expires, force); + return clockevents_program_event(bc, expires, force); } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 119528de8235..da6c9ecad4e4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev) */ next = ktime_add(dev->next_event, tick_period); for (;;) { - if (!clockevents_program_event(dev, next, ktime_get())) + if (!clockevents_program_event(dev, next, false)) return; /* * Have to be careful here. If we're in oneshot mode, @@ -137,7 +137,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); for (;;) { - if (!clockevents_program_event(dev, next, ktime_get())) + if (!clockevents_program_event(dev, next, false)) return; next = ktime_add(next, tick_period); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 1009b06d6f89..4e265b901fed 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -26,8 +26,6 @@ extern void clockevents_shutdown(struct clock_event_device *dev); extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t nextevt); -extern int tick_dev_program_event(struct clock_event_device *dev, - ktime_t expires, int force); extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2d04411a5f05..824109060a33 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -21,74 +21,6 @@ #include "tick-internal.h" -/* Limit min_delta to a jiffie */ -#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) - -static int tick_increase_min_delta(struct clock_event_device *dev) -{ - /* Nothing to do if we already reached the limit */ - if (dev->min_delta_ns >= MIN_DELTA_LIMIT) - return -ETIME; - - if (dev->min_delta_ns < 5000) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - if (dev->min_delta_ns > MIN_DELTA_LIMIT) - dev->min_delta_ns = MIN_DELTA_LIMIT; - - printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns); - return 0; -} - -/** - * tick_program_event internal worker function - */ -int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, - int force) -{ - ktime_t now = ktime_get(); - int i; - - for (i = 0;;) { - int ret = clockevents_program_event(dev, expires, now); - - if (!ret || !force) - return ret; - - dev->retries++; - /* - * We tried 3 times to program the device with the given - * min_delta_ns. If that's not working then we increase it - * and emit a warning. - */ - if (++i > 2) { - /* Increase the min. delta and try again */ - if (tick_increase_min_delta(dev)) { - /* - * Get out of the loop if min_delta_ns - * hit the limit already. That's - * better than staying here forever. - * - * We clear next_event so we have a - * chance that the box survives. - */ - printk(KERN_WARNING - "CE: Reprogramming failure. Giving up\n"); - dev->next_event.tv64 = KTIME_MAX; - return -ETIME; - } - i = 0; - } - - now = ktime_get(); - expires = ktime_add_ns(now, dev->min_delta_ns); - } -} - /** * tick_program_event */ @@ -96,7 +28,7 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - return tick_dev_program_event(dev, expires, force); + return clockevents_program_event(dev, expires, force); } /** @@ -104,11 +36,10 @@ int tick_program_event(ktime_t expires, int force) */ void tick_resume_oneshot(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - struct clock_event_device *dev = td->evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - tick_program_event(ktime_get(), 1); + clockevents_program_event(dev, ktime_get(), true); } /** @@ -120,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - tick_dev_program_event(newdev, next_event, 1); + clockevents_program_event(newdev, next_event, true); } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d5097c44b407..7656642e4b8e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - cpumask_clear_cpu(cpu, nohz_cpu_mask); ts->idle_waketime = now; local_irq_save(flags); @@ -159,9 +158,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; } @@ -197,11 +197,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) /** * get_cpu_idle_time_us - get the total idle time of a cpu * @cpu: CPU number to query - * @last_update_time: variable to store update time in + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. * * Return the cummulative idle time (since boot) for a given - * CPU, in microseconds. The idle time returned includes - * the iowait time (unlike what "top" and co report). + * CPU, in microseconds. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. @@ -211,20 +211,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, idle; if (!tick_nohz_enabled) return -1; - update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + idle = ts->idle_sleeptime; + } else { + if (ts->idle_active && !nr_iowait_cpu(cpu)) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(ts->idle_sleeptime, delta); + } else { + idle = ts->idle_sleeptime; + } + } + + return ktime_to_us(idle); - return ktime_to_us(ts->idle_sleeptime); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); -/* +/** * get_cpu_iowait_time_us - get the total iowait time of a cpu * @cpu: CPU number to query - * @last_update_time: variable to store update time in + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. * * Return the cummulative iowait time (since boot) for a given * CPU, in microseconds. @@ -237,52 +252,40 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, iowait; if (!tick_nohz_enabled) return -1; - update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + iowait = ts->iowait_sleeptime; + } else { + if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); - return ktime_to_us(ts->iowait_sleeptime); + iowait = ktime_add(ts->iowait_sleeptime, delta); + } else { + iowait = ts->iowait_sleeptime; + } + } + + return ktime_to_us(iowait); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -/** - * tick_nohz_stop_sched_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - * Called either from the idle loop or from irq_exit() when an idle period was - * just interrupted by an interrupt which did not cause a reschedule. - */ -void tick_nohz_stop_sched_tick(int inidle) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - struct tick_sched *ts; + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; - local_irq_save(flags); - cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); - /* - * Call to tick_nohz_start_idle stops the last_update_time from being - * updated. Thus, it must not be called in the event we are called from - * irq_exit() with the prior state different than idle. - */ - if (!inidle && !ts->inidle) - goto end; - - /* - * Set ts->inidle unconditionally. Even if the system did not - * switch to NOHZ mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ - ts->inidle = 1; - now = tick_nohz_start_idle(cpu, ts); /* @@ -298,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle) } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - goto end; + return; if (need_resched()) - goto end; + return; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; @@ -311,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle) (unsigned int) local_softirq_pending()); ratelimit++; } - goto end; + return; } ts->idle_calls++; @@ -389,9 +392,6 @@ void tick_nohz_stop_sched_tick(int inidle) else expires.tv64 = KTIME_MAX; - if (delta_jiffies > 1) - cpumask_set_cpu(cpu, nohz_cpu_mask); - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; @@ -409,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; - rcu_enter_nohz(); } ts->idle_sleeps++; @@ -441,15 +440,70 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); -end: - local_irq_restore(flags); +} + +/** + * tick_nohz_idle_enter - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called when we start the idle loop. + * + * The arch is responsible of calling: + * + * - rcu_idle_enter() after its last use of RCU before the CPU is put + * to sleep. + * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + */ +void tick_nohz_idle_enter(void) +{ + struct tick_sched *ts; + + WARN_ON_ONCE(irqs_disabled()); + + /* + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ + set_cpu_sd_state_idle(); + + local_irq_disable(); + + ts = &__get_cpu_var(tick_cpu_sched); + /* + * set ts->inidle unconditionally. even if the system did not + * switch to nohz mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + tick_nohz_stop_sched_tick(ts); + + local_irq_enable(); +} + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!ts->inidle) + return; + + tick_nohz_stop_sched_tick(ts); } /** @@ -491,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } /** - * tick_nohz_restart_sched_tick - restart the idle tick from the idle task + * tick_nohz_idle_exit - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. */ -void tick_nohz_restart_sched_tick(void) +void tick_nohz_idle_exit(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -505,6 +561,7 @@ void tick_nohz_restart_sched_tick(void) ktime_t now; local_irq_disable(); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get(); @@ -519,12 +576,9 @@ void tick_nohz_restart_sched_tick(void) ts->inidle = 0; - rcu_exit_nohz(); - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); - cpumask_clear_cpu(cpu, nohz_cpu_mask); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* @@ -640,8 +694,6 @@ static void tick_nohz_switch_to_nohz(void) next = ktime_add(next, tick_period); } local_irq_enable(); - - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); } /* @@ -793,10 +845,8 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) { + if (tick_nohz_enabled) ts->nohz_mode = NOHZ_MODE_HIGHRES; - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); - } #endif } #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e8507..0c6358186401 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void) /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds using ntp adjusted mult. */ + /* return delta convert to nanoseconds. */ return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } @@ -249,6 +249,8 @@ ktime_t ktime_get(void) secs = xtime.tv_sec + wall_to_monotonic.tv_sec; nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); } while (read_seqretry(&xtime_lock, seq)); /* @@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts) *ts = xtime; tomono = wall_to_monotonic; nsecs = timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); } while (read_seqretry(&xtime_lock, seq)); @@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset) s64 error, interval = timekeeper.cycle_interval; int adj; + /* + * The point of this is to check if the error is greater then half + * an interval. + * + * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. + * + * Note we subtract one in the shift, so that error is really error*2. + * This "saves" dividing(shifting) interval twice, but keeps the + * (error > interval) comparison as still measuring if error is + * larger then half an interval. + * + * Note: It does not "save" on aggravation when reading the code. + */ error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { + /* + * We now divide error by 4(via shift), which checks if + * the error is greater then twice the interval. + * If it is greater, we need a bigadjust, if its smaller, + * we can adjust by 1. + */ error >>= 2; + /* + * XXX - In update_wall_time, we round up to the next + * nanosecond, and store the amount rounded up into + * the error. This causes the likely below to be unlikely. + * + * The proper fix is to avoid rounding up by using + * the high precision timekeeper.xtime_nsec instead of + * xtime.tv_nsec everywhere. Fixing this will take some + * time. + */ if (likely(error <= interval)) adj = 1; else adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { + /* See comment above, this is just switched for the negative */ error >>= 2; if (likely(error >= -interval)) { adj = -1; @@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset) offset = -offset; } else adj = timekeeping_bigadjust(error, &interval, &offset); - } else + } else /* No adjustment needed */ return; + WARN_ONCE(timekeeper.clock->maxadj && + (timekeeper.mult + adj > timekeeper.clock->mult + + timekeeper.clock->maxadj), + "Adjusting %s more then 11%% (%ld vs %ld)\n", + timekeeper.clock->name, (long)timekeeper.mult + adj, + (long)timekeeper.clock->mult + + timekeeper.clock->maxadj); + /* + * So the following can be confusing. + * + * To keep things simple, lets assume adj == 1 for now. + * + * When adj != 1, remember that the interval and offset values + * have been appropriately scaled so the math is the same. + * + * The basic idea here is that we're increasing the multiplier + * by one, this causes the xtime_interval to be incremented by + * one cycle_interval. This is because: + * xtime_interval = cycle_interval * mult + * So if mult is being incremented by one: + * xtime_interval = cycle_interval * (mult + 1) + * Its the same as: + * xtime_interval = (cycle_interval * mult) + cycle_interval + * Which can be shortened to: + * xtime_interval += cycle_interval + * + * So offset stores the non-accumulated cycles. Thus the current + * time (in shifted nanoseconds) is: + * now = (offset * adj) + xtime_nsec + * Now, even though we're adjusting the clock frequency, we have + * to keep time consistent. In other words, we can't jump back + * in time, and we also want to avoid jumping forward in time. + * + * So given the same offset value, we need the time to be the same + * both before and after the freq adjustment. + * now = (offset * adj_1) + xtime_nsec_1 + * now = (offset * adj_2) + xtime_nsec_2 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_2) + xtime_nsec_2 + * And we know: + * adj_2 = adj_1 + 1 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * (adj_1+1)) + xtime_nsec_2 + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_1) + offset + xtime_nsec_2 + * Canceling the sides: + * xtime_nsec_1 = offset + xtime_nsec_2 + * Which gives us: + * xtime_nsec_2 = xtime_nsec_1 - offset + * Which simplfies to: + * xtime_nsec -= offset + * + * XXX - TODO: Doc ntp_error calculation. + */ timekeeper.mult += adj; timekeeper.xtime_interval += interval; timekeeper.xtime_nsec -= offset; diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index a5d0a3a85dd8..0b537f27b559 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -81,7 +81,7 @@ struct entry { /* * Spinlock protecting the tables - not taken during lookup: */ -static DEFINE_SPINLOCK(table_lock); +static DEFINE_RAW_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: @@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) prev = NULL; curr = *head; - spin_lock(&table_lock); + raw_spin_lock(&table_lock); /* * Make sure we have not raced with another CPU: */ @@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) *head = curr; } out_unlock: - spin_unlock(&table_lock); + raw_spin_unlock(&table_lock); return curr; } diff --git a/kernel/timer.c b/kernel/timer.c index 8cff36119e4d..a297ffcf888e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -20,7 +20,7 @@ */ #include <linux/kernel_stat.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> @@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) } } +/* Stub timer callback for improperly used timers. */ +static void stub_timer(unsigned long data) +{ + WARN_ON(1); +} + /* * fixup_activate is called when: * - an active object is activated @@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) debug_object_activate(timer, &timer_debug_descr); return 0; } else { - WARN_ON_ONCE(1); + setup_timer(timer, stub_timer, 0); + return 1; } return 0; @@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) } } +/* + * fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + if (timer->entry.prev == TIMER_ENTRY_STATIC) { + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + debug_object_init(timer, &timer_debug_descr); + return 0; + } else { + setup_timer(timer, stub_timer, 0); + return 1; + } + default: + return 0; + } +} + static struct debug_obj_descr timer_debug_descr = { - .name = "timer_list", - .debug_hint = timer_debug_hint, - .fixup_init = timer_fixup_init, - .fixup_activate = timer_fixup_activate, - .fixup_free = timer_fixup_free, + .name = "timer_list", + .debug_hint = timer_debug_hint, + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, + .fixup_assert_init = timer_fixup_assert_init, }; static inline void debug_timer_init(struct timer_list *timer) @@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer) debug_object_free(timer, &timer_debug_descr); } +static inline void debug_timer_assert_init(struct timer_list *timer) +{ + debug_object_assert_init(timer, &timer_debug_descr); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key); @@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack); static inline void debug_timer_init(struct timer_list *timer) { } static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } +static inline void debug_timer_assert_init(struct timer_list *timer) { } #endif static inline void debug_init(struct timer_list *timer) @@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer) trace_timer_cancel(timer); } +static inline void debug_assert_init(struct timer_list *timer) +{ + debug_timer_assert_init(timer); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key) @@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + debug_assert_init(timer); + timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); @@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer) unsigned long flags; int ret = -1; + debug_assert_init(timer); + base = lock_timer_base(timer, &flags); if (base->running_timer == timer) @@ -1368,7 +1418,7 @@ SYSCALL_DEFINE0(getppid) int pid; rcu_read_lock(); - pid = task_tgid_vnr(current->real_parent); + pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c5..5f39a07fe5ea 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_events_filter.o := -I$(src) + # # Make the trace clocks available generally: it's infrastructure # relied on by ptrace for example: @@ -53,6 +55,9 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o +ifeq ($(CONFIG_PM_RUNTIME),y) +obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o +endif ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7c910a5593a6..cdea7b56b0c9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,6 +23,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/debugfs.h> +#include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> @@ -401,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry) static struct dentry *blk_create_buf_file_callback(const char *filename, struct dentry *parent, - int mode, + umode_t mode, struct rchan_buf *buf, int *is_global) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c3e4575e7829..b1e8943fed1d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,6 +22,7 @@ #include <linux/hardirq.h> #include <linux/kthread.h> #include <linux/uaccess.h> +#include <linux/module.h> #include <linux/ftrace.h> #include <linux/sysctl.h> #include <linux/slab.h> @@ -151,7 +152,6 @@ void clear_ftrace_function(void) ftrace_pid_function = ftrace_stub; } -#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their @@ -1211,7 +1211,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, if (!src->count) { free_ftrace_hash_rcu(*dst); rcu_assign_pointer(*dst, EMPTY_HASH); - return 0; + /* still need to update the function records */ + ret = 0; + goto out; } /* @@ -3863,6 +3865,14 @@ void ftrace_kill(void) } /** + * Test if ftrace is dead or not. + */ +int ftrace_is_dead(void) +{ + return ftrace_disabled; +} + +/** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. * diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..f5b7b5c1195b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -478,7 +478,7 @@ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -488,12 +488,14 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + local_t entries_bytes; local_t commit_overrun; local_t overrun; local_t entries; local_t committing; local_t commits; unsigned long read; + unsigned long read_bytes; u64 write_stamp; u64 read_stamp; }; @@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->reader_lock); + raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); /* * The entries will be zeroed out when we move the @@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!tail) tail_page->page->time_stamp = ts; + /* account for these added bytes */ + local_add(length, &cpu_buffer->entries_bytes); + return event; } @@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); /* * This is on the tail page. It is possible that * a write could come in and move the tail page @@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, old_index += write_mask; new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); return 1; + } } /* could not discard */ @@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) } /** + * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +{ + unsigned long flags; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* + * if the tail is on reader_page, oldest time stamp is on the reader + * page + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + bpage = cpu_buffer->reader_page; + else + bpage = rb_set_head_page(cpu_buffer); + ret = bpage->page->time_stamp; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); + +/** + * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); + +/** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to get the entries from. @@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, again: local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) unsigned long flags; again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; @@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { @@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, } if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); out: @@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); again: event = rb_iter_peek(iter, ts); if (!event) @@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) rb_advance_iter(iter); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; @@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); } @@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (!ret) @@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); return ret; @@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (!bpage) goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); + cpu_buffer->read_bytes += BUF_PAGE_SIZE; /* swap the pages */ rb_init_page(bpage); @@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c new file mode 100644 index 000000000000..4b3b5eaf94d1 --- /dev/null +++ b/kernel/trace/rpm-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/usb.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rpm.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..a3f1bc5d2a00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -338,10 +338,11 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | + TRACE_ITER_IRQ_INFO; static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +static DEFINE_RAW_SPINLOCK(tracing_start_lock); static void wakeup_work_handler(struct work_struct *work) { @@ -426,6 +427,7 @@ static const char *trace_options[] = { "record-cmd", "overwrite", "disable_on_free", + "irq-info", NULL }; @@ -435,6 +437,7 @@ static struct { } trace_clocks[] = { { trace_clock_local, "local" }, { trace_clock_global, "global" }, + { trace_clock_counter, "counter" }, }; int trace_clock_id; @@ -960,7 +963,7 @@ void tracing_start(void) if (tracing_disabled) return; - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (--trace_stop_count) { if (trace_stop_count < 0) { /* Someone screwed up their debugging */ @@ -985,7 +988,7 @@ void tracing_start(void) ftrace_start(); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } /** @@ -1000,7 +1003,7 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (trace_stop_count++) goto out; @@ -1018,7 +1021,7 @@ void tracing_stop(void) arch_spin_unlock(&ftrace_max_lock); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } void trace_stop_cmdline_recording(void); @@ -1842,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p) trace_event_read_unlock(); } +static void +get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +{ + unsigned long count; + int cpu; + + *total = 0; + *entries = 0; + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + *total += count; + } else + *total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + *entries += count; + } +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1854,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_func_help_header(struct seq_file *m) +static void print_event_info(struct trace_array *tr, struct seq_file *m) +{ + unsigned long total; + unsigned long entries; + + get_total_entries(tr, &total, &entries); + seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", + entries, total, num_online_cpus()); + seq_puts(m, "#\n"); +} + +static void print_func_help_header(struct trace_array *tr, struct seq_file *m) { - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + print_event_info(tr, m); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } +static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +{ + print_event_info(tr, m); + seq_puts(m, "# _-----=> irqs-off\n"); + seq_puts(m, "# / _----=> need-resched\n"); + seq_puts(m, "# | / _---=> hardirq/softirq\n"); + seq_puts(m, "# || / _--=> preempt-depth\n"); + seq_puts(m, "# ||| / delay\n"); + seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | |||| | |\n"); +} void print_trace_header(struct seq_file *m, struct trace_iterator *iter) @@ -1868,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long entries = 0; - unsigned long total = 0; - unsigned long count; + unsigned long entries; + unsigned long total; const char *name = "preemption"; - int cpu; if (type) name = type->name; - - for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; - /* total is the same as the entries */ - total += count; - } else - total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); - entries += count; - } + get_total_entries(tr, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2139,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_latency_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_trace_header(m, iter); + + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); +} + void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; @@ -2154,11 +2204,23 @@ void trace_default_header(struct seq_file *m) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); + if (!(trace_flags & TRACE_ITER_VERBOSE)) { + if (trace_flags & TRACE_ITER_IRQ_INFO) + print_func_help_header_irq(iter->tr, m); + else + print_func_help_header(iter->tr, m); + } } } +static void test_ftrace_alive(struct seq_file *m) +{ + if (!ftrace_is_dead()) + return; + seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2168,6 +2230,7 @@ static int s_show(struct seq_file *m, void *v) if (iter->tr) { seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); + test_ftrace_alive(m); } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); @@ -2710,9 +2773,9 @@ static const char readme_msg[] = "# cat /sys/kernel/debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" - "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" + "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" + "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" ; static ssize_t @@ -3569,6 +3632,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } static ssize_t +tracing_total_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r, cpu; + unsigned long size = 0, expanded_size = 0; + + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + size += tr->entries >> 10; + if (!ring_buffer_expanded) + expanded_size += trace_buf_size >> 10; + } + if (ring_buffer_expanded) + r = sprintf(buf, "%lu\n", size); + else + r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3594,22 +3681,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static int mark_printk(const char *fmt, ...) -{ - int ret; - va_list args; - va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); - va_end(args); - return ret; -} - static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { - char *buf; - size_t written; + unsigned long addr = (unsigned long)ubuf; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + int nr_pages = 1; + ssize_t written; + void *page1; + void *page2; + int offset; + int size; + int len; + int ret; if (tracing_disabled) return -EINVAL; @@ -3617,28 +3706,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 2, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; + /* + * Userspace is injecting traces into the kernel trace buffer. + * We want to be as non intrusive as possible. + * To do so, we do not want to allocate any special buffers + * or take any locks, but instead write the userspace data + * straight into the ring buffer. + * + * First we need to pin the userspace buffer into memory, + * which, most likely it is, because it just referenced it. + * But there's no guarantee that it is. By using get_user_pages_fast() + * and kmap_atomic/kunmap_atomic() we can get access to the + * pages directly. We then write the data directly into the + * ring buffer. + */ + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - if (copy_from_user(buf, ubuf, cnt)) { - kfree(buf); - return -EFAULT; + /* check if we cross pages */ + if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) + nr_pages = 2; + + offset = addr & (PAGE_SIZE - 1); + addr &= PAGE_MASK; + + ret = get_user_pages_fast(addr, nr_pages, 0, pages); + if (ret < nr_pages) { + while (--ret >= 0) + put_page(pages[ret]); + written = -EFAULT; + goto out; } - if (buf[cnt-1] != '\n') { - buf[cnt] = '\n'; - buf[cnt+1] = '\0'; + + page1 = kmap_atomic(pages[0]); + if (nr_pages == 2) + page2 = kmap_atomic(pages[1]); + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* possible \n added */ + buffer = global_trace.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, preempt_count()); + if (!event) { + /* Ring buffer disabled, return as if not open for write */ + written = -EBADF; + goto out_unlock; + } + + entry = ring_buffer_event_data(event); + entry->ip = _THIS_IP_; + + if (nr_pages == 2) { + len = PAGE_SIZE - offset; + memcpy(&entry->buf, page1 + offset, len); + memcpy(&entry->buf[len], page2, cnt - len); } else - buf[cnt] = '\0'; + memcpy(&entry->buf, page1 + offset, cnt); - written = mark_printk("%s", buf); - kfree(buf); - *fpos += written; + if (entry->buf[cnt - 1] != '\n') { + entry->buf[cnt] = '\n'; + entry->buf[cnt + 1] = '\0'; + } else + entry->buf[cnt] = '\0'; + + ring_buffer_unlock_commit(buffer, event); + + written = cnt; - /* don't tell userspace we wrote more - it might confuse them */ - if (written > cnt) - written = cnt; + *fpos += written; + out_unlock: + if (nr_pages == 2) + kunmap_atomic(page2); + kunmap_atomic(page1); + while (nr_pages > 0) + put_page(pages[--nr_pages]); + out: return written; } @@ -3739,6 +3881,12 @@ static const struct file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_total_entries_fops = { + .open = tracing_open_generic, + .read = tracing_total_entries_read, + .llseek = generic_file_llseek, +}; + static const struct file_operations tracing_free_buffer_fops = { .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, @@ -3808,8 +3956,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (info->read < PAGE_SIZE) goto read; - info->read = 0; - trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, @@ -3819,6 +3965,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) return 0; + info->read = 0; + read: size = PAGE_SIZE - info->read; if (size > count) @@ -4026,6 +4174,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, struct trace_array *tr = &global_trace; struct trace_seq *s; unsigned long cnt; + unsigned long long t; + unsigned long usec_rem; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -4042,6 +4192,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); + cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + trace_seq_printf(s, "bytes: %ld\n", cnt); + + t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); @@ -4277,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = { }; struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) @@ -4450,6 +4611,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + &global_trace, &tracing_total_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, &global_trace, &tracing_free_buffer_fops); @@ -4566,6 +4730,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) tracing_off(); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + if (disable_tracing) ftrace_kill(); @@ -4658,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { __ftrace_dump(true, oops_dump_mode); } +EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 616846bcfee5..b93ecbadad6d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -312,7 +312,7 @@ void tracing_reset_current(int cpu); void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); @@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -579,11 +580,13 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +extern int ftrace_is_dead(void); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } +static inline int ftrace_is_dead(void) { return 0; } #endif /* @@ -652,6 +655,7 @@ enum trace_iterator_flags { TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, + TRACE_ITER_IRQ_INFO = 0x800000, }; /* @@ -761,16 +765,10 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - /* - * Leaf nodes use field_name, ops is used by AND and OR - * nodes. The field_name is always freed when freeing a pred. - * We can overload field_name for ops and have it freed - * as well. - */ - union { - char *field_name; - unsigned short *ops; - }; + unsigned short *ops; +#ifdef CONFIG_FTRACE_STARTUP_TEST + struct ftrace_event_field *field; +#endif int offset; int not; int op; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6302747a1398..394783531cbb 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void) return now; } + +static atomic64_t trace_counter; + +/* + * trace_clock_counter(): simply an atomic counter. + * Use the trace_counter "counter" for cases where you do not care + * about timings, but are interested in strict ordering. + */ +u64 notrace trace_clock_counter(void) +{ + return atomic64_add_return(1, &trace_counter); +} diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 581876f9f387..c212a7f934ec 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { - __get_system(system); system->nr_events++; return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 256764ecccd6..f04cc3136bd3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -27,6 +27,12 @@ #include "trace.h" #include "trace_output.h" +#define DEFAULT_SYS_FILTER_MESSAGE \ + "### global filter ###\n" \ + "# Use this to set filters for multiple events.\n" \ + "# Only events with the given fields will be affected.\n" \ + "# If no events are modified, an error message will be displayed here" + enum filter_op_ids { OP_OR, @@ -381,6 +387,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +enum walk_return { + WALK_PRED_ABORT, + WALK_PRED_PARENT, + WALK_PRED_DEFAULT, +}; + +typedef int (*filter_pred_walkcb_t) (enum move_type move, + struct filter_pred *pred, + int *err, void *data); + +static int walk_pred_tree(struct filter_pred *preds, + struct filter_pred *root, + filter_pred_walkcb_t cb, void *data) +{ + struct filter_pred *pred = root; + enum move_type move = MOVE_DOWN; + int done = 0; + + if (!preds) + return -EINVAL; + + do { + int err = 0, ret; + + ret = cb(move, pred, &err, data); + if (ret == WALK_PRED_ABORT) + return err; + if (ret == WALK_PRED_PARENT) + goto get_parent; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + goto get_parent; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + get_parent: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, + &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + /* * A series of AND or ORs where found together. Instead of * climbing up and down the tree branches, an array of the @@ -410,99 +473,91 @@ static int process_ops(struct filter_pred *preds, for (i = 0; i < op->val; i++) { pred = &preds[op->ops[i]]; - match = pred->fn(pred, rec); + if (!WARN_ON_ONCE(!pred->fn)) + match = pred->fn(pred, rec); if (!!match == type) return match; } return match; } +struct filter_match_preds_data { + struct filter_pred *preds; + int match; + void *rec; +}; + +static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_match_preds_data *d = data; + + *err = 0; + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) + return WALK_PRED_DEFAULT; + /* We can treat folded ops as a leaf node */ + d->match = process_ops(d->preds, pred, d->rec); + } else { + if (!WARN_ON_ONCE(!pred->fn)) + d->match = pred->fn(pred, d->rec); + } + + return WALK_PRED_PARENT; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!d->match == (pred->op == OP_OR)) + return WALK_PRED_PARENT; + break; + case MOVE_UP_FROM_RIGHT: + break; + } + + return WALK_PRED_DEFAULT; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1; - enum move_type move = MOVE_DOWN; struct filter_pred *preds; - struct filter_pred *pred; struct filter_pred *root; - int n_preds; - int done = 0; + struct filter_match_preds_data data = { + /* match is currently meaningless */ + .match = -1, + .rec = rec, + }; + int n_preds, ret; /* no filter is considered a match */ if (!filter) return 1; n_preds = filter->n_preds; - if (!n_preds) return 1; /* * n_preds, root and filter->preds are protect with preemption disabled. */ - preds = rcu_dereference_sched(filter->preds); root = rcu_dereference_sched(filter->root); if (!root) return 1; - pred = root; - - /* match is currently meaningless */ - match = -1; - - do { - switch (move) { - case MOVE_DOWN: - /* only AND and OR have children */ - if (pred->left != FILTER_PRED_INVALID) { - /* If ops is set, then it was folded. */ - if (!pred->ops) { - /* keep going to down the left side */ - pred = &preds[pred->left]; - continue; - } - /* We can treat folded ops as a leaf node */ - match = process_ops(preds, pred, rec); - } else - match = pred->fn(pred, rec); - /* If this pred is the only pred */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - /* - * Check for short circuits. - * - * Optimization: !!match == (pred->op == OP_OR) - * is the same as: - * if ((match && pred->op == OP_OR) || - * (!match && pred->op == OP_AND)) - */ - if (!!match == (pred->op == OP_OR)) { - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - /* now go down the right side of the tree. */ - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - /* We finished this equation. */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return match; + data.preds = preds = rcu_dereference_sched(filter->preds); + ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); + WARN_ON(ret); + return data.match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -597,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); + trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } @@ -628,22 +683,6 @@ find_event_field(struct ftrace_event_call *call, char *name) return __find_event_field(head, name); } -static void filter_free_pred(struct filter_pred *pred) -{ - if (!pred) - return; - - kfree(pred->field_name); - kfree(pred); -} - -static void filter_clear_pred(struct filter_pred *pred) -{ - kfree(pred->field_name); - pred->field_name = NULL; - pred->regex.len = 0; -} - static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); @@ -689,20 +728,13 @@ __pop_pred_stack(struct pred_stack *stack) static int filter_set_pred(struct event_filter *filter, int idx, struct pred_stack *stack, - struct filter_pred *src, - filter_pred_fn_t fn) + struct filter_pred *src) { struct filter_pred *dest = &filter->preds[idx]; struct filter_pred *left; struct filter_pred *right; *dest = *src; - if (src->field_name) { - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; - } - dest->fn = fn; dest->index = idx; if (dest->op == OP_OR || dest->op == OP_AND) { @@ -743,11 +775,7 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { - int i; - if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) - kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -840,23 +868,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system) } } -static int filter_add_pred_fn(struct filter_parse_state *ps, - struct ftrace_event_call *call, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack, - filter_pred_fn_t fn) +static int filter_add_pred(struct filter_parse_state *ps, + struct event_filter *filter, + struct filter_pred *pred, + struct pred_stack *stack) { - int idx, err; + int err; if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } - idx = filter->n_preds; - filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(filter, idx, stack, pred, fn); + err = filter_set_pred(filter, filter->n_preds, stack, pred); if (err) return err; @@ -937,31 +961,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, return fn; } -static int filter_add_pred(struct filter_parse_state *ps, - struct ftrace_event_call *call, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack, - bool dry_run) +static int init_pred(struct filter_parse_state *ps, + struct ftrace_event_field *field, + struct filter_pred *pred) + { - struct ftrace_event_field *field; - filter_pred_fn_t fn; + filter_pred_fn_t fn = filter_pred_none; unsigned long long val; int ret; - fn = pred->fn = filter_pred_none; - - if (pred->op == OP_AND) - goto add_pred_fn; - else if (pred->op == OP_OR) - goto add_pred_fn; - - field = find_event_field(call, pred->field_name); - if (!field) { - parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); - return -EINVAL; - } - pred->offset = field->offset; if (!is_legal_op(field, pred->op)) { @@ -1001,9 +1009,7 @@ static int filter_add_pred(struct filter_parse_state *ps, if (pred->op == OP_NE) pred->not = 1; -add_pred_fn: - if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, stack, fn); + pred->fn = fn; return 0; } @@ -1302,39 +1308,37 @@ parse_operand: return 0; } -static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +static struct filter_pred *create_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + int op, char *operand1, char *operand2) { - struct filter_pred *pred; + struct ftrace_event_field *field; + static struct filter_pred pred; - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - return NULL; + memset(&pred, 0, sizeof(pred)); + pred.op = op; - pred->field_name = kstrdup(operand1, GFP_KERNEL); - if (!pred->field_name) { - kfree(pred); + if (op == OP_AND || op == OP_OR) + return &pred; + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); return NULL; } - strcpy(pred->regex.pattern, operand2); - pred->regex.len = strlen(pred->regex.pattern); - - pred->op = op; - - return pred; -} - -static struct filter_pred *create_logical_pred(int op) -{ - struct filter_pred *pred; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + field = find_event_field(call, operand1); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return NULL; + } - pred->op = op; + strcpy(pred.regex.pattern, operand2); + pred.regex.len = strlen(pred.regex.pattern); - return pred; +#ifdef CONFIG_FTRACE_STARTUP_TEST + pred.field = field; +#endif + return init_pred(ps, field, &pred) ? NULL : &pred; } static int check_preds(struct filter_parse_state *ps) @@ -1375,6 +1379,23 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +struct check_pred_data { + int count; + int max; +}; + +static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct check_pred_data *d = data; + + if (WARN_ON(d->count++ > d->max)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + return WALK_PRED_DEFAULT; +} + /* * The tree is walked at filtering of an event. If the tree is not correctly * built, it may cause an infinite loop. Check here that the tree does @@ -1383,107 +1404,76 @@ static int count_preds(struct filter_parse_state *ps) static int check_pred_tree(struct event_filter *filter, struct filter_pred *root) { - struct filter_pred *preds; - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; - int done = 0; - int max; - - /* - * The max that we can hit a node is three times. - * Once going down, once coming up from left, and - * once coming up from right. This is more than enough - * since leafs are only hit a single time. - */ - max = 3 * filter->n_preds; + struct check_pred_data data = { + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + .max = 3 * filter->n_preds, + .count = 0, + }; - preds = filter->preds; - if (!preds) - return -EINVAL; - pred = root; + return walk_pred_tree(filter->preds, root, + check_pred_tree_cb, &data); +} - do { - if (WARN_ON(count++ > max)) - return -EINVAL; +static int count_leafs_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + int *count = data; - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) + (*count)++; - /* We are fine. */ - return 0; + return WALK_PRED_DEFAULT; } static int count_leafs(struct filter_pred *preds, struct filter_pred *root) { - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; - int done = 0; + int count = 0, ret; - pred = root; + ret = walk_pred_tree(preds, root, count_leafs_cb, &count); + WARN_ON(ret); + return count; +} - do { - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - return 1; - count++; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); +struct fold_pred_data { + struct filter_pred *root; + int count; + int children; +}; - return count; +static int fold_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct fold_pred_data *d = data; + struct filter_pred *root = d->root; + + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (pred->left != FILTER_PRED_INVALID) + return WALK_PRED_DEFAULT; + + if (WARN_ON(d->count == d->children)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + + pred->index &= ~FILTER_PRED_FOLD; + root->ops[d->count++] = pred->index; + return WALK_PRED_DEFAULT; } static int fold_pred(struct filter_pred *preds, struct filter_pred *root) { - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; + struct fold_pred_data data = { + .root = root, + .count = 0, + }; int children; - int done = 0; /* No need to keep the fold flag */ root->index &= ~FILTER_PRED_FOLD; @@ -1501,37 +1491,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) return -ENOMEM; root->val = children; + data.children = children; + return walk_pred_tree(preds, root, fold_pred_cb, &data); +} - pred = root; - do { - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - if (WARN_ON(count == children)) - return -EINVAL; - pred->index &= ~FILTER_PRED_FOLD; - root->ops[count++] = pred->index; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); +static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_pred *preds = data; - return 0; + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (!(pred->index & FILTER_PRED_FOLD)) + return WALK_PRED_DEFAULT; + + *err = fold_pred(preds, pred); + if (*err) + return WALK_PRED_ABORT; + + /* eveyrhing below is folded, continue with parent */ + return WALK_PRED_PARENT; } /* @@ -1542,51 +1521,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) static int fold_pred_tree(struct event_filter *filter, struct filter_pred *root) { - struct filter_pred *preds; - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int done = 0; - int err; - - preds = filter->preds; - if (!preds) - return -EINVAL; - pred = root; - - do { - switch (move) { - case MOVE_DOWN: - if (pred->index & FILTER_PRED_FOLD) { - err = fold_pred(preds, pred); - if (err) - return err; - /* Folded nodes are like leafs */ - } else if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return 0; + return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, + filter->preds); } static int replace_preds(struct ftrace_event_call *call, @@ -1643,27 +1579,17 @@ static int replace_preds(struct ftrace_event_call *call, goto fail; } - if (elt->op == OP_AND || elt->op == OP_OR) { - pred = create_logical_pred(elt->op); - goto add_pred; - } - - if (!operand1 || !operand2) { - parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + pred = create_pred(ps, call, elt->op, operand1, operand2); + if (!pred) { err = -EINVAL; goto fail; } - pred = create_pred(elt->op, operand1, operand2); -add_pred: - if (!pred) { - err = -ENOMEM; - goto fail; + if (!dry_run) { + err = filter_add_pred(ps, filter, pred, &stack); + if (err) + goto fail; } - err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); - filter_free_pred(pred); - if (err) - goto fail; operand1 = operand2 = NULL; } @@ -1729,7 +1655,9 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - goto fail; + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; } list_for_each_entry(call, &ftrace_events, list) { @@ -1738,6 +1666,9 @@ static int replace_system_preds(struct event_subsystem *system, if (strcmp(call->class->system, system->name) != 0) continue; + if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + continue; + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); if (!filter_item) goto fail_mem; @@ -1766,7 +1697,7 @@ static int replace_system_preds(struct event_subsystem *system, * replace the filter for the call. */ filter = call->filter; - call->filter = filter_item->filter; + rcu_assign_pointer(call->filter, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1821,7 +1752,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) filter = call->filter; if (!filter) goto out_unlock; - call->filter = NULL; + RCU_INIT_POINTER(call->filter, NULL); /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); @@ -1862,7 +1793,7 @@ out: * string */ tmp = call->filter; - call->filter = filter; + rcu_assign_pointer(call->filter, filter); if (tmp) { /* Make sure the call is done with the filter */ synchronize_sched(); @@ -1913,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!filter) goto out; - replace_filter_string(filter, filter_string); + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + /* * No event actually uses the system filter * we can free it without synchronize_sched(). @@ -1923,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); - if (err) { - append_filter_err(ps, system->filter); - goto out; - } + if (err) + goto err_filter; err = replace_system_preds(system, ps, filter_string); if (err) - append_filter_err(ps, system->filter); + goto err_filter; out: filter_opstack_clear(ps); @@ -1940,6 +1872,11 @@ out_unlock: mutex_unlock(&event_mutex); return err; + +err_filter: + replace_filter_string(filter, filter_string); + append_filter_err(ps, system->filter); + goto out; } #ifdef CONFIG_PERF_EVENTS @@ -1958,17 +1895,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, int err; struct event_filter *filter; struct filter_parse_state *ps; - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (call->event.type == event_id) - break; - } + call = event->tp_event; err = -EINVAL; - if (&call->list == &ftrace_events) + if (!call) goto out_unlock; err = -EEXIST; @@ -2012,3 +1946,215 @@ out_unlock: #endif /* CONFIG_PERF_EVENTS */ +#ifdef CONFIG_FTRACE_STARTUP_TEST + +#include <linux/types.h> +#include <linux/tracepoint.h> + +#define CREATE_TRACE_POINTS +#include "trace_events_filter_test.h" + +static int test_get_filter(char *filter_str, struct ftrace_event_call *call, + struct event_filter **pfilter) +{ + struct event_filter *filter; + struct filter_parse_state *ps; + int err = -ENOMEM; + + filter = __alloc_filter(); + if (!filter) + goto out; + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_filter; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + *pfilter = filter; + + free_ps: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + free_filter: + if (err) + __free_filter(filter); + + out: + return err; +} + +#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ +{ \ + .filter = FILTER, \ + .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ + .e = ve, .f = vf, .g = vg, .h = vh }, \ + .match = m, \ + .not_visited = nvisit, \ +} +#define YES 1 +#define NO 0 + +static struct test_filter_data_t { + char *filter; + struct ftrace_raw_ftrace_test_filter rec; + int match; + char *not_visited; +} test_filter_data[] = { +#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ + "e == 1 && f == 1 && g == 1 && h == 1" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), + DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), + DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), +#undef FILTER +#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ + "e == 1 || f == 1 || g == 1 || h == 1" + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), +#undef FILTER +#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ + "(e == 1 || f == 1) && (g == 1 || h == 1)" + DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), + DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), +#undef FILTER +#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), +#undef FILTER +#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), +#undef FILTER +#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ + "(e == 1 || f == 1)) && (g == 1 || h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), +#undef FILTER +#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ + "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), + DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), +#undef FILTER +#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ + "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), +}; + +#undef DATA_REC +#undef FILTER +#undef YES +#undef NO + +#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) + +static int test_pred_visited; + +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + struct ftrace_event_field *field = pred->field; + + test_pred_visited = 1; + printk(KERN_INFO "\npred visited %s\n", field->name); + return 1; +} + +static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + char *fields = data; + + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) { + struct ftrace_event_field *field = pred->field; + + if (!field) { + WARN(1, "all leafs should have field defined"); + return WALK_PRED_DEFAULT; + } + if (!strchr(fields, *field->name)) + return WALK_PRED_DEFAULT; + + WARN_ON(!pred->fn); + pred->fn = test_pred_visited_fn; + } + return WALK_PRED_DEFAULT; +} + +static __init int ftrace_test_event_filter(void) +{ + int i; + + printk(KERN_INFO "Testing ftrace filter: "); + + for (i = 0; i < DATA_CNT; i++) { + struct event_filter *filter = NULL; + struct test_filter_data_t *d = &test_filter_data[i]; + int err; + + err = test_get_filter(d->filter, &event_ftrace_test_filter, + &filter); + if (err) { + printk(KERN_INFO + "Failed to get filter for '%s', err %d\n", + d->filter, err); + break; + } + + /* + * The preemption disabling is not really needed for self + * tests, but the rcu dereference will complain without it. + */ + preempt_disable(); + if (*d->not_visited) + walk_pred_tree(filter->preds, filter->root, + test_walk_pred_cb, + d->not_visited); + + test_pred_visited = 0; + err = filter_match_preds(filter, &d->rec); + preempt_enable(); + + __free_filter(filter); + + if (test_pred_visited) { + printk(KERN_INFO + "Failed, unwanted pred visited for filter %s\n", + d->filter); + break; + } + + if (err != d->match) { + printk(KERN_INFO + "Failed to match filter '%s', expected %d\n", + d->filter, d->match); + break; + } + } + + if (i == DATA_CNT) + printk(KERN_CONT "OK\n"); + + return 0; +} + +late_initcall(ftrace_test_event_filter); + +#endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 000000000000..bfd4dba0d603 --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h @@ -0,0 +1,50 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM test + +#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TEST_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(ftrace_test_filter, + + TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), + + TP_ARGS(a, b, c, d, e, f, g, h), + + TP_STRUCT__entry( + __field(int, a) + __field(int, b) + __field(int, c) + __field(int, d) + __field(int, e) + __field(int, f) + __field(int, g) + __field(int, h) + ), + + TP_fast_assign( + __entry->a = a; + __entry->b = b; + __entry->c = c; + __entry->d = d; + __entry->e = e; + __entry->f = f; + __entry->g = g; + __entry->h = h; + ), + + TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", + __entry->a, __entry->b, __entry->c, __entry->d, + __entry->e, __entry->f, __entry->g, __entry->h) +); + +#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_events_filter_test + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 667aa8cc0cfc..99d20e920368 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); -static DEFINE_SPINLOCK(max_trace_lock); +static DEFINE_RAW_SPINLOCK(max_trace_lock); enum { TRACER_IRQS_OFF = (1 << 1), @@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) } static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } -static void irqsoff_print_header(struct seq_file *s) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void irqsoff_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* @@ -321,7 +332,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock_irqsave(&max_trace_lock, flags); + raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -344,7 +355,7 @@ check_critical_timing(struct trace_array *tr, max_sequence++; out_unlock: - spin_unlock_irqrestore(&max_trace_lock, flags); + raw_spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; @@ -505,13 +516,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } #endif /* CONFIG_PREEMPT_TRACER */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5fb3697bf0e5..00d527c945a4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp) } /* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static void unregister_trace_probe(struct trace_probe *tp) +static int unregister_trace_probe(struct trace_probe *tp) { + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(tp)) + return -EBUSY; + __unregister_trace_probe(tp); list_del(&tp->list); unregister_probe_event(tp); + + return 0; } /* Register a trace_probe and probe_event */ @@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp) /* Delete old (same name) event if exist */ old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { - unregister_trace_probe(old_tp); + ret = unregister_trace_probe(old_tp); + if (ret < 0) + goto end; free_trace_probe(old_tp); } @@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb, mutex_lock(&probe_lock); list_for_each_entry(tp, &probe_list, list) { if (trace_probe_within_module(tp, mod)) { + /* Don't need to check busy - this should have gone. */ __unregister_trace_probe(tp); ret = __register_trace_probe(tp); if (ret) @@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv) return -ENOENT; } /* delete an event */ - unregister_trace_probe(tp); - free_trace_probe(tp); + ret = unregister_trace_probe(tp); + if (ret == 0) + free_trace_probe(tp); mutex_unlock(&probe_lock); - return 0; + return ret; } if (argc < 2) { @@ -1317,18 +1327,29 @@ error: return ret; } -static void release_all_trace_probes(void) +static int release_all_trace_probes(void) { struct trace_probe *tp; + int ret = 0; mutex_lock(&probe_lock); + /* Ensure no probe is in use. */ + list_for_each_entry(tp, &probe_list, list) + if (trace_probe_is_enabled(tp)) { + ret = -EBUSY; + goto end; + } /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { tp = list_entry(probe_list.next, struct trace_probe, list); unregister_trace_probe(tp); free_trace_probe(tp); } + +end: mutex_unlock(&probe_lock); + + return ret; } /* Probes listing interfaces */ @@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = { static int probes_open(struct inode *inode, struct file *file) { - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - release_all_trace_probes(); + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = release_all_trace_probes(); + if (ret < 0) + return ret; + } return seq_open(file, &probes_seq_op); } @@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); + /* Disable trace points before removing it */ + tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting test probe.\n"); + warn++; + } else + disable_trace_probe(tp, TP_FLAG_TRACE); + + tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting 2nd test probe.\n"); + warn++; + } else + disable_trace_probe(tp, TP_FLAG_TRACE); + ret = command_trace_probe("-:testprobe"); if (WARN_ON_ONCE(ret)) { pr_warning("error on deleting a probe.\n"); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 51999309a6cf..0d6ff3555942 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter) unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; char comm[TASK_COMM_LEN]; + int ret; trace_find_cmdline(entry->pid, comm); - return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", - comm, entry->pid, iter->cpu, secs, usec_rem); + ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", + comm, entry->pid, iter->cpu); + if (!ret) + return 0; + + if (trace_flags & TRACE_ITER_IRQ_INFO) { + ret = trace_print_lat_fmt(s, entry); + if (!ret) + return 0; + } + + return trace_seq_printf(s, " %5lu.%06lu: ", + secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 1f06468a10d7..6fd4ffd042f9 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) continue; } + fmt = NULL; tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); - if (tb_fmt) + if (tb_fmt) { fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt && fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(fmt, *iter); - tb_fmt->fmt = fmt; - *iter = tb_fmt->fmt; - } else { - kfree(tb_fmt); - *iter = NULL; + if (fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; + } else + kfree(tb_fmt); } + *iter = fmt; + } mutex_unlock(&btrace_mutex); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e4a70c0c71b6..ff791ea48b57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) } static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } -static void wakeup_print_header(struct seq_file *s) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void wakeup_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void wakeup_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ee7b5a0bb9f8..cb654542c1a1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,6 +2,7 @@ #include <trace/events/syscalls.h> #include <linux/slab.h> #include <linux/kernel.h> +#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ #include <linux/ftrace.h> #include <linux/perf_event.h> #include <asm/syscall.h> diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index b219f1449c54..db110b8ae030 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[]; static const int tracepoint_debug; /* - * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the - * builtin and module tracepoints and the hash table. + * Tracepoints mutex protects the builtin and module tracepoints and the hash + * table, as well as the local module list. */ static DEFINE_MUTEX(tracepoints_mutex); +#ifdef CONFIG_MODULES +/* Local list of struct module */ +static LIST_HEAD(tracepoint_module_list); +#endif /* CONFIG_MODULES */ + /* * Tracepoint hash table, containing the active tracepoints. * Protected by tracepoints_mutex. @@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem) * @end: end of the range * * Updates the probe callback corresponding to a range of tracepoints. + * Called with tracepoints_mutex held. */ -void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tracepoint_update_probe_range(struct tracepoint * const *begin, + struct tracepoint * const *end) { struct tracepoint * const *iter; struct tracepoint_entry *mark_entry; @@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, if (!begin) return; - mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_tracepoint((*iter)->name); if (mark_entry) { @@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, disable_tracepoint(*iter); } } - mutex_unlock(&tracepoints_mutex); } +#ifdef CONFIG_MODULES +void module_update_tracepoints(void) +{ + struct tp_module *tp_mod; + + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, + tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); +} +#else /* CONFIG_MODULES */ +void module_update_tracepoints(void) +{ +} +#endif /* CONFIG_MODULES */ + + /* * Update probes, removing the faulty probes. + * Called with tracepoints_mutex held. */ static void tracepoint_update_probes(void) { @@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) mutex_lock(&tracepoints_mutex); old = tracepoint_add_probe(name, probe, data); - mutex_unlock(&tracepoints_mutex); - if (IS_ERR(old)) + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); - + } tracepoint_update_probes(); /* may update entry */ + mutex_unlock(&tracepoints_mutex); release_probes(old); return 0; } @@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) mutex_lock(&tracepoints_mutex); old = tracepoint_remove_probe(name, probe, data); - mutex_unlock(&tracepoints_mutex); - if (IS_ERR(old)) + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); - + } tracepoint_update_probes(); /* may update entry */ + mutex_unlock(&tracepoints_mutex); release_probes(old); return 0; } @@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void) if (!list_empty(&old_probes)) list_replace_init(&old_probes, &release_probes); need_update = 0; - mutex_unlock(&tracepoints_mutex); - tracepoint_update_probes(); + mutex_unlock(&tracepoints_mutex); list_for_each_entry_safe(pos, next, &release_probes, u.list) { list_del(&pos->u.list); call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); @@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); * Will return the first tracepoint in the range if the input tracepoint is * NULL. */ -int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, +static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, struct tracepoint * const *begin, struct tracepoint * const *end) { if (!*tracepoint && begin != end) { @@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, return 1; return 0; } -EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); +#ifdef CONFIG_MODULES static void tracepoint_get_iter(struct tracepoint_iter *iter) { int found = 0; + struct tp_module *iter_mod; /* Core kernel tracepoints */ if (!iter->module) { @@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) if (found) goto end; } - /* tracepoints in modules. */ - found = module_get_iter_tracepoints(iter); + /* Tracepoints in modules */ + mutex_lock(&tracepoints_mutex); + list_for_each_entry(iter_mod, &tracepoint_module_list, list) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints_ptrs, + iter_mod->tracepoints_ptrs + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; + break; + } + } + mutex_unlock(&tracepoints_mutex); end: if (!found) tracepoint_iter_reset(iter); } +#else /* CONFIG_MODULES */ +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ + int found = 0; + + /* Core kernel tracepoints */ + found = tracepoint_get_iter_range(&iter->tracepoint, + __start___tracepoints_ptrs, + __stop___tracepoints_ptrs); + if (!found) + tracepoint_iter_reset(iter); +} +#endif /* CONFIG_MODULES */ void tracepoint_iter_start(struct tracepoint_iter *iter) { @@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop); void tracepoint_iter_reset(struct tracepoint_iter *iter) { +#ifdef CONFIG_MODULES iter->module = NULL; +#endif /* CONFIG_MODULES */ iter->tracepoint = NULL; } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +static int tracepoint_module_coming(struct module *mod) +{ + struct tp_module *tp_mod, *iter; + int ret = 0; + + /* + * We skip modules that tain the kernel, especially those with different + * module header (for forced load), to make sure we don't cause a crash. + */ + if (mod->taints) + return 0; + mutex_lock(&tracepoints_mutex); + tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); + if (!tp_mod) { + ret = -ENOMEM; + goto end; + } + tp_mod->num_tracepoints = mod->num_tracepoints; + tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; + + /* + * tracepoint_module_list is kept sorted by struct module pointer + * address for iteration on tracepoints from a seq_file that can release + * the mutex between calls. + */ + list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { + BUG_ON(iter == tp_mod); /* Should never be in the list twice */ + if (iter < tp_mod) { + /* We belong to the location right after iter. */ + list_add(&tp_mod->list, &iter->list); + goto module_added; + } + } + /* We belong to the beginning of the list */ + list_add(&tp_mod->list, &tracepoint_module_list); +module_added: + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} + +static int tracepoint_module_going(struct module *mod) +{ + struct tp_module *pos; + + mutex_lock(&tracepoints_mutex); + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); + list_for_each_entry(pos, &tracepoint_module_list, list) { + if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { + list_del(&pos->list); + kfree(pos); + break; + } + } + /* + * In the case of modules that were tainted at "coming", we'll simply + * walk through the list without finding it. We cannot use the "tainted" + * flag on "going", in case a module taints the kernel only after being + * loaded. + */ + mutex_unlock(&tracepoints_mutex); + return 0; +} int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; + int ret = 0; switch (val) { case MODULE_STATE_COMING: + ret = tracepoint_module_coming(mod); + break; + case MODULE_STATE_LIVE: + break; case MODULE_STATE_GOING: - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + ret = tracepoint_module_going(mod); break; } - return 0; + return ret; } struct notifier_block tracepoint_module_nb = { @@ -598,7 +724,6 @@ static int init_tracepoints(void) return register_module_notifier(&tracepoint_module_nb); } __initcall(init_tracepoints); - #endif /* CONFIG_MODULES */ #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 5bbfac85866e..23b4d784ebdd 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk) local_irq_save(flags); time = tsk->stime + tsk->utime; - dtime = cputime_sub(time, tsk->acct_timexpd); + dtime = time - tsk->acct_timexpd; jiffies_to_timeval(cputime_to_jiffies(dtime), &value); delta = value.tv_sec; delta = delta * USEC_PER_SEC + value.tv_usec; diff --git a/kernel/up.c b/kernel/up.c index 1ff27a28bb7d..c54c75e9faf7 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -4,7 +4,7 @@ #include <linux/interrupt.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/smp.h> int smp_call_function_single(int cpu, void (*func) (void *info), void *info, diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 92cb706c7fc8..1744bb80f1fb 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -2,7 +2,7 @@ #include <linux/user-return-notifier.h> #include <linux/percpu.h> #include <linux/sched.h> -#include <linux/module.h> +#include <linux/export.h> static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); diff --git a/kernel/user.c b/kernel/user.c index 9e03e9c1df8d..71dd2363ab0f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -14,7 +14,7 @@ #include <linux/bitops.h> #include <linux/key.h> #include <linux/interrupt.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/user_namespace.h> /* diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9da289c34f22..3b906e98b1db 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -5,7 +5,7 @@ * License. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index bff131b9510a..405caf91aad5 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -9,7 +9,7 @@ * License. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/uts.h> #include <linux/utsname.h> #include <linux/err.h> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e70d4d..63da38c2d820 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -9,10 +9,11 @@ * License. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/uts.h> #include <linux/utsname.h> #include <linux/sysctl.h> +#include <linux/wait.h> static void *get_uts(ctl_table *table, int write) { @@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write, uts_table.data = get_uts(table, write); r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); + + if (write) + proc_sys_poll_notify(table->poll); + return r; } #else #define proc_do_uts_string NULL #endif +static DEFINE_CTL_TABLE_POLL(hostname_poll); +static DEFINE_CTL_TABLE_POLL(domainname_poll); + static struct ctl_table uts_kern_table[] = { { .procname = "ostype", @@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &hostname_poll, }, { .procname = "domainname", @@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, + .poll = &domainname_poll, }, {} }; @@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = { {} }; +#ifdef CONFIG_PROC_SYSCTL +/* + * Notify userspace about a change in a certain entry of uts_kern_table, + * identified by the parameter proc. + */ +void uts_proc_notify(enum uts_proc proc) +{ + struct ctl_table *table = &uts_kern_table[proc]; + + proc_sys_poll_notify(table->poll); +} +#endif + static int __init utsname_sysctl_init(void) { register_sysctl_table(uts_root_table); diff --git a/kernel/wait.c b/kernel/wait.c index f45ea8d2a1ce..7fdd9eaca2c3 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -4,16 +4,16 @@ * (C) 2004 William Irwin, Oracle */ #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/wait.h> #include <linux/hash.h> -void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) +void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { spin_lock_init(&q->lock); - lockdep_set_class(&q->lock, key); + lockdep_set_class_and_name(&q->lock, key, name); INIT_LIST_HEAD(&q->task_list); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 36491cd5b7d4..1d7bca7f4f52 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -350,7 +350,8 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); - + param.sched_priority = 0; + sched_setscheduler(current, SCHED_NORMAL, ¶m); return 0; } @@ -438,7 +439,7 @@ static int watchdog_enable(int cpu) /* create the watchdog thread */ if (!p) { - p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); if (!err) { @@ -480,6 +481,8 @@ static void watchdog_disable(int cpu) } } +/* sysctl functions */ +#ifdef CONFIG_SYSCTL static void watchdog_enable_all_cpus(void) { int cpu; @@ -509,8 +512,6 @@ static void watchdog_disable_all_cpus(void) } -/* sysctl functions */ -#ifdef CONFIG_SYSCTL /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1783aabc6128..bec7b5b53e03 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -23,7 +23,7 @@ * Please read Documentation/workqueue.txt for details. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> @@ -242,10 +242,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ - const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + char name[]; /* I: workqueue name */ }; struct workqueue_struct *system_wq __read_mostly; @@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } -struct workqueue_struct *__alloc_workqueue_key(const char *name, +struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, - const char *lock_name) + const char *lock_name, ...) { + va_list args, args1; struct workqueue_struct *wq; unsigned int cpu; + size_t namelen; + + /* determine namelen, allocate wq and format name */ + va_start(args, lock_name); + va_copy(args1, args); + namelen = vsnprintf(NULL, 0, fmt, args) + 1; + + wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + if (!wq) + goto err; + + vsnprintf(wq->name, namelen, fmt, args1); + va_end(args); + va_end(args1); /* * Workqueues which may be used during memory reclaim should @@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, flags |= WQ_HIGHPRI; max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, name); - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - goto err; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); @@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + rescuer->task = kthread_create(rescuer_thread, wq, "%s", + wq->name); if (IS_ERR(rescuer->task)) goto err; |