diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2020-05-13 12:14:05 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2020-05-13 12:14:05 -0400 |
commit | 4aef2ec9022b217f74d0f4c9b84081f07cc223d9 (patch) | |
tree | edf9bb9ca1f8ab6345c156a7e87aaed28939f66c /kernel | |
parent | 7c67f54661fcc8d141fb11abbab1739f32e13b03 (diff) | |
parent | 37486135d3a7b03acc7755b63627a130437f066a (diff) | |
download | linux-stable-4aef2ec9022b217f74d0f4c9b84081f07cc223d9.tar.gz linux-stable-4aef2ec9022b217f74d0f4c9b84081f07cc223d9.tar.bz2 linux-stable-4aef2ec9022b217f74d0f4c9b84081f07cc223d9.zip |
Merge branch 'kvm-amd-fixes' into HEAD
Diffstat (limited to 'kernel')
60 files changed, 1534 insertions, 697 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index 34d1e77ee9df..78701ea37c97 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,6 +1,4 @@ -# -# Generated files -# +# SPDX-License-Identifier: GPL-2.0-only kheaders.md5 timeconst.h hz.bc diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f0d243318452..3596448bfdab 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -160,23 +160,14 @@ static int audit_mark_handle_event(struct fsnotify_group *group, { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct audit_fsnotify_mark *audit_mark; - const struct inode *inode = NULL; + const struct inode *inode = fsnotify_data_inode(data, data_type); audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); BUG_ON(group != audit_fsnotify_group); - switch (data_type) { - case (FSNOTIFY_EVENT_PATH): - inode = ((const struct path *)data)->dentry->d_inode; - break; - case (FSNOTIFY_EVENT_INODE): - inode = (const struct inode *)data; - break; - default: - BUG(); + if (WARN_ON(!inode)) return 0; - } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 8a8fd732ff6d..e09c551ae52d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -471,25 +471,13 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - const struct inode *inode; + const struct inode *inode = fsnotify_data_inode(data, data_type); struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - - switch (data_type) { - case (FSNOTIFY_EVENT_PATH): - inode = d_backing_inode(((const struct path *)data)->dentry); - break; - case (FSNOTIFY_EVENT_INODE): - inode = (const struct inode *)data; - break; - default: - BUG(); - inode = NULL; - break; - } + WARN_ON(!inode); if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index f02504640e18..6b12f06ee18c 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -30,7 +30,7 @@ struct bpf_lru_node { struct bpf_lru_list { struct list_head lists[NR_BPF_LRU_LIST_T]; unsigned int counts[NR_BPF_LRU_LIST_COUNT]; - /* The next inacitve list rotation starts from here */ + /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; raw_spinlock_t lock ____cacheline_aligned_in_smp; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64783da34202..d85f37239540 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -586,9 +586,7 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - bpf_map_inc_with_uref(map); - - if (vma->vm_flags & VM_WRITE) { + if (vma->vm_flags & VM_MAYWRITE) { mutex_lock(&map->freeze_mutex); map->writecnt++; mutex_unlock(&map->freeze_mutex); @@ -600,13 +598,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_WRITE) { + if (vma->vm_flags & VM_MAYWRITE) { mutex_lock(&map->freeze_mutex); map->writecnt--; mutex_unlock(&map->freeze_mutex); } - - bpf_map_put_with_uref(map); } static const struct vm_operations_struct bpf_map_default_vmops = { @@ -635,14 +631,16 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; + vma->vm_flags &= ~VM_MAYEXEC; + if (!(vma->vm_flags & VM_WRITE)) + /* disallow re-mapping with PROT_WRITE */ + vma->vm_flags &= ~VM_MAYWRITE; err = map->ops->map_mmap(map, vma); if (err) goto out; - bpf_map_inc_with_uref(map); - - if (vma->vm_flags & VM_WRITE) + if (vma->vm_flags & VM_MAYWRITE) map->writecnt++; out: mutex_unlock(&map->freeze_mutex); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04c6630cc18f..38cfcf701eeb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1255,8 +1255,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? - true : false; + reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; __mark_reg_unbounded(reg); } diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f2d7cea86ffe..191c329e482a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -38,10 +38,7 @@ static bool cgroup_no_v1_named; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ +/* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) @@ -775,22 +772,29 @@ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; - mutex_lock(&cgroup_mutex); + /* snoop agent path and exit early if empty */ + if (!cgrp->root->release_agent_path[0]) + return; + /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf || !strlen(agentbuf)) - goto out; + agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out_free; - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); + spin_lock(&release_agent_path_lock); + strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); + spin_unlock(&release_agent_path_lock); + if (!agentbuf[0]) + goto out_free; + + ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0 || ret >= PATH_MAX) - goto out; + goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; @@ -801,11 +805,7 @@ void cgroup1_release_agent(struct work_struct *work) envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - mutex_unlock(&cgroup_mutex); call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); out_free: kfree(agentbuf); kfree(pathbuf); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 755c07d845ce..06b5ea9d899d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1966,7 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_SUPPORT_EXPORTOP, + KERNFS_ROOT_SUPPORT_EXPORTOP | + KERNFS_ROOT_SUPPORT_USER_XATTR, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -2726,11 +2727,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; - int ret; - - ret = cgroup_migrate_vet_dst(dst_cgrp); - if (ret) - return ret; + int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -4160,7 +4157,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &parent->children, sibling) + list_for_each_entry_rcu(next, &parent->children, sibling, + lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } @@ -4403,29 +4401,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); - /* Advance to the next non-empty css_set */ - do { - cset = css_task_iter_next_css_set(it); - if (!cset) { - it->task_pos = NULL; - return; + /* Advance to the next non-empty css_set and find first non-empty tasks list*/ + while ((cset = css_task_iter_next_css_set(it))) { + if (!list_empty(&cset->tasks)) { + it->cur_tasks_head = &cset->tasks; + break; + } else if (!list_empty(&cset->mg_tasks)) { + it->cur_tasks_head = &cset->mg_tasks; + break; + } else if (!list_empty(&cset->dying_tasks)) { + it->cur_tasks_head = &cset->dying_tasks; + break; } - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - - if (!list_empty(&cset->tasks)) { - it->task_pos = cset->tasks.next; - it->cur_tasks_head = &cset->tasks; - } else if (!list_empty(&cset->mg_tasks)) { - it->task_pos = cset->mg_tasks.next; - it->cur_tasks_head = &cset->mg_tasks; - } else { - it->task_pos = cset->dying_tasks.next; - it->cur_tasks_head = &cset->dying_tasks; } - - it->tasks_head = &cset->tasks; - it->mg_tasks_head = &cset->mg_tasks; - it->dying_tasks_head = &cset->dying_tasks; + if (!cset) { + it->task_pos = NULL; + return; + } + it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus @@ -4470,24 +4463,24 @@ static void css_task_iter_advance(struct css_task_iter *it) repeat: if (it->task_pos) { /* - * Advance iterator to find next entry. cset->tasks is - * consumed first and then ->mg_tasks. After ->mg_tasks, - * we move onto the next cset. + * Advance iterator to find next entry. We go through cset + * tasks, mg_tasks and dying_tasks, when consumed we move onto + * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) { - it->task_pos = it->mg_tasks_head->next; - it->cur_tasks_head = it->mg_tasks_head; + if (it->task_pos == &it->cur_cset->tasks) { + it->cur_tasks_head = &it->cur_cset->mg_tasks; + it->task_pos = it->cur_tasks_head->next; } - if (it->task_pos == it->mg_tasks_head) { - it->task_pos = it->dying_tasks_head->next; - it->cur_tasks_head = it->dying_tasks_head; + if (it->task_pos == &it->cur_cset->mg_tasks) { + it->cur_tasks_head = &it->cur_cset->dying_tasks; + it->task_pos = it->cur_tasks_head->next; } - if (it->task_pos == it->dying_tasks_head) + if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ @@ -4505,12 +4498,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (it->cur_tasks_head == it->dying_tasks_head && + if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (it->cur_tasks_head == it->dying_tasks_head) + if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } @@ -4674,13 +4667,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v) return 0; } +static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) +{ + int ret; + struct inode *inode; + + lockdep_assert_held(&cgroup_mutex); + + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + return ret; +} + static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb) { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; - struct inode *inode; int ret; lockdep_assert_held(&cgroup_mutex); @@ -4690,12 +4698,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; - - ret = inode_permission(inode, MAY_WRITE); - iput(inode); + ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; @@ -4711,6 +4714,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, return 0; } +static int cgroup_attach_permissions(struct cgroup *src_cgrp, + struct cgroup *dst_cgrp, + struct super_block *sb, bool threadgroup) +{ + int ret = 0; + + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + if (ret) + return ret; + + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; + + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) + ret = -EOPNOTSUPP; + + return ret; +} + static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -4733,8 +4756,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, true); if (ret) goto out_finish; @@ -4778,16 +4801,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, spin_unlock_irq(&css_set_lock); /* thread migrations follow the cgroup.procs delegation rule */ - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, false); if (ret) goto out_finish; - /* and must be contained in the same domain */ - ret = -EOPNOTSUPP; - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) - goto out_finish; - ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: @@ -5876,8 +5894,7 @@ out: * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() - * attaches it to the parent's css_set. Empty cg_list indicates that - * @child isn't holding reference to its css_set. + * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { @@ -5885,21 +5902,172 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} + +/** + * cgroup_css_set_fork - find or create a css_set for a child process + * @kargs: the arguments passed to create the child process + * + * This functions finds or creates a new css_set which the child + * process will be attached to in cgroup_post_fork(). By default, + * the child process will be given the same css_set as its parent. + * + * If CLONE_INTO_CGROUP is specified this function will try to find an + * existing css_set which includes the requested cgroup and if not create + * a new css_set that the child will be attached to later. If this function + * succeeds it will hold cgroup_threadgroup_rwsem on return. If + * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex + * before grabbing cgroup_threadgroup_rwsem and will hold a reference + * to the target cgroup. + */ +static int cgroup_css_set_fork(struct kernel_clone_args *kargs) + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) +{ + int ret; + struct cgroup *dst_cgrp = NULL; + struct css_set *cset; + struct super_block *sb; + struct file *f; + + if (kargs->flags & CLONE_INTO_CGROUP) + mutex_lock(&cgroup_mutex); + + cgroup_threadgroup_change_begin(current); + + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + if (!(kargs->flags & CLONE_INTO_CGROUP)) { + kargs->cset = cset; + return 0; + } + + f = fget_raw(kargs->cgroup); + if (!f) { + ret = -EBADF; + goto err; + } + sb = f->f_path.dentry->d_sb; + + dst_cgrp = cgroup_get_from_file(f); + if (IS_ERR(dst_cgrp)) { + ret = PTR_ERR(dst_cgrp); + dst_cgrp = NULL; + goto err; + } + + if (cgroup_is_dead(dst_cgrp)) { + ret = -ENODEV; + goto err; + } + + /* + * Verify that we the target cgroup is writable for us. This is + * usually done by the vfs layer but since we're not going through + * the vfs layer here we need to do it "manually". + */ + ret = cgroup_may_write(dst_cgrp, sb); + if (ret) + goto err; + + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, + !(kargs->flags & CLONE_THREAD)); + if (ret) + goto err; + + kargs->cset = find_css_set(cset, dst_cgrp); + if (!kargs->cset) { + ret = -ENOMEM; + goto err; + } + + put_css_set(cset); + fput(f); + kargs->cgrp = dst_cgrp; + return ret; + +err: + cgroup_threadgroup_change_end(current); + mutex_unlock(&cgroup_mutex); + if (f) + fput(f); + if (dst_cgrp) + cgroup_put(dst_cgrp); + put_css_set(cset); + if (kargs->cset) + put_css_set(kargs->cset); + return ret; +} + +/** + * cgroup_css_set_put_fork - drop references we took during fork + * @kargs: the arguments passed to create the child process + * + * Drop references to the prepared css_set and target cgroup if + * CLONE_INTO_CGROUP was requested. + */ +static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) +{ + cgroup_threadgroup_change_end(current); + + if (kargs->flags & CLONE_INTO_CGROUP) { + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; + + mutex_unlock(&cgroup_mutex); + + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + + if (cgrp) { + cgroup_put(cgrp); + kargs->cgrp = NULL; + } + } +} + /** * cgroup_can_fork - called on a new task before the process is exposed - * @child: the task in question. + * @child: the child process * - * This calls the subsystem can_fork() callbacks. If the can_fork() callback - * returns an error, the fork aborts with that error code. This allows for - * a cgroup subsystem to conditionally allow or deny new forks. + * This prepares a new css_set for the child process which the child will + * be attached to in cgroup_post_fork(). + * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() + * callback returns an error, the fork aborts with that error code. This + * allows for a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child) +int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; + ret = cgroup_css_set_fork(kargs); + if (ret) + return ret; + do_each_subsys_mask(ss, i, have_canfork_callback) { - ret = ss->can_fork(child); + ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); @@ -5911,54 +6079,64 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); } + cgroup_css_set_put_fork(kargs); + return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() - * @child: the task in question + * @child: the child process + * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeded. + * cgroup_can_fork() succeded and cleans up references we took to + * prepare a new css_set for the child process in cgroup_can_fork(). */ -void cgroup_cancel_fork(struct task_struct *child) +void cgroup_cancel_fork(struct task_struct *child, + struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); + + cgroup_css_set_put_fork(kargs); } /** - * cgroup_post_fork - called on a new task after adding it to the task list - * @child: the task in question - * - * Adds the task to the list running through its css_set if necessary and - * call the subsystem fork() callbacks. Has to be after the task is - * visible on the task list in case we race with the first call to - * cgroup_task_iter_start() - to guarantee that the new task ends up on its - * list. + * cgroup_post_fork - finalize cgroup setup for the child process + * @child: the child process + * + * Attach the child process to its css_set calling the subsystem fork() + * callbacks. */ -void cgroup_post_fork(struct task_struct *child) +void cgroup_post_fork(struct task_struct *child, + struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup_subsys *ss; struct css_set *cset; int i; + cset = kargs->cset; + kargs->cset = NULL; + spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { WARN_ON_ONCE(!list_empty(&child->cg_list)); - cset = task_css_set(current); /* current is @child's parent */ - get_css_set(cset); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); + } else { + put_css_set(cset); + cset = NULL; } /* @@ -5990,6 +6168,17 @@ void cgroup_post_fork(struct task_struct *child) do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); + + /* Make the new cset the root_cset of the new cgroup namespace. */ + if (kargs->flags & CLONE_NEWCGROUP) { + struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; + + get_css_set(cset); + child->nsproxy->cgroup_ns->root_cset = cset; + put_css_set(rcset); + } + + cgroup_css_set_put_fork(kargs); } /** @@ -6176,7 +6365,6 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); */ struct cgroup *cgroup_get_from_fd(int fd) { - struct cgroup_subsys_state *css; struct cgroup *cgrp; struct file *f; @@ -6184,17 +6372,8 @@ struct cgroup *cgroup_get_from_fd(int fd) if (!f) return ERR_PTR(-EBADF); - css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + cgrp = cgroup_get_from_file(f); fput(f); - if (IS_ERR(css)) - return ERR_CAST(css); - - cgrp = css->cgroup; - if (!cgroup_on_dfl(cgrp)) { - cgroup_put(cgrp); - return ERR_PTR(-EBADF); - } - return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58f5073acff7..729d3a5c772e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -358,8 +358,12 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); /* - * Cgroup v2 behavior is used when on default hierarchy or the - * cgroup_v2_mode flag is set. + * Cgroup v2 behavior is used on the "cpus" and "mems" control files when + * on default hierarchy or when the cpuset_v2_mode flag is set by mounting + * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. + * With v2 behavior, "cpus" and "mems" are always what the users have + * requested and won't be changed by hotplug events. Only the effective + * cpus or mems will be affected. */ static inline bool is_in_v2_mode(void) { diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 138059eb730d..511af87f685e 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -33,6 +33,7 @@ #include <linux/atomic.h> #include <linux/cgroup.h> #include <linux/slab.h> +#include <linux/sched/task.h> #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" @@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). */ -static int pids_can_fork(struct task_struct *task) +static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; int err; - css = task_css_check(current, pids_cgrp_id, true); + if (cset) + css = cset->subsys[pids_cgrp_id]; + else + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); err = pids_try_charge(pids, 1); if (err) { @@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task) return err; } -static void pids_cancel_fork(struct task_struct *task) +static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; - css = task_css_check(current, pids_cgrp_id, true); + if (cset) + css = cset->subsys[pids_cgrp_id]; + else + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); pids_uncharge(pids, 1); } diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 7fa0c4ae6394..8a44b93da0f3 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -6,7 +6,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set -CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore index 396d12eda9e8..df259542a236 100644 --- a/kernel/debug/kdb/.gitignore +++ b/kernel/debug/kdb/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only gen-kdb_cmds.c diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ba12e9f4661e..515379cbf209 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -399,6 +399,13 @@ int kdb_set(int argc, const char **argv) return KDB_ARGCOUNT; /* + * Censor sensitive variables + */ + if (strcmp(argv[1], "PROMPT") == 0 && + !kdb_check_flags(KDB_ENABLE_MEM_READ, kdb_cmd_enabled, false)) + return KDB_NOPERM; + + /* * Check for internal variables */ if (strcmp(argv[1], "KDBDEBUG") == 0) { @@ -1102,12 +1109,12 @@ static int handle_ctrl_cmd(char *cmd) case CTRL_P: if (cmdptr != cmd_tail) cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; - strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); return 1; case CTRL_N: if (cmdptr != cmd_head) cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT; - strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); return 1; } return 0; @@ -1298,12 +1305,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, *(cmd_hist[cmd_head]) = '\0'; do_full_getstr: -#if defined(CONFIG_SMP) + /* PROMPT can only be set if we have MEM_READ permission. */ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), raw_smp_processor_id()); -#else - snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT")); -#endif if (defcmd_in_progress) strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); @@ -1314,7 +1318,7 @@ do_full_getstr: if (*cmdbuf != '\n') { if (*cmdbuf < 32) { if (cmdptr == cmd_head) { - strncpy(cmd_hist[cmd_head], cmd_cur, + strscpy(cmd_hist[cmd_head], cmd_cur, CMD_BUFLEN); *(cmd_hist[cmd_head] + strlen(cmd_hist[cmd_head])-1) = '\0'; @@ -1324,7 +1328,7 @@ do_full_getstr: cmdbuf = cmd_cur; goto do_full_getstr; } else { - strncpy(cmd_hist[cmd_head], cmd_cur, + strscpy(cmd_hist[cmd_head], cmd_cur, CMD_BUFLEN); } diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 551b0eb7028a..2a0c4985f38e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -134,7 +134,7 @@ static void *__dma_alloc_from_coherent(struct device *dev, spin_lock_irqsave(&mem->spinlock, flags); - if (unlikely(size > (mem->size << PAGE_SHIFT))) + if (unlikely(size > ((dma_addr_t)mem->size << PAGE_SHIFT))) goto err; pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); @@ -144,8 +144,9 @@ static void *__dma_alloc_from_coherent(struct device *dev, /* * Memory was found in the coherent area. */ - *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT); - ret = mem->virt_base + (pageno << PAGE_SHIFT); + *dma_handle = dma_get_device_base(dev, mem) + + ((dma_addr_t)pageno << PAGE_SHIFT); + ret = mem->virt_base + ((dma_addr_t)pageno << PAGE_SHIFT); spin_unlock_irqrestore(&mem->spinlock, flags); memset(ret, 0, size); return ret; @@ -194,7 +195,7 @@ static int __dma_release_from_coherent(struct dma_coherent_mem *mem, int order, void *vaddr) { if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { + (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) { int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; unsigned long flags; @@ -238,10 +239,10 @@ static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) { if (mem && vaddr >= mem->virt_base && vaddr + size <= - (mem->virt_base + (mem->size << PAGE_SHIFT))) { + (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) { unsigned long off = vma->vm_pgoff; int start = (vaddr - mem->virt_base) >> PAGE_SHIFT; - int user_count = vma_pages(vma); + unsigned long user_count = vma_pages(vma); int count = PAGE_ALIGN(size) >> PAGE_SHIFT; *ret = -ENXIO; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 2031ed1ad7fa..9e1777c81f55 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -137,9 +137,12 @@ static const char *const maperr2str[] = { [MAP_ERR_CHECKED] = "dma map error checked", }; -static const char *type2name[5] = { "single", "page", - "scather-gather", "coherent", - "resource" }; +static const char *type2name[] = { + [dma_debug_single] = "single", + [dma_debug_sg] = "scather-gather", + [dma_debug_coherent] = "coherent", + [dma_debug_resource] = "resource", +}; static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ac7956c38f69..8f4bbdaf965e 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -39,7 +39,8 @@ static inline struct page *dma_direct_to_page(struct device *dev, u64 dma_direct_get_required_mask(struct device *dev) { - u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT; + u64 max_dma = phys_to_dma_direct(dev, phys); return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } @@ -157,11 +158,8 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); - if (!ret) { - dma_free_contiguous(dev, page, size); - return ret; - } - + if (!ret) + goto out_free_pages; memset(ret, 0, size); goto done; } @@ -174,8 +172,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. */ dev_info(dev, "Rejecting highmem page from CMA.\n"); - dma_free_contiguous(dev, page, size); - return NULL; + goto out_free_pages; } ret = page_address(page); @@ -184,10 +181,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, memset(ret, 0, size); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && dma_alloc_need_uncached(dev, attrs)) { arch_dma_prep_coherent(page, size); - ret = uncached_kernel_address(ret); + ret = arch_dma_set_uncached(ret, size); + if (IS_ERR(ret)) + goto out_free_pages; } done: if (force_dma_unencrypted(dev)) @@ -195,6 +194,9 @@ done: else *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; +out_free_pages: + dma_free_contiguous(dev, page, size); + return NULL; } void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, @@ -218,6 +220,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) vunmap(cpu_addr); + else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) + arch_dma_clear_uncached(cpu_addr, size); dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } @@ -225,7 +229,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); @@ -235,7 +239,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); diff --git a/kernel/events/core.c b/kernel/events/core.c index e1459df73043..bc9b98a9af9a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -28,6 +28,7 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> +#include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> @@ -387,6 +388,7 @@ static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; +static atomic_t nr_cgroup_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -981,16 +983,10 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) event->shadow_ctx_time = now - t->timestamp; } -/* - * Update cpuctx->cgrp so that it is set when first cgroup event is added and - * cleared when last cgroup event is removed. - */ static inline void -list_update_cgroup_event(struct perf_event *event, - struct perf_event_context *ctx, bool add) +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; - struct list_head *cpuctx_entry; if (!is_cgroup_event(event)) return; @@ -1007,28 +1003,41 @@ list_update_cgroup_event(struct perf_event *event, * because if the first would mismatch, the second would not try again * and we would leave cpuctx->cgrp unset. */ - if (add && !cpuctx->cgrp) { + if (ctx->is_active && !cpuctx->cgrp) { struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) cpuctx->cgrp = cgrp; } - if (add && ctx->nr_cgroups++) + if (ctx->nr_cgroups++) return; - else if (!add && --ctx->nr_cgroups) + + list_add(&cpuctx->cgrp_cpuctx_entry, + per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); +} + +static inline void +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) return; - /* no cgroup running */ - if (!add) + /* + * Because cgroup events are always per-cpu events, + * @ctx == &cpuctx->ctx. + */ + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); + + if (--ctx->nr_cgroups) + return; + + if (ctx->is_active && cpuctx->cgrp) cpuctx->cgrp = NULL; - cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; - if (add) - list_add(cpuctx_entry, - per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); - else - list_del(cpuctx_entry); + list_del(&cpuctx->cgrp_cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -1094,11 +1103,14 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) } static inline void -list_update_cgroup_event(struct perf_event *event, - struct perf_event_context *ctx, bool add) +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } +static inline void +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) +{ +} #endif /* @@ -1789,13 +1801,14 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) add_event_to_groups(event, ctx); } - list_update_cgroup_event(event, ctx, true); - list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_enable(event, ctx); + ctx->generation++; } @@ -1861,6 +1874,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); + if (sample_type & PERF_SAMPLE_CGROUP) + size += sizeof(data->cgroup); + event->header_size = size; } @@ -1971,8 +1987,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - list_update_cgroup_event(event, ctx, false); - ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -1989,8 +2003,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * of error state is by explicit re-enabling * of the event */ - if (event->state > PERF_EVENT_STATE_OFF) + if (event->state > PERF_EVENT_STATE_OFF) { + perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); + } ctx->generation++; } @@ -2221,6 +2237,7 @@ event_sched_out(struct perf_event *event, if (READ_ONCE(event->pending_disable) >= 0) { WRITE_ONCE(event->pending_disable, -1); + perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); @@ -2358,6 +2375,7 @@ static void __perf_event_disable(struct perf_event *event, event_sched_out(event, cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); + perf_cgroup_event_disable(event, ctx); } /* @@ -2741,7 +2759,7 @@ static int __perf_install_in_context(void *info) } #ifdef CONFIG_CGROUP_PERF - if (is_cgroup_event(event)) { + if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. @@ -2901,6 +2919,7 @@ static void __perf_event_enable(struct perf_event *event, ctx_sched_out(ctx, cpuctx, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); + perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; @@ -3503,7 +3522,8 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, static bool perf_less_group_idx(const void *l, const void *r) { - const struct perf_event *le = l, *re = r; + const struct perf_event *le = *(const struct perf_event **)l; + const struct perf_event *re = *(const struct perf_event **)r; return le->group_index < re->group_index; } @@ -3611,8 +3631,10 @@ static int merge_sched_in(struct perf_event *event, void *data) } if (event->state == PERF_EVENT_STATE_INACTIVE) { - if (event->attr.pinned) + if (event->attr.pinned) { + perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + } *can_add_hw = 0; ctx->rotate_necessary = 1; @@ -4608,6 +4630,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); + if (event->attr.cgroup) + atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) @@ -6864,6 +6888,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_CGROUP) + perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -6907,9 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe __get_user_pages_fast first. * If failed, leave phys_addr as 0. */ - if ((current->mm != NULL) && - (__get_user_pages_fast(virt, 1, 0, &p) == 1)) - phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + if (current->mm != NULL) { + pagefault_disable(); + if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + pagefault_enable(); + } if (p) put_page(p); @@ -7063,6 +7093,16 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); +#ifdef CONFIG_CGROUP_PERF + if (sample_type & PERF_SAMPLE_CGROUP) { + struct cgroup *cgrp; + + /* protected by RCU */ + cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; + data->cgroup = cgroup_id(cgrp); + } +#endif + if (sample_type & PERF_SAMPLE_AUX) { u64 size; @@ -7736,6 +7776,105 @@ void perf_event_namespaces(struct task_struct *task) } /* + * cgroup tracking + */ +#ifdef CONFIG_CGROUP_PERF + +struct perf_cgroup_event { + char *path; + int path_size; + struct { + struct perf_event_header header; + u64 id; + char path[]; + } event_id; +}; + +static int perf_event_cgroup_match(struct perf_event *event) +{ + return event->attr.cgroup; +} + +static void perf_event_cgroup_output(struct perf_event *event, void *data) +{ + struct perf_cgroup_event *cgroup_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u16 header_size = cgroup_event->event_id.header.size; + int ret; + + if (!perf_event_cgroup_match(event)) + return; + + perf_event_header__init_id(&cgroup_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + cgroup_event->event_id.header.size); + if (ret) + goto out; + + perf_output_put(&handle, cgroup_event->event_id); + __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + cgroup_event->event_id.header.size = header_size; +} + +static void perf_event_cgroup(struct cgroup *cgrp) +{ + struct perf_cgroup_event cgroup_event; + char path_enomem[16] = "//enomem"; + char *pathname; + size_t size; + + if (!atomic_read(&nr_cgroup_events)) + return; + + cgroup_event = (struct perf_cgroup_event){ + .event_id = { + .header = { + .type = PERF_RECORD_CGROUP, + .misc = 0, + .size = sizeof(cgroup_event.event_id), + }, + .id = cgroup_id(cgrp), + }, + }; + + pathname = kmalloc(PATH_MAX, GFP_KERNEL); + if (pathname == NULL) { + cgroup_event.path = path_enomem; + } else { + /* just to be sure to have enough space for alignment */ + cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); + cgroup_event.path = pathname; + } + + /* + * Since our buffer works in 8 byte units we need to align our string + * size to a multiple of 8. However, we must guarantee the tail end is + * zero'd out to avoid leaking random bits to userspace. + */ + size = strlen(cgroup_event.path) + 1; + while (!IS_ALIGNED(size, sizeof(u64))) + cgroup_event.path[size++] = '\0'; + + cgroup_event.event_id.header.size += size; + cgroup_event.path_size = size; + + perf_iterate_sb(perf_event_cgroup_output, + &cgroup_event, + NULL); + + kfree(pathname); +} + +#endif + +/* * mmap tracking */ @@ -7855,7 +7994,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_EXECUTABLE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - if (vma->vm_flags & VM_HUGETLB) + if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { @@ -10778,6 +10917,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); + if (event->attr.cgroup) + atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) @@ -11157,6 +11298,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); + +#ifndef CONFIG_CGROUP_PERF + if (attr->sample_type & PERF_SAMPLE_CGROUP) + return -EINVAL; +#endif + out: return ret; @@ -12754,6 +12901,12 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) kfree(jc); } +static int perf_cgroup_css_online(struct cgroup_subsys_state *css) +{ + perf_event_cgroup(css->cgroup); + return 0; +} + static int __perf_cgroup_move(void *info) { struct task_struct *task = info; @@ -12775,6 +12928,7 @@ static void perf_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, + .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can diff --git a/kernel/extable.c b/kernel/extable.c index 7681f87e89dd..b0ea5eb0c3b4 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -34,7 +34,8 @@ u32 __initdata __visible main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { + if (main_extable_sort_needed && + &__stop___ex_table > &__start___ex_table) { pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); } diff --git a/kernel/fork.c b/kernel/fork.c index ba122d6f5127..8c700f881d92 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -361,6 +361,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) if (new) { *new = *orig; INIT_LIST_HEAD(&new->anon_vma_chain); + new->vm_next = new->vm_prev = NULL; } return new; } @@ -553,14 +554,15 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. */ + /* + * VM_WIPEONFORK gets a clean slate in the child. + * Don't prepare anon_vma until fault since we don't + * copy page for current vma. + */ tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -2176,16 +2178,15 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - cgroup_threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p); + retval = cgroup_can_fork(p, args); if (retval) - goto bad_fork_cgroup_threadgroup_change_end; + goto bad_fork_put_pidfd; /* * From this point on we must avoid any synchronous user-space @@ -2290,8 +2291,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p); - cgroup_threadgroup_change_end(current); + cgroup_post_fork(p, args); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2302,9 +2302,7 @@ static __latent_entropy struct task_struct *copy_process( bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - cgroup_cancel_fork(p); -bad_fork_cgroup_threadgroup_change_end: - cgroup_threadgroup_change_end(current); + cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); @@ -2607,6 +2605,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args args; pid_t *kset_tid = kargs->set_tid; + BUILD_BUG_ON(offsetofend(struct clone_args, tls) != + CLONE_ARGS_SIZE_VER0); + BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != + CLONE_ARGS_SIZE_VER1); + BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != + CLONE_ARGS_SIZE_VER2); + BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); + if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) @@ -2633,6 +2639,10 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, !valid_signal(args.exit_signal))) return -EINVAL; + if ((args.flags & CLONE_INTO_CGROUP) && + (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) + return -EINVAL; + *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), @@ -2643,6 +2653,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack_size = args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, + .cgroup = args.cgroup, }; if (args.set_tid && @@ -2686,7 +2697,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ - if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) + if (kargs->flags & + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) return false; /* diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index e5eb5ea7ea59..82babf5aa077 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -58,7 +58,7 @@ struct gcov_node { struct dentry *dentry; struct dentry **links; int num_loaded; - char name[0]; + char name[]; }; static const char objtree[] = OBJTREE; @@ -108,9 +108,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos) { struct gcov_iterator *iter = data; + (*pos)++; if (gcov_iter_next(iter)) return NULL; - (*pos)++; return iter; } diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 801ee4b0b969..acb83558e5df 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -38,7 +38,7 @@ static struct gcov_info *gcov_info_head; struct gcov_fn_info { unsigned int ident; unsigned int checksum; - unsigned int n_ctrs[0]; + unsigned int n_ctrs[]; }; /** @@ -78,7 +78,7 @@ struct gcov_info { unsigned int n_functions; const struct gcov_fn_info *functions; unsigned int ctr_mask; - struct gcov_ctr_info counts[0]; + struct gcov_ctr_info counts[]; }; /** @@ -352,7 +352,7 @@ struct gcov_iterator { unsigned int count; int num_types; - struct type_info type_info[0]; + struct type_info type_info[]; }; static struct gcov_fn_info *get_func(struct gcov_iterator *iter) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ec37563674d6..908fdf5098c3 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -68,7 +68,7 @@ struct gcov_fn_info { unsigned int ident; unsigned int lineno_checksum; unsigned int cfg_checksum; - struct gcov_ctr_info ctrs[0]; + struct gcov_ctr_info ctrs[]; }; /** diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fe40c658f86f..453a8a0f4804 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1690,34 +1690,6 @@ out_mput: return ret; } -/** - * setup_irq - setup an interrupt - * @irq: Interrupt line to setup - * @act: irqaction for the interrupt - * - * Used to statically setup interrupts in the early boot process. - */ -int setup_irq(unsigned int irq, struct irqaction *act) -{ - int retval; - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) - return -EINVAL; - - retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) - return retval; - - retval = __setup_irq(irq, desc, act); - - if (retval) - irq_chip_pm_put(&desc->irq_data); - - return retval; -} -EXPORT_SYMBOL_GPL(setup_irq); - /* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. @@ -1859,22 +1831,6 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) } /** - * remove_irq - free an interrupt - * @irq: Interrupt line to free - * @act: irqaction for the interrupt - * - * Used to remove interrupts statically setup by the early boot process. - */ -void remove_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) - __free_irq(desc, act->dev_id); -} -EXPORT_SYMBOL_GPL(remove_irq); - -/** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9b3f660dee7..16c8c605f4b0 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -175,7 +175,6 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), @@ -194,7 +193,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, } return module_kallsyms_on_each_symbol(fn, data); } -EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, diff --git a/kernel/kmod.c b/kernel/kmod.c index bc6addd9152b..37c3c4b97b8e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -35,7 +35,7 @@ * (u64) THREAD_SIZE * 8UL); * * If you need less than 50 threads would mean we're dealing with systems - * smaller than 3200 pages. This assuems you are capable of having ~13M memory, + * smaller than 3200 pages. This assumes you are capable of having ~13M memory, * and this would only be an be an upper limit, after which the OOM killer * would take effect. Systems like these are very unlikely if modules are * enabled. @@ -120,7 +120,7 @@ out: * invoke it. * * If module auto-loading support is disabled then this function - * becomes a no-operation. + * simply returns -ENOENT. */ int __request_module(bool wait, const char *fmt, ...) { @@ -137,7 +137,7 @@ int __request_module(bool wait, const char *fmt, ...) WARN_ON_ONCE(wait && current_is_async()); if (!modprobe_path[0]) - return 0; + return -ENOENT; va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1511690e4de7..ac10db66cc63 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3952,10 +3952,36 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +static inline short task_wait_context(struct task_struct *curr) +{ + /* + * Set appropriate wait type for the context; for IRQs we have to take + * into account force_irqthread as that is implied by PREEMPT_RT. + */ + if (curr->hardirq_context) { + /* + * Check if force_irqthreads will run us threaded. + */ + if (curr->hardirq_threaded || curr->irq_config) + return LD_WAIT_CONFIG; + + return LD_WAIT_SPIN; + } else if (curr->softirq_context) { + /* + * Softirqs are always threaded. + */ + return LD_WAIT_CONFIG; + } + + return LD_WAIT_MAX; +} + static int print_lock_invalid_wait_context(struct task_struct *curr, struct held_lock *hlock) { + short curr_inner; + if (!debug_locks_off()) return 0; if (debug_locks_silent) @@ -3971,6 +3997,10 @@ print_lock_invalid_wait_context(struct task_struct *curr, print_lock(hlock); pr_warn("other info that might help us debug this:\n"); + + curr_inner = task_wait_context(curr); + pr_warn("context-{%d:%d}\n", curr_inner, curr_inner); + lockdep_print_held_locks(curr); pr_warn("stack backtrace:\n"); @@ -4017,26 +4047,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) } depth++; - /* - * Set appropriate wait type for the context; for IRQs we have to take - * into account force_irqthread as that is implied by PREEMPT_RT. - */ - if (curr->hardirq_context) { - /* - * Check if force_irqthreads will run us threaded. - */ - if (curr->hardirq_threaded || curr->irq_config) - curr_inner = LD_WAIT_CONFIG; - else - curr_inner = LD_WAIT_SPIN; - } else if (curr->softirq_context) { - /* - * Softirqs are always threaded. - */ - curr_inner = LD_WAIT_CONFIG; - } else { - curr_inner = LD_WAIT_MAX; - } + curr_inner = task_wait_context(curr); for (; depth < curr->lockdep_depth; depth++) { struct held_lock *prev = curr->held_locks + depth; diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index a008a1ba21a7..8bbafe3e5203 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -118,14 +118,15 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int wake_flags, void *key) { - struct task_struct *p = get_task_struct(wq_entry->private); bool reader = wq_entry->flags & WQ_FLAG_CUSTOM; struct percpu_rw_semaphore *sem = key; + struct task_struct *p; /* concurrent against percpu_down_write(), can get stolen */ if (!__percpu_rwsem_trylock(sem, reader)) return 1; + p = get_task_struct(wq_entry->private); list_del_init(&wq_entry->entry); smp_store_release(&wq_entry->private, NULL); diff --git a/kernel/module.c b/kernel/module.c index 33569a01d6e1..646f1e2330d2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1515,7 +1515,7 @@ struct module_sect_attr { struct module_sect_attrs { struct attribute_group grp; unsigned int nsections; - struct module_sect_attr attrs[0]; + struct module_sect_attr attrs[]; }; static ssize_t module_sect_show(struct module_attribute *mattr, @@ -1608,7 +1608,7 @@ static void remove_sect_attrs(struct module *mod) struct module_notes_attrs { struct kobject *dir; unsigned int notes; - struct bin_attribute attrs[0]; + struct bin_attribute attrs[]; }; static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, @@ -4355,6 +4355,7 @@ static int modules_open(struct inode *inode, struct file *file) } static const struct proc_ops modules_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = modules_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/kernel/pid.c b/kernel/pid.c index bc21c0fb26d8..c835b844aca7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -256,6 +256,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, get_pid_ns(ns); refcount_set(&pid->count, 1); + spin_lock_init(&pid->lock); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 7cbfbeacd68a..c208566c844b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -80,9 +80,6 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.rst>. -config ARCH_SAVE_PAGE_KEYS - bool - config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6dbeedb7354c..86aba8706b16 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -678,7 +678,7 @@ static int load_image_and_restore(void) error = swsusp_read(&flags); swsusp_close(FMODE_READ); if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); + error = hibernation_restore(flags & SF_PLATFORM_MODE); pr_err("Failed to load image, recovering.\n"); swsusp_free(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 69b7a8aeca3b..40f86ec4ab30 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -535,6 +535,13 @@ static ssize_t pm_debug_messages_store(struct kobject *kobj, power_attr(pm_debug_messages); +static int __init pm_debug_messages_setup(char *str) +{ + pm_debug_messages_on = true; + return 1; +} +__setup("pm_debug_messages", pm_debug_messages_setup); + /** * __pm_pr_dbg - Print a suspend debug message to the kernel log. * @defer: Whether or not to use printk_deferred() to print the message. diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d82b7b88d616..659800157b17 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1744,9 +1744,6 @@ int hibernate_preallocate_memory(void) count += highmem; count -= totalreserve_pages; - /* Add number of pages required for page keys (s390 only). */ - size += page_key_additional_pages(saveable); - /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); @@ -2075,8 +2072,6 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; - /* Save page key for data page (s390 only). */ - page_key_read(buf + j); } } @@ -2226,9 +2221,6 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - /* Extract and buffer page key for data page (s390 only). */ - page_key_memorize(buf + j); - if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) memory_bm_set_bit(bm, buf[j]); else @@ -2623,11 +2615,6 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; - /* Allocate buffer for page keys. */ - error = page_key_alloc(nr_copy_pages); - if (error) - return error; - hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { error = unpack_orig_pfns(buffer, ©_bm); @@ -2649,8 +2636,6 @@ int snapshot_write_next(struct snapshot_handle *handle) } } else { copy_last_highmem_page(); - /* Restore page key for data page (s390 only). */ - page_key_write(handle->buffer); hibernate_restore_protect_page(handle->buffer); handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) @@ -2673,9 +2658,6 @@ int snapshot_write_next(struct snapshot_handle *handle) void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); - /* Restore page key for data page (s390 only). */ - page_key_write(handle->buffer); - page_key_free(); hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { diff --git a/kernel/power/user.c b/kernel/power/user.c index 58ed9478787f..7959449765d9 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -27,8 +27,6 @@ #include "power.h" -#define SNAPSHOT_MINOR 231 - static struct snapshot_data { struct snapshot_handle handle; int swap; @@ -198,6 +196,50 @@ unlock: return res; } +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; + +static int snapshot_set_swap_area(struct snapshot_data *data, + void __user *argp) +{ + sector_t offset; + dev_t swdev; + + if (swsusp_swap_in_use()) + return -EPERM; + + if (in_compat_syscall()) { + struct compat_resume_swap_area swap_area; + + if (copy_from_user(&swap_area, argp, sizeof(swap_area))) + return -EFAULT; + swdev = new_decode_dev(swap_area.dev); + offset = swap_area.offset; + } else { + struct resume_swap_area swap_area; + + if (copy_from_user(&swap_area, argp, sizeof(swap_area))) + return -EFAULT; + swdev = new_decode_dev(swap_area.dev); + offset = swap_area.offset; + } + + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + if (!swdev) { + data->swap = -1; + return -EINVAL; + } + data->swap = swap_type_of(swdev, offset, NULL); + if (data->swap < 0) + return -ENODEV; + return 0; +} + static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -353,34 +395,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_SET_SWAP_AREA: - if (swsusp_swap_in_use()) { - error = -EPERM; - } else { - struct resume_swap_area swap_area; - dev_t swdev; - - error = copy_from_user(&swap_area, (void __user *)arg, - sizeof(struct resume_swap_area)); - if (error) { - error = -EFAULT; - break; - } - - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - swdev = new_decode_dev(swap_area.dev); - if (swdev) { - offset = swap_area.offset; - data->swap = swap_type_of(swdev, offset, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } + error = snapshot_set_swap_area(data, (void __user *)arg); break; default: @@ -395,12 +410,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } #ifdef CONFIG_COMPAT - -struct compat_resume_swap_area { - compat_loff_t offset; - u32 dev; -} __packed; - static long snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -411,33 +420,13 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case SNAPSHOT_AVAIL_SWAP_SIZE: case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_CREATE_IMAGE: + case SNAPSHOT_SET_SWAP_AREA: return snapshot_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); - - case SNAPSHOT_SET_SWAP_AREA: { - struct compat_resume_swap_area __user *u_swap_area = - compat_ptr(arg); - struct resume_swap_area swap_area; - mm_segment_t old_fs; - int err; - - err = get_user(swap_area.offset, &u_swap_area->offset); - err |= get_user(swap_area.dev, &u_swap_area->dev); - if (err) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, - (unsigned long) &swap_area); - set_fs(old_fs); - return err; - } - default: return snapshot_ioctl(file, cmd, arg); } } - #endif /* CONFIG_COMPAT */ static const struct file_operations snapshot_fops = { diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index c8e6ab689d42..b2b0f526f249 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -23,6 +23,9 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); +void printk_safe_init(void); +bool printk_percpu_data_ready(void); + #define printk_safe_enter_irqsave(flags) \ do { \ local_irq_save(flags); \ @@ -64,4 +67,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } #define printk_safe_enter_irq() local_irq_disable() #define printk_safe_exit_irq() local_irq_enable() +static inline void printk_safe_init(void) { } +static inline bool printk_percpu_data_ready(void) { return false; } #endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 633f41a11d75..9a9b6156270b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -460,6 +460,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* + * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before + * per_cpu_areas are initialised. This variable is set to true when + * it's safe to access per-CPU data. + */ +static bool __printk_percpu_data_ready __read_mostly; + +bool printk_percpu_data_ready(void) +{ + return __printk_percpu_data_ready; +} + /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -1146,12 +1158,28 @@ static void __init log_buf_add_cpu(void) static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ +static void __init set_percpu_data_ready(void) +{ + printk_safe_init(); + /* Make sure we set this flag only after printk_safe() init is done */ + barrier(); + __printk_percpu_data_ready = true; +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; unsigned int free; + /* + * Some archs call setup_log_buf() multiple times - first is very + * early, e.g. from setup_arch(), and second - when percpu_areas + * are initialised. + */ + if (!early) + set_percpu_data_ready(); + if (log_buf != __log_buf) return; @@ -2975,6 +3003,9 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { void wake_up_klogd(void) { + if (!printk_percpu_data_ready()) + return; + preempt_disable(); if (waitqueue_active(&log_wait)) { this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); @@ -2985,6 +3016,9 @@ void wake_up_klogd(void) void defer_console_output(void) { + if (!printk_percpu_data_ready()) + return; + preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index b4045e782743..d9a659a686f3 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -27,7 +27,6 @@ * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked. */ -static int printk_safe_irq_ready __read_mostly; #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ sizeof(atomic_t) - \ @@ -51,7 +50,7 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); /* Get flushed in a more safe context. */ static void queue_flush_work(struct printk_safe_seq_buf *s) { - if (printk_safe_irq_ready) + if (printk_percpu_data_ready()) irq_work_queue(&s->work); } @@ -402,14 +401,6 @@ void __init printk_safe_init(void) #endif } - /* - * In the highly unlikely event that a NMI were to trigger at - * this moment. Make sure IRQ work is set up before this - * variable is set. - */ - barrier(); - printk_safe_irq_ready = 1; - /* Flush pending messages that did not have scheduled IRQ works. */ printk_safe_flush(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06548e2ebb72..d9a49cd6065a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -825,7 +825,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq) rcu_cleanup_after_idle(); incby = 1; - } else if (tick_nohz_full_cpu(rdp->cpu) && + } else if (irq && tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !READ_ONCE(rdp->rcu_forced_tick)) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a2694ba82874..3a61a3b8eaa9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2119,12 +2119,6 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) return cpu; } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -4126,7 +4120,8 @@ static inline void sched_submit_work(struct task_struct *tsk) * it wants to wake up a task to maintain concurrency. * As this function is called inside the schedule() context, * we disable preemption to avoid it calling schedule() again - * in the possible wakeup of a kworker. + * in the possible wakeup of a kworker and because wq_worker_sleeping() + * requires it. */ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); @@ -6699,7 +6694,6 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON - rq->last_load_update_tick = jiffies; rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index dac9104d126f..ff9435dee1df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1003,12 +1003,12 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat, enum cpu_usage_stat usage, int cpu) { u64 *cpustat = kcpustat->cpustat; + u64 val = cpustat[usage]; struct rq *rq; - u64 val; int err; if (!vtime_accounting_enabled_cpu(cpu)) - return cpustat[usage]; + return val; rq = cpu_rq(cpu); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8331bc04aea2..a562df57a86e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -816,10 +816,12 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); -#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) +#define __P(F) __PS(#F, F) +#define P(F) __PS(#F, p->F) +#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) +#define __PN(F) __PSN(#F, F) +#define PN(F) __PSN(#F, p->F) #ifdef CONFIG_NUMA_BALANCING @@ -868,18 +870,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, SEQ_printf(m, "---------------------------------------------------------" "----------\n"); -#define __P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define P_SCHEDSTAT(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) -#define __PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) -#define PN_SCHEDSTAT(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) + +#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F)) +#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F)) PN(se.exec_start); PN(se.vruntime); @@ -939,10 +932,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, } __P(nr_switches); - SEQ_printf(m, "%-45s:%21Ld\n", - "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-45s:%21Ld\n", - "nr_involuntary_switches", (long long)p->nivcsw); + __PS("nr_voluntary_switches", p->nvcsw); + __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); #ifdef CONFIG_SMP @@ -956,6 +947,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_est.ewma); P(se.avg.util_est.enqueued); #endif +#ifdef CONFIG_UCLAMP_TASK + __PS("uclamp.min", p->uclamp[UCLAMP_MIN].value); + __PS("uclamp.max", p->uclamp[UCLAMP_MAX].value); + __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); + __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); +#endif P(policy); P(prio); if (task_has_dl_policy(p)) { @@ -963,11 +960,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(dl.deadline); } #undef PN_SCHEDSTAT -#undef PN -#undef __PN #undef P_SCHEDSTAT -#undef P -#undef __P { unsigned int this_cpu = raw_smp_processor_id(); @@ -975,8 +968,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, t0 = cpu_clock(this_cpu); t1 = cpu_clock(this_cpu); - SEQ_printf(m, "%-45s:%21Ld\n", - "clock-delta", (long long)(t1-t0)); + __PS("clock-delta", t1-t0); } sched_show_numa(p, m); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7fb20adabeb..02f323b85b6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2799,7 +2799,7 @@ static void task_numa_work(struct callback_head *work) * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + if (!vma_is_accessible(vma)) continue; do { @@ -4836,11 +4836,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) +static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { struct cfs_rq *cfs_rq; - u64 runtime; - u64 starting_runtime = remaining; + u64 runtime, remaining = 1; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -4855,10 +4854,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) /* By the above check, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > remaining) - runtime = remaining; - remaining -= runtime; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + remaining = cfs_b->runtime; + raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += runtime; @@ -4873,8 +4875,6 @@ next: break; } rcu_read_unlock(); - - return starting_runtime - remaining; } /* @@ -4885,7 +4885,6 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { - u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4914,24 +4913,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u cfs_b->nr_throttled += overrun; /* - * This check is repeated as we are holding onto the new bandwidth while - * we unthrottle. This can potentially race with an unthrottled group - * trying to acquire new bandwidth from the global pool. This can result - * in us over-using our runtime if it is all used during this loop, but - * only by limited amounts in that extreme case. + * This check is repeated as we release cfs_b->lock while we unthrottle. */ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - runtime = cfs_b->runtime; cfs_b->distribute_running = 1; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - - lsub_positive(&cfs_b->runtime, runtime); } /* @@ -5065,10 +5057,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -6080,8 +6071,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; - u64 time, cost; - s64 delta; + u64 time; int this = smp_processor_id(); int cpu, nr = INT_MAX; @@ -6119,9 +6109,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } time = cpu_clock(this) - time; - cost = this_sd->avg_scan_cost; - delta = (s64)(time - cost) / 8; - this_sd->avg_scan_cost += delta; + update_avg(&this_sd->avg_scan_cost, time); return cpu; } @@ -9048,6 +9036,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / sds->total_capacity; + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) { + env->imbalance = 0; + return; + } } /* diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 008d6ac2342b..808244f3ddd9 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -149,6 +149,9 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { unsigned int flags = 0; + bool illegal = false; + char *par; + int len; while (isalpha(*str)) { if (!strncmp(str, "nohz,", 5)) { @@ -169,8 +172,22 @@ static int __init housekeeping_isolcpus_setup(char *str) continue; } - pr_warn("isolcpus: Error, unknown flag\n"); - return 0; + /* + * Skip unknown sub-parameter and validate that it is not + * containing an invalid character. + */ + for (par = str, len = 0; *str && *str != ','; str++, len++) { + if (!isalpha(*str) && *str != '_') + illegal = true; + } + + if (illegal) { + pr_warn("isolcpus: Invalid flag %.*s\n", len, par); + return 0; + } + + pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par); + str++; } /* Default behaviour for isolcpus without flags */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0f616bf7bce3..db3a57675ccf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -195,6 +195,12 @@ static inline int task_has_dl_policy(struct task_struct *p) #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +static inline void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff / 8; +} + /* * !! For sched_setattr_nocheck() (kernel) only !! * @@ -882,7 +888,6 @@ struct rq { #endif #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP - unsigned long last_load_update_tick; unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; #endif /* CONFIG_SMP */ diff --git a/kernel/signal.c b/kernel/signal.c index e58a6c619824..713104884414 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1510,15 +1510,15 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, unsigned long flags; int ret = -EINVAL; + if (!valid_signal(sig)) + return ret; + clear_siginfo(&info); info.si_signo = sig; info.si_errno = errno; info.si_code = SI_ASYNCIO; *((sigval_t *)&info.si_pid) = addr; - if (!valid_signal(sig)) - return ret; - rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { @@ -1557,12 +1557,8 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) { int ret; - if (pid > 0) { - rcu_read_lock(); - ret = kill_pid_info(sig, info, find_vpid(pid)); - rcu_read_unlock(); - return ret; - } + if (pid > 0) + return kill_proc_info(sig, info, pid); /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ if (pid == INT_MIN) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d0a5ba37aff4..d89da1c7e005 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1480,6 +1480,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); + bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); @@ -1514,11 +1515,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); - lockdep_hrtimer_enter(timer); + expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); - lockdep_hrtimer_exit(timer); + lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index e6ba064ce773..53bce347cd50 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -338,7 +338,20 @@ static struct user_namespace *timens_owner(struct ns_common *ns) static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) { - seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); + char *clock; + + switch (clockid) { + case CLOCK_BOOTTIME: + clock = "boottime"; + break; + case CLOCK_MONOTONIC: + clock = "monotonic"; + break; + default: + clock = "unknown"; + break; + } + seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) @@ -447,6 +460,7 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", + .real_ns_name = "time", .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fd81c7de77a7..041694a1eb74 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -102,7 +102,7 @@ static bool ftrace_pids_enabled(struct ftrace_ops *ops) tr = ops->private; - return tr->function_pids != NULL; + return tr->function_pids != NULL || tr->function_no_pids != NULL; } static void ftrace_update_trampoline(struct ftrace_ops *ops); @@ -139,13 +139,23 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } +#define FTRACE_PID_IGNORE -1 +#define FTRACE_PID_TRACE -2 + static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { struct trace_array *tr = op->private; + int pid; - if (tr && this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid)) - return; + if (tr) { + pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); + if (pid == FTRACE_PID_IGNORE) + return; + if (pid != FTRACE_PID_TRACE && + pid != current->pid) + return; + } op->saved_func(ip, parent_ip, op, regs); } @@ -6923,11 +6933,17 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, { struct trace_array *tr = data; struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; pid_list = rcu_dereference_sched(tr->function_pids); + no_pid_list = rcu_dereference_sched(tr->function_no_pids); - this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, - trace_ignore_this_task(pid_list, next)); + if (trace_ignore_this_task(pid_list, no_pid_list, next)) + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + FTRACE_PID_IGNORE); + else + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + next->pid); } static void @@ -6940,6 +6956,9 @@ ftrace_pid_follow_sched_process_fork(void *data, pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, self, task); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + trace_filter_add_remove_task(pid_list, self, task); } static void @@ -6950,6 +6969,9 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, NULL, task); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + trace_filter_add_remove_task(pid_list, NULL, task); } void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) @@ -6967,42 +6989,57 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) } } -static void clear_ftrace_pids(struct trace_array *tr) +static void clear_ftrace_pids(struct trace_array *tr, int type) { struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; int cpu; pid_list = rcu_dereference_protected(tr->function_pids, lockdep_is_held(&ftrace_lock)); - if (!pid_list) + no_pid_list = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + + /* Make sure there's something to do */ + if (!pid_type_enabled(type, pid_list, no_pid_list)) return; - unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + /* See if the pids still need to be checked after this */ + if (!still_need_pid_events(type, pid_list, no_pid_list)) { + unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = FTRACE_PID_TRACE; + } - for_each_possible_cpu(cpu) - per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = false; + if (type & TRACE_PIDS) + rcu_assign_pointer(tr->function_pids, NULL); - rcu_assign_pointer(tr->function_pids, NULL); + if (type & TRACE_NO_PIDS) + rcu_assign_pointer(tr->function_no_pids, NULL); /* Wait till all users are no longer using pid filtering */ synchronize_rcu(); - trace_free_pid_list(pid_list); + if ((type & TRACE_PIDS) && pid_list) + trace_free_pid_list(pid_list); + + if ((type & TRACE_NO_PIDS) && no_pid_list) + trace_free_pid_list(no_pid_list); } void ftrace_clear_pids(struct trace_array *tr) { mutex_lock(&ftrace_lock); - clear_ftrace_pids(tr); + clear_ftrace_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); mutex_unlock(&ftrace_lock); } -static void ftrace_pid_reset(struct trace_array *tr) +static void ftrace_pid_reset(struct trace_array *tr, int type) { mutex_lock(&ftrace_lock); - clear_ftrace_pids(tr); + clear_ftrace_pids(tr, type); ftrace_update_pid_func(); ftrace_startup_all(0); @@ -7066,9 +7103,45 @@ static const struct seq_operations ftrace_pid_sops = { .show = fpid_show, }; -static int -ftrace_pid_open(struct inode *inode, struct file *file) +static void *fnpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + + if (!pid_list) + return !(*pos) ? FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); +} + +static void *fnpid_next(struct seq_file *m, void *v, loff_t *pos) { + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_no_pids); + + if (v == FTRACE_NO_PIDS) { + (*pos)++; + return NULL; + } + return trace_pid_next(pid_list, v, pos); +} + +static const struct seq_operations ftrace_no_pid_sops = { + .start = fnpid_start, + .next = fnpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int pid_open(struct inode *inode, struct file *file, int type) +{ + const struct seq_operations *seq_ops; struct trace_array *tr = inode->i_private; struct seq_file *m; int ret = 0; @@ -7079,9 +7152,18 @@ ftrace_pid_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_pid_reset(tr); + ftrace_pid_reset(tr, type); + + switch (type) { + case TRACE_PIDS: + seq_ops = &ftrace_pid_sops; + break; + case TRACE_NO_PIDS: + seq_ops = &ftrace_no_pid_sops; + break; + } - ret = seq_open(file, &ftrace_pid_sops); + ret = seq_open(file, seq_ops); if (ret < 0) { trace_array_put(tr); } else { @@ -7093,10 +7175,23 @@ ftrace_pid_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + return pid_open(inode, file, TRACE_PIDS); +} + +static int +ftrace_no_pid_open(struct inode *inode, struct file *file) +{ + return pid_open(inode, file, TRACE_NO_PIDS); +} + static void ignore_task_cpu(void *data) { struct trace_array *tr = data; struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; /* * This function is called by on_each_cpu() while the @@ -7104,18 +7199,25 @@ static void ignore_task_cpu(void *data) */ pid_list = rcu_dereference_protected(tr->function_pids, mutex_is_locked(&ftrace_lock)); + no_pid_list = rcu_dereference_protected(tr->function_no_pids, + mutex_is_locked(&ftrace_lock)); - this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, - trace_ignore_this_task(pid_list, current)); + if (trace_ignore_this_task(pid_list, no_pid_list, current)) + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + FTRACE_PID_IGNORE); + else + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + current->pid); } static ssize_t -ftrace_pid_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, int type) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; - struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *filtered_pids; + struct trace_pid_list *other_pids; struct trace_pid_list *pid_list; ssize_t ret; @@ -7124,19 +7226,39 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, mutex_lock(&ftrace_lock); - filtered_pids = rcu_dereference_protected(tr->function_pids, + switch (type) { + case TRACE_PIDS: + filtered_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + other_pids = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + break; + case TRACE_NO_PIDS: + filtered_pids = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + other_pids = rcu_dereference_protected(tr->function_pids, lockdep_is_held(&ftrace_lock)); + break; + } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) goto out; - rcu_assign_pointer(tr->function_pids, pid_list); + switch (type) { + case TRACE_PIDS: + rcu_assign_pointer(tr->function_pids, pid_list); + break; + case TRACE_NO_PIDS: + rcu_assign_pointer(tr->function_no_pids, pid_list); + break; + } + if (filtered_pids) { synchronize_rcu(); trace_free_pid_list(filtered_pids); - } else if (pid_list) { + } else if (pid_list && !other_pids) { /* Register a probe to set whether to ignore the tracing of a task */ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); } @@ -7159,6 +7281,20 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, return ret; } +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); +} + +static ssize_t +ftrace_no_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); +} + static int ftrace_pid_release(struct inode *inode, struct file *file) { @@ -7177,10 +7313,20 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; +static const struct file_operations ftrace_no_pid_fops = { + .open = ftrace_no_pid_open, + .write = ftrace_no_pid_write, + .read = seq_read, + .llseek = tracing_lseek, + .release = ftrace_pid_release, +}; + void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { trace_create_file("set_ftrace_pid", 0644, d_tracer, tr, &ftrace_pid_fops); + trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer, + tr, &ftrace_no_pid_fops); } void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 61f0e92ace99..6f0b42ceeb00 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -441,6 +441,7 @@ enum { struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; + atomic_t resize_disabled; struct trace_buffer *buffer; raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; @@ -484,7 +485,6 @@ struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; - atomic_t resize_disabled; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; @@ -503,10 +503,14 @@ struct trace_buffer { struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; + unsigned long next_event; struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; u64 read_stamp; + u64 page_stamp; + struct ring_buffer_event *event; + int missed_events; }; /** @@ -1737,18 +1741,24 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, size = nr_pages * BUF_PAGE_SIZE; - /* - * Don't succeed if resizing is disabled, as a reader might be - * manipulating the ring buffer and is expecting a sane state while - * this is true. - */ - if (atomic_read(&buffer->resize_disabled)) - return -EBUSY; - /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); + if (cpu_id == RING_BUFFER_ALL_CPUS) { + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (atomic_read(&cpu_buffer->resize_disabled)) { + err = -EBUSY; + goto out_err_unlock; + } + } + /* calculate the pages to update */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -1816,6 +1826,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages == cpu_buffer->nr_pages) goto out; + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. + */ + if (atomic_read(&cpu_buffer->resize_disabled)) { + err = -EBUSY; + goto out_err_unlock; + } + cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; @@ -1885,6 +1905,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, free_buffer_page(bpage); } } + out_err_unlock: mutex_unlock(&buffer->mutex); return err; } @@ -1913,15 +1934,63 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static __always_inline struct ring_buffer_event * -rb_iter_head_event(struct ring_buffer_iter *iter) +static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) { - return __rb_page_index(iter->head_page, iter->head); + return local_read(&bpage->page->commit); } -static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) +static struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) { - return local_read(&bpage->page->commit); + struct ring_buffer_event *event; + struct buffer_page *iter_head_page = iter->head_page; + unsigned long commit; + unsigned length; + + if (iter->head != iter->next_event) + return iter->event; + + /* + * When the writer goes across pages, it issues a cmpxchg which + * is a mb(), which will synchronize with the rmb here. + * (see rb_tail_page_update() and __rb_reserve_next()) + */ + commit = rb_page_commit(iter_head_page); + smp_rmb(); + event = __rb_page_index(iter_head_page, iter->head); + length = rb_event_length(event); + + /* + * READ_ONCE() doesn't work on functions and we don't want the + * compiler doing any crazy optimizations with length. + */ + barrier(); + + if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE) + /* Writer corrupted the read? */ + goto reset; + + memcpy(iter->event, event, length); + /* + * If the page stamp is still the same after this rmb() then the + * event was safely copied without the writer entering the page. + */ + smp_rmb(); + + /* Make sure the page didn't change since we read this */ + if (iter->page_stamp != iter_head_page->page->time_stamp || + commit > rb_page_commit(iter_head_page)) + goto reset; + + iter->next_event = iter->head + length; + return iter->event; + reset: + /* Reset to the beginning */ + iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; + iter->head = 0; + iter->next_event = 0; + iter->missed_events = 1; + return NULL; } /* Size is determined by what has been committed */ @@ -1959,8 +2028,9 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) else rb_inc_page(cpu_buffer, &iter->head_page); - iter->read_stamp = iter->head_page->page->time_stamp; + iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; + iter->next_event = 0; } /* @@ -3547,14 +3617,18 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; + iter->next_event = iter->head; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; - if (iter->head) + if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; - else + iter->page_stamp = cpu_buffer->reader_page->page->time_stamp; + } else { iter->read_stamp = iter->head_page->page->time_stamp; + iter->page_stamp = iter->read_stamp; + } } /** @@ -3590,17 +3664,38 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) struct buffer_page *reader; struct buffer_page *head_page; struct buffer_page *commit_page; + struct buffer_page *curr_commit_page; unsigned commit; + u64 curr_commit_ts; + u64 commit_ts; cpu_buffer = iter->cpu_buffer; - - /* Remember, trace recording is off when iterator is in use */ reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; commit_page = cpu_buffer->commit_page; + commit_ts = commit_page->page->time_stamp; + + /* + * When the writer goes across pages, it issues a cmpxchg which + * is a mb(), which will synchronize with the rmb here. + * (see rb_tail_page_update()) + */ + smp_rmb(); commit = rb_page_commit(commit_page); + /* We want to make sure that the commit page doesn't change */ + smp_rmb(); - return ((iter->head_page == commit_page && iter->head == commit) || + /* Make sure commit page didn't change */ + curr_commit_page = READ_ONCE(cpu_buffer->commit_page); + curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp); + + /* If the commit page changed, then there's more data */ + if (curr_commit_page != commit_page || + curr_commit_ts != commit_ts) + return 0; + + /* Still racy, as it may return a false positive, but that's OK */ + return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && iter->head == rb_page_commit(cpu_buffer->reader_page))); @@ -3828,15 +3923,22 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_event *event; - unsigned length; cpu_buffer = iter->cpu_buffer; + /* If head == next_event then we need to jump to the next event */ + if (iter->head == iter->next_event) { + /* If the event gets overwritten again, there's nothing to do */ + if (rb_iter_head_event(iter) == NULL) + return; + } + + iter->head = iter->next_event; + /* * Check if we are at the end of the buffer. */ - if (iter->head >= rb_page_size(iter->head_page)) { + if (iter->next_event >= rb_page_size(iter->head_page)) { /* discarded commits can make the page empty */ if (iter->head_page == cpu_buffer->commit_page) return; @@ -3844,27 +3946,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) return; } - event = rb_iter_head_event(iter); - - length = rb_event_length(event); - - /* - * This should not be called to advance the header if we are - * at the tail of the buffer. - */ - if (RB_WARN_ON(cpu_buffer, - (iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer)))) - return; - - rb_update_iter_read_stamp(iter, event); - - iter->head += length; - - /* check for end of page padding */ - if ((iter->head >= rb_page_size(iter->head_page)) && - (iter->head_page != cpu_buffer->commit_page)) - rb_inc_iter(iter); + rb_update_iter_read_stamp(iter, iter->event); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) @@ -3952,6 +4034,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; + bool failed = false; if (ts) *ts = 0; @@ -3978,10 +4061,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) * to a data event, we should never loop more than three times. * Once for going to next page, once on time extend, and * finally once to get the event. - * (We never hit the following condition more than thrice). + * We should never hit the following condition more than thrice, + * unless the buffer is very small, and there's a writer + * that is causing the reader to fail getting an event. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) + if (++nr_loops > 3) { + RB_WARN_ON(cpu_buffer, !failed); return NULL; + } if (rb_per_cpu_empty(cpu_buffer)) return NULL; @@ -3992,6 +4079,10 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } event = rb_iter_head_event(iter); + if (!event) { + failed = true; + goto again; + } switch (event->type_len) { case RINGBUF_TYPE_PADDING: @@ -4102,6 +4193,20 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, return event; } +/** ring_buffer_iter_dropped - report if there are dropped events + * @iter: The ring buffer iterator + * + * Returns true if there was dropped events since the last peek. + */ +bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) +{ + bool ret = iter->missed_events != 0; + + iter->missed_events = 0; + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); + /** * ring_buffer_iter_peek - peek at the next event to be read * @iter: The ring buffer iterator @@ -4208,16 +4313,21 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), flags); + iter = kzalloc(sizeof(*iter), flags); if (!iter) return NULL; + iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags); + if (!iter->event) { + kfree(iter); + return NULL; + } + cpu_buffer = buffer->buffers[cpu]; iter->cpu_buffer = cpu_buffer; - atomic_inc(&buffer->resize_disabled); - atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); return iter; } @@ -4290,42 +4400,31 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) rb_check_pages(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - atomic_dec(&cpu_buffer->record_disabled); - atomic_dec(&cpu_buffer->buffer->resize_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + kfree(iter->event); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); /** - * ring_buffer_read - read the next item in the ring buffer by the iterator + * ring_buffer_iter_advance - advance the iterator to the next location * @iter: The ring buffer iterator - * @ts: The time stamp of the event read. * - * This reads the next event in the ring buffer and increments the iterator. + * Move the location of the iterator such that the next read will + * be the next location of the iterator. */ -struct ring_buffer_event * -ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) +void ring_buffer_iter_advance(struct ring_buffer_iter *iter) { - struct ring_buffer_event *event; struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - again: - event = rb_iter_peek(iter, ts); - if (!event) - goto out; - - if (event->type_len == RINGBUF_TYPE_PADDING) - goto again; rb_advance_iter(iter); - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - return event; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } -EXPORT_SYMBOL_GPL(ring_buffer_read); +EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); /** * ring_buffer_size - return the size of the ring buffer (in bytes) @@ -4406,7 +4505,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; - atomic_inc(&buffer->resize_disabled); + atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ @@ -4427,7 +4526,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); - atomic_dec(&buffer->resize_disabled); + atomic_dec(&cpu_buffer->resize_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6b11e4e2150c..8d2b98812625 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -386,16 +386,22 @@ trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) * Returns false if @task should be traced. */ bool -trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, + struct task_struct *task) { /* - * Return false, because if filtered_pids does not exist, - * all pids are good to trace. + * If filterd_no_pids is not empty, and the task's pid is listed + * in filtered_no_pids, then return true. + * Otherwise, if filtered_pids is empty, that means we can + * trace all tasks. If it has content, then only trace pids + * within filtered_pids. */ - if (!filtered_pids) - return false; - return !trace_find_filtered_pid(filtered_pids, task->pid); + return (filtered_pids && + !trace_find_filtered_pid(filtered_pids, task->pid)) || + (filtered_no_pids && + trace_find_filtered_pid(filtered_no_pids, task->pid)); } /** @@ -3378,7 +3384,7 @@ static void trace_iterator_increment(struct trace_iterator *iter) iter->idx++; if (buf_iter) - ring_buffer_read(buf_iter, NULL); + ring_buffer_iter_advance(buf_iter); } static struct trace_entry * @@ -3388,11 +3394,15 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); - if (buf_iter) + if (buf_iter) { event = ring_buffer_iter_peek(buf_iter, ts); - else + if (lost_events) + *lost_events = ring_buffer_iter_dropped(buf_iter) ? + (unsigned long)-1 : 0; + } else { event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, lost_events); + } if (event) { iter->ent_size = ring_buffer_event_length(event); @@ -3462,11 +3472,51 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, return next; } +#define STATIC_TEMP_BUF_SIZE 128 +static char static_temp_buf[STATIC_TEMP_BUF_SIZE]; + /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, NULL, ent_ts); + /* __find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; + struct trace_entry *entry; + + /* + * If called from ftrace_dump(), then the iter->temp buffer + * will be the static_temp_buf and not created from kmalloc. + * If the entry size is greater than the buffer, we can + * not save it. Just return NULL in that case. This is only + * used to add markers when two consecutive events' time + * stamps have a large delta. See trace_print_lat_context() + */ + if (iter->temp == static_temp_buf && + STATIC_TEMP_BUF_SIZE < ent_size) + return NULL; + + /* + * The __find_next_entry() may call peek_next_entry(), which may + * call ring_buffer_peek() that may make the contents of iter->ent + * undefined. Need to copy iter->ent now. + */ + if (iter->ent && iter->ent != iter->temp) { + if ((!iter->temp || iter->temp_size < iter->ent_size) && + !WARN_ON_ONCE(iter->temp == static_temp_buf)) { + kfree(iter->temp); + iter->temp = kmalloc(iter->ent_size, GFP_KERNEL); + if (!iter->temp) + return NULL; + } + memcpy(iter->temp, iter->ent, iter->ent_size); + iter->temp_size = iter->ent_size; + iter->ent = iter->temp; + } + entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); + /* Put back the original ent_size */ + iter->ent_size = ent_size; + + return entry; } /* Find the next real entry, and increment the iterator to the next entry */ @@ -3538,7 +3588,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) if (ts >= iter->array_buffer->time_start) break; entries++; - ring_buffer_read(buf_iter, NULL); + ring_buffer_iter_advance(buf_iter); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; @@ -3981,8 +4031,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) enum print_line_t ret; if (iter->lost_events) { - trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events); + if (iter->lost_events == (unsigned long)-1) + trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n", + iter->cpu); + else + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); if (trace_seq_has_overflowed(&iter->seq)) return TRACE_TYPE_PARTIAL_LINE; } @@ -4198,6 +4252,18 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) goto release; /* + * trace_find_next_entry() may need to save off iter->ent. + * It will place it into the iter->temp buffer. As most + * events are less than 128, allocate a buffer of that size. + * If one is greater, then trace_find_next_entry() will + * allocate a new buffer to adjust for the bigger iter->ent. + * It's not critical if it fails to get allocated here. + */ + iter->temp = kmalloc(128, GFP_KERNEL); + if (iter->temp) + iter->temp_size = 128; + + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. */ @@ -4237,8 +4303,11 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - /* stop the trace while dumping if we are not opening "snapshot" */ - if (!iter->snapshot) + /* + * If pause-on-trace is enabled, then stop the trace while + * dumping, unless this is the "snapshot" file + */ + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) tracing_stop_tr(tr); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { @@ -4269,6 +4338,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); + kfree(iter->temp); kfree(iter->buffer_iter); release: seq_release_private(inode, file); @@ -4334,7 +4404,7 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - if (!iter->snapshot) + if (!iter->snapshot && tr->stop_count) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); @@ -4344,6 +4414,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); + kfree(iter->temp); kfree(iter->trace); kfree(iter->buffer_iter); seq_release_private(inode, file); @@ -4964,6 +5035,8 @@ static const char readme_msg[] = #ifdef CONFIG_FUNCTION_TRACER " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" "\t\t (function)\n" + " set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n" + "\t\t (function)\n" #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" @@ -9146,6 +9219,9 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* Simulate the iterator */ trace_init_global_iter(&iter); + /* Can not use kmalloc for iter.temp */ + iter.temp = static_temp_buf; + iter.temp_size = STATIC_TEMP_BUF_SIZE; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); @@ -9334,7 +9410,7 @@ __init static int tracer_alloc_buffers(void) goto out_free_buffer_mask; /* Only allocate trace_printk buffers if a trace_printk exists */ - if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt) /* Must be called before global_trace.buffer is allocated */ trace_printk_init_buffers(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 99372dd7d168..4eb1d004d5f2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -178,10 +178,10 @@ struct trace_array_cpu { kuid_t uid; char comm[TASK_COMM_LEN]; - bool ignore_pid; #ifdef CONFIG_FUNCTION_TRACER - bool ftrace_ignore_pid; + int ftrace_ignore_pid; #endif + bool ignore_pid; }; struct tracer; @@ -207,6 +207,30 @@ struct trace_pid_list { unsigned long *pids; }; +enum { + TRACE_PIDS = BIT(0), + TRACE_NO_PIDS = BIT(1), +}; + +static inline bool pid_type_enabled(int type, struct trace_pid_list *pid_list, + struct trace_pid_list *no_pid_list) +{ + /* Return true if the pid list in type has pids */ + return ((type & TRACE_PIDS) && pid_list) || + ((type & TRACE_NO_PIDS) && no_pid_list); +} + +static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_list, + struct trace_pid_list *no_pid_list) +{ + /* + * Turning off what is in @type, return true if the "other" + * pid list, still has pids in it. + */ + return (!(type & TRACE_PIDS) && pid_list) || + (!(type & TRACE_NO_PIDS) && no_pid_list); +} + typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); /** @@ -285,6 +309,7 @@ struct trace_array { #endif #endif struct trace_pid_list __rcu *filtered_pids; + struct trace_pid_list __rcu *filtered_no_pids; /* * max_lock is used to protect the swapping of buffers * when taking a max snapshot. The buffers themselves are @@ -331,6 +356,7 @@ struct trace_array { #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; + struct trace_pid_list __rcu *function_no_pids; #ifdef CONFIG_DYNAMIC_FTRACE /* All of these are protected by the ftrace_lock */ struct list_head func_probes; @@ -557,12 +583,7 @@ struct tracer { * caller, and we can skip the current check. */ enum { - TRACE_BUFFER_BIT, - TRACE_BUFFER_NMI_BIT, - TRACE_BUFFER_IRQ_BIT, - TRACE_BUFFER_SIRQ_BIT, - - /* Start of function recursion bits */ + /* Function recursion bits */ TRACE_FTRACE_BIT, TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, @@ -787,6 +808,7 @@ extern int pid_max; bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, struct task_struct *task); void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, @@ -1307,6 +1329,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ + C(PAUSE_ON_TRACE, "pause-on-trace"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index f22746f3c132..a523da0dae0a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -325,14 +325,16 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __field_desc( long, timestamp, tv_nsec ) __field( unsigned int, nmi_count ) __field( unsigned int, seqnum ) + __field( unsigned int, count ) ), - F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n", + F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tcount:%d\tnmi-ts:%llu\tnmi-count:%u\n", __entry->seqnum, __entry->tv_sec, __entry->tv_nsec, __entry->duration, __entry->outer_duration, + __entry->count, __entry->nmi_total_ts, __entry->nmi_count) ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f38234ecea18..242f59e7f17d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -232,10 +232,13 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; struct trace_array_cpu *data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_raw(tr->filtered_pids); - if (!pid_list) + no_pid_list = rcu_dereference_raw(tr->filtered_no_pids); + + if (!pid_list && !no_pid_list) return false; data = this_cpu_ptr(tr->array_buffer.data); @@ -510,6 +513,9 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); + + pid_list = rcu_dereference_raw(tr->filtered_no_pids); + trace_filter_add_remove_task(pid_list, NULL, task); } static void @@ -522,6 +528,9 @@ event_filter_pid_sched_process_fork(void *data, pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); + + pid_list = rcu_dereference_sched(tr->filtered_no_pids); + trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) @@ -544,13 +553,23 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; + bool ret; pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); - this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, prev) && - trace_ignore_this_task(pid_list, next)); + /* + * Sched switch is funny, as we only want to ignore it + * in the notrace case if both prev and next should be ignored. + */ + ret = trace_ignore_this_task(NULL, no_pid_list, prev) && + trace_ignore_this_task(NULL, no_pid_list, next); + + this_cpu_write(tr->array_buffer.data->ignore_pid, ret || + (trace_ignore_this_task(pid_list, NULL, prev) && + trace_ignore_this_task(pid_list, NULL, next))); } static void @@ -558,18 +577,21 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, next)); + trace_ignore_this_task(pid_list, no_pid_list, next)); } static void event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) { struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are already tracing */ @@ -577,15 +599,17 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) return; pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, task)); + trace_ignore_this_task(pid_list, no_pid_list, task)); } static void event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) { struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are not tracing */ @@ -593,23 +617,15 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) return; pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); /* Set tracing if current is enabled */ this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, no_pid_list, current)); } -static void __ftrace_clear_event_pids(struct trace_array *tr) +static void unregister_pid_events(struct trace_array *tr) { - struct trace_pid_list *pid_list; - struct trace_event_file *file; - int cpu; - - pid_list = rcu_dereference_protected(tr->filtered_pids, - lockdep_is_held(&event_mutex)); - if (!pid_list) - return; - unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); @@ -621,26 +637,55 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); +} - list_for_each_entry(file, &tr->events, list) { - clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); +static void __ftrace_clear_event_pids(struct trace_array *tr, int type) +{ + struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; + struct trace_event_file *file; + int cpu; + + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + + /* Make sure there's something to do */ + if (!pid_type_enabled(type, pid_list, no_pid_list)) + return; + + if (!still_need_pid_events(type, pid_list, no_pid_list)) { + unregister_pid_events(tr); + + list_for_each_entry(file, &tr->events, list) { + clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; } - for_each_possible_cpu(cpu) - per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; + if (type & TRACE_PIDS) + rcu_assign_pointer(tr->filtered_pids, NULL); - rcu_assign_pointer(tr->filtered_pids, NULL); + if (type & TRACE_NO_PIDS) + rcu_assign_pointer(tr->filtered_no_pids, NULL); /* Wait till all users are no longer using pid filtering */ tracepoint_synchronize_unregister(); - trace_free_pid_list(pid_list); + if ((type & TRACE_PIDS) && pid_list) + trace_free_pid_list(pid_list); + + if ((type & TRACE_NO_PIDS) && no_pid_list) + trace_free_pid_list(no_pid_list); } -static void ftrace_clear_event_pids(struct trace_array *tr) +static void ftrace_clear_event_pids(struct trace_array *tr, int type) { mutex_lock(&event_mutex); - __ftrace_clear_event_pids(tr); + __ftrace_clear_event_pids(tr, type); mutex_unlock(&event_mutex); } @@ -1013,15 +1058,32 @@ static void t_stop(struct seq_file *m, void *p) } static void * -p_next(struct seq_file *m, void *v, loff_t *pos) +__next(struct seq_file *m, void *v, loff_t *pos, int type) { struct trace_array *tr = m->private; - struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + struct trace_pid_list *pid_list; + + if (type == TRACE_PIDS) + pid_list = rcu_dereference_sched(tr->filtered_pids); + else + pid_list = rcu_dereference_sched(tr->filtered_no_pids); return trace_pid_next(pid_list, v, pos); } -static void *p_start(struct seq_file *m, loff_t *pos) +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + return __next(m, v, pos, TRACE_PIDS); +} + +static void * +np_next(struct seq_file *m, void *v, loff_t *pos) +{ + return __next(m, v, pos, TRACE_NO_PIDS); +} + +static void *__start(struct seq_file *m, loff_t *pos, int type) __acquires(RCU) { struct trace_pid_list *pid_list; @@ -1036,7 +1098,10 @@ static void *p_start(struct seq_file *m, loff_t *pos) mutex_lock(&event_mutex); rcu_read_lock_sched(); - pid_list = rcu_dereference_sched(tr->filtered_pids); + if (type == TRACE_PIDS) + pid_list = rcu_dereference_sched(tr->filtered_pids); + else + pid_list = rcu_dereference_sched(tr->filtered_no_pids); if (!pid_list) return NULL; @@ -1044,6 +1109,18 @@ static void *p_start(struct seq_file *m, loff_t *pos) return trace_pid_start(pid_list, pos); } +static void *p_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + return __start(m, pos, TRACE_PIDS); +} + +static void *np_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + return __start(m, pos, TRACE_NO_PIDS); +} + static void p_stop(struct seq_file *m, void *p) __releases(RCU) { @@ -1588,6 +1665,7 @@ static void ignore_task_cpu(void *data) { struct trace_array *tr = data; struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; /* * This function is called by on_each_cpu() while the @@ -1595,18 +1673,50 @@ static void ignore_task_cpu(void *data) */ pid_list = rcu_dereference_protected(tr->filtered_pids, mutex_is_locked(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + mutex_is_locked(&event_mutex)); this_cpu_write(tr->array_buffer.data->ignore_pid, - trace_ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, no_pid_list, current)); +} + +static void register_pid_events(struct trace_array *tr) +{ + /* + * Register a probe that is called before all other probes + * to set ignore_pid if next or prev do not match. + * Register a probe this is called after all other probes + * to only keep ignore_pid set if next pid matches. + */ + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, + tr, 0); } static ssize_t -ftrace_event_pid_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +event_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, int type) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *other_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; ssize_t ret; @@ -1620,14 +1730,26 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, mutex_lock(&event_mutex); - filtered_pids = rcu_dereference_protected(tr->filtered_pids, - lockdep_is_held(&event_mutex)); + if (type == TRACE_PIDS) { + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + other_pids = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + } else { + filtered_pids = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + other_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) goto out; - rcu_assign_pointer(tr->filtered_pids, pid_list); + if (type == TRACE_PIDS) + rcu_assign_pointer(tr->filtered_pids, pid_list); + else + rcu_assign_pointer(tr->filtered_no_pids, pid_list); list_for_each_entry(file, &tr->events, list) { set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); @@ -1636,32 +1758,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { tracepoint_synchronize_unregister(); trace_free_pid_list(filtered_pids); - } else if (pid_list) { - /* - * Register a probe that is called before all other probes - * to set ignore_pid if next or prev do not match. - * Register a probe this is called after all other probes - * to only keep ignore_pid set if next pid matches. - */ - register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, - tr, INT_MAX); - register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, - tr, 0); - - register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, - tr, INT_MAX); - register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, - tr, 0); - - register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, - tr, INT_MAX); - register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, - tr, 0); - - register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, - tr, INT_MAX); - register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, - tr, 0); + } else if (pid_list && !other_pids) { + register_pid_events(tr); } /* @@ -1680,9 +1778,24 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, return ret; } +static ssize_t +ftrace_event_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); +} + +static ssize_t +ftrace_event_npid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); +} + static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); +static int ftrace_event_set_npid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { @@ -1706,6 +1819,13 @@ static const struct seq_operations show_set_pid_seq_ops = { .stop = p_stop, }; +static const struct seq_operations show_set_no_pid_seq_ops = { + .start = np_start, + .next = np_next, + .show = trace_pid_show, + .stop = p_stop, +}; + static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, @@ -1729,6 +1849,14 @@ static const struct file_operations ftrace_set_event_pid_fops = { .release = ftrace_event_release, }; +static const struct file_operations ftrace_set_event_notrace_pid_fops = { + .open = ftrace_event_set_npid_open, + .read = seq_read, + .write = ftrace_event_npid_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, @@ -1858,7 +1986,28 @@ ftrace_event_set_pid_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_clear_event_pids(tr); + ftrace_clear_event_pids(tr, TRACE_PIDS); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static int +ftrace_event_set_npid_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_event_pids(tr, TRACE_NO_PIDS); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) @@ -3075,6 +3224,11 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) if (!entry) pr_warn("Could not create tracefs 'set_event_pid' entry\n"); + entry = tracefs_create_file("set_event_notrace_pid", 0644, parent, + tr, &ftrace_set_event_notrace_pid_fops); + if (!entry) + pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); + /* ring buffer internal formats */ entry = trace_create_file("header_page", 0444, d_events, ring_buffer_print_page_header, @@ -3158,7 +3312,7 @@ int event_trace_del_tracer(struct trace_array *tr) clear_event_triggers(tr); /* Clear the pid list */ - __ftrace_clear_event_pids(tr); + __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index dd34a1b46a86..3a74736da363 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1088,14 +1088,10 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { - int ret = register_trigger(glob, ops, data, file); - - if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) { - unregister_trigger(glob, ops, data, file); - ret = 0; - } + if (tracing_alloc_snapshot_instance(file->tr) != 0) + return 0; - return ret; + return register_trigger(glob, ops, data, file); } static int diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7d71546ba00a..4a9c49c08ec9 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -482,7 +482,7 @@ get_return_for_leaf(struct trace_iterator *iter, /* this is a leaf, now advance the iterator */ if (ring_iter) - ring_buffer_read(ring_iter, NULL); + ring_buffer_iter_advance(ring_iter); return next; } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index a48808c43249..e2be7bb7ef7e 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -83,6 +83,7 @@ struct hwlat_sample { u64 nmi_total_ts; /* Total time spent in NMIs */ struct timespec64 timestamp; /* wall time */ int nmi_count; /* # NMIs during this sample */ + int count; /* # of iteratons over threash */ }; /* keep the global state somewhere. */ @@ -124,6 +125,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->timestamp = sample->timestamp; entry->nmi_total_ts = sample->nmi_total_ts; entry->nmi_count = sample->nmi_count; + entry->count = sample->count; if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit_nostack(buffer, event); @@ -167,12 +169,14 @@ void trace_hwlat_callback(bool enter) static int get_sample(void) { struct trace_array *tr = hwlat_trace; + struct hwlat_sample s; time_type start, t1, t2, last_t2; - s64 diff, total, last_total = 0; + s64 diff, outer_diff, total, last_total = 0; u64 sample = 0; u64 thresh = tracing_thresh; u64 outer_sample = 0; int ret = -1; + unsigned int count = 0; do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ @@ -186,6 +190,7 @@ static int get_sample(void) init_time(last_t2, 0); start = time_get(); /* start timestamp */ + outer_diff = 0; do { @@ -194,14 +199,14 @@ static int get_sample(void) if (time_u64(last_t2)) { /* Check the delta from outer loop (t2 to next t1) */ - diff = time_to_us(time_sub(t1, last_t2)); + outer_diff = time_to_us(time_sub(t1, last_t2)); /* This shouldn't happen */ - if (diff < 0) { + if (outer_diff < 0) { pr_err(BANNER "time running backwards\n"); goto out; } - if (diff > outer_sample) - outer_sample = diff; + if (outer_diff > outer_sample) + outer_sample = outer_diff; } last_t2 = t2; @@ -217,6 +222,12 @@ static int get_sample(void) /* This checks the inner loop (t1 to t2) */ diff = time_to_us(time_sub(t2, t1)); /* current diff */ + if (diff > thresh || outer_diff > thresh) { + if (!count) + ktime_get_real_ts64(&s.timestamp); + count++; + } + /* This shouldn't happen */ if (diff < 0) { pr_err(BANNER "time running backwards\n"); @@ -236,7 +247,6 @@ static int get_sample(void) /* If we exceed the threshold value, we have found a hardware latency */ if (sample > thresh || outer_sample > thresh) { - struct hwlat_sample s; u64 latency; ret = 1; @@ -249,9 +259,9 @@ static int get_sample(void) s.seqnum = hwlat_data.count; s.duration = sample; s.outer_duration = outer_sample; - ktime_get_real_ts64(&s.timestamp); s.nmi_total_ts = nmi_total_ts; s.nmi_count = nmi_count; + s.count = count; trace_hwlat_sample(&s); latency = max(sample, outer_sample); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 362cca52f5de..d0568af4a0ef 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1078,6 +1078,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev) int i; seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); + if (trace_kprobe_is_return(tk) && tk->rp.maxactive) + seq_printf(m, "%d", tk->rp.maxactive); seq_printf(m, ":%s/%s", trace_probe_group_name(&tk->tp), trace_probe_name(&tk->tp)); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b4909082f6a4..9a121e147102 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -617,22 +617,19 @@ int trace_print_context(struct trace_iterator *iter) int trace_print_lat_context(struct trace_iterator *iter) { + struct trace_entry *entry, *next_entry; struct trace_array *tr = iter->tr; - /* trace_find_next_entry will reset ent_size */ - int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; - u64 next_ts; - struct trace_entry *entry = iter->ent, - *next_entry = trace_find_next_entry(iter, NULL, - &next_ts); unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); + u64 next_ts; - /* Restore the original ent_size */ - iter->ent_size = ent_size; - + next_entry = trace_find_next_entry(iter, NULL, &next_ts); if (!next_entry) next_ts = iter->ts; + /* trace_find_next_entry() may change iter->ent */ + entry = iter->ent; + if (verbose) { char comm[TASK_COMM_LEN]; @@ -1158,12 +1155,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d", field->seqnum, field->duration, field->outer_duration, (long long)field->timestamp.tv_sec, - field->timestamp.tv_nsec); + field->timestamp.tv_nsec, field->count); if (field->nmi_count) { /* diff --git a/kernel/ucount.c b/kernel/ucount.c index a53cc2b4179c..11b1596e2542 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -69,6 +69,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), + UCOUNT_ENTRY("max_time_namespaces"), #ifdef CONFIG_INOTIFY_USER UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), @@ -81,6 +82,8 @@ bool setup_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL struct ctl_table *tbl; + + BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4e01c448b4b4..891ccad5f271 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -858,7 +858,8 @@ void wq_worker_running(struct task_struct *task) * @task: task going to sleep * * This function is called from schedule() when a busy worker is - * going to sleep. + * going to sleep. Preemption needs to be disabled to protect ->sleeping + * assignment. */ void wq_worker_sleeping(struct task_struct *task) { @@ -875,7 +876,8 @@ void wq_worker_sleeping(struct task_struct *task) pool = worker->pool; - if (WARN_ON_ONCE(worker->sleeping)) + /* Return if preempted before wq_worker_running() was reached */ + if (worker->sleeping) return; worker->sleeping = 1; @@ -2834,7 +2836,7 @@ void flush_workqueue(struct workqueue_struct *wq) * First flushers are responsible for cascading flushes and * handling overflow. Non-first flushers can simply return. */ - if (wq->first_flusher != &this_flusher) + if (READ_ONCE(wq->first_flusher) != &this_flusher) return; mutex_lock(&wq->mutex); @@ -2843,7 +2845,7 @@ void flush_workqueue(struct workqueue_struct *wq) if (wq->first_flusher != &this_flusher) goto out_unlock; - wq->first_flusher = NULL; + WRITE_ONCE(wq->first_flusher, NULL); WARN_ON_ONCE(!list_empty(&this_flusher.list)); WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); @@ -5898,7 +5900,7 @@ static void __init wq_numa_init(void) * items. Actual work item execution starts only after kthreads can be * created and scheduled right before early initcalls. */ -int __init workqueue_init_early(void) +void __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; @@ -5965,8 +5967,6 @@ int __init workqueue_init_early(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); - - return 0; } /** @@ -5978,7 +5978,7 @@ int __init workqueue_init_early(void) * are no kworkers executing the work items yet. Populate the worker pools * with the initial workers and enable future kworker creations. */ -int __init workqueue_init(void) +void __init workqueue_init(void) { struct workqueue_struct *wq; struct worker_pool *pool; @@ -6025,6 +6025,4 @@ int __init workqueue_init(void) wq_online = true; wq_watchdog_init(); - - return 0; } |