diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/Kconfig | 7 | ||||
-rw-r--r-- | fs/Makefile | 2 | ||||
-rw-r--r-- | fs/exec.c | 1 | ||||
-rw-r--r-- | fs/internal.h | 7 | ||||
-rw-r--r-- | fs/libfs.c | 142 | ||||
-rw-r--r-- | fs/nsfs.c | 121 | ||||
-rw-r--r-- | fs/pidfs.c | 290 |
7 files changed, 486 insertions, 84 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index ea2f77446080..4bc7dd420874 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -173,6 +173,13 @@ source "fs/proc/Kconfig" source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" +config FS_PID + bool "Pseudo filesystem for process file descriptors" + depends on 64BIT + default y + help + Pidfs implements advanced features for process file descriptors. + config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM diff --git a/fs/Makefile b/fs/Makefile index c32b8c586800..6ecc9b0a53f2 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -15,7 +15,7 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o mnt_idmapping.o remap_range.o + kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/exec.c b/fs/exec.c index af4fbb61cd53..ece3ab0998e1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1158,7 +1158,6 @@ static int de_thread(struct task_struct *tsk) BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; - /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees diff --git a/fs/internal.h b/fs/internal.h index b67406435fc0..7d3edcdf59cc 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -310,3 +310,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); +struct stashed_operations { + void (*put_data)(void *data); + void (*init_inode)(struct inode *inode, void *data); +}; +int path_from_stashed(struct dentry **stashed, unsigned long ino, + struct vfsmount *mnt, void *data, struct path *path); +void stashed_dentry_prune(struct dentry *dentry); diff --git a/fs/libfs.c b/fs/libfs.c index 78c71a9e2e18..680c727d1bbc 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -23,6 +23,7 @@ #include <linux/fsnotify.h> #include <linux/unicode.h> #include <linux/fscrypt.h> +#include <linux/pidfs.h> #include <linux/uaccess.h> @@ -1985,3 +1986,144 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) return ts; } EXPORT_SYMBOL(simple_inode_init_ts); + +static inline struct dentry *get_stashed_dentry(struct dentry *stashed) +{ + struct dentry *dentry; + + guard(rcu)(); + dentry = READ_ONCE(stashed); + if (!dentry) + return NULL; + if (!lockref_get_not_dead(&dentry->d_lockref)) + return NULL; + return dentry; +} + +static struct dentry *prepare_anon_dentry(struct dentry **stashed, + unsigned long ino, + struct super_block *sb, + void *data) +{ + struct dentry *dentry; + struct inode *inode; + const struct stashed_operations *sops = sb->s_fs_info; + + dentry = d_alloc_anon(sb); + if (!dentry) + return ERR_PTR(-ENOMEM); + + inode = new_inode_pseudo(sb); + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); + } + + inode->i_ino = ino; + inode->i_flags |= S_IMMUTABLE; + inode->i_mode = S_IFREG; + simple_inode_init_ts(inode); + sops->init_inode(inode, data); + + /* Notice when this is changed. */ + WARN_ON_ONCE(!S_ISREG(inode->i_mode)); + WARN_ON_ONCE(!IS_IMMUTABLE(inode)); + + /* Store address of location where dentry's supposed to be stashed. */ + dentry->d_fsdata = stashed; + + /* @data is now owned by the fs */ + d_instantiate(dentry, inode); + return dentry; +} + +static struct dentry *stash_dentry(struct dentry **stashed, + struct dentry *dentry) +{ + guard(rcu)(); + for (;;) { + struct dentry *old; + + /* Assume any old dentry was cleared out. */ + old = cmpxchg(stashed, NULL, dentry); + if (likely(!old)) + return dentry; + + /* Check if somebody else installed a reusable dentry. */ + if (lockref_get_not_dead(&old->d_lockref)) + return old; + + /* There's an old dead dentry there, try to take it over. */ + if (likely(try_cmpxchg(stashed, &old, dentry))) + return dentry; + } +} + +/** + * path_from_stashed - create path from stashed or new dentry + * @stashed: where to retrieve or stash dentry + * @ino: inode number to use + * @mnt: mnt of the filesystems to use + * @data: data to store in inode->i_private + * @path: path to create + * + * The function tries to retrieve a stashed dentry from @stashed. If the dentry + * is still valid then it will be reused. If the dentry isn't able the function + * will allocate a new dentry and inode. It will then check again whether it + * can reuse an existing dentry in case one has been added in the meantime or + * update @stashed with the newly added dentry. + * + * Special-purpose helper for nsfs and pidfs. + * + * Return: On success zero and on failure a negative error is returned. + */ +int path_from_stashed(struct dentry **stashed, unsigned long ino, + struct vfsmount *mnt, void *data, struct path *path) +{ + struct dentry *dentry; + const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; + + /* See if dentry can be reused. */ + path->dentry = get_stashed_dentry(*stashed); + if (path->dentry) { + sops->put_data(data); + goto out_path; + } + + /* Allocate a new dentry. */ + dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data); + if (IS_ERR(dentry)) { + sops->put_data(data); + return PTR_ERR(dentry); + } + + /* Added a new dentry. @data is now owned by the filesystem. */ + path->dentry = stash_dentry(stashed, dentry); + if (path->dentry != dentry) + dput(dentry); + +out_path: + WARN_ON_ONCE(path->dentry->d_fsdata != stashed); + WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); + path->mnt = mntget(mnt); + return 0; +} + +void stashed_dentry_prune(struct dentry *dentry) +{ + struct dentry **stashed = dentry->d_fsdata; + struct inode *inode = d_inode(dentry); + + if (WARN_ON_ONCE(!stashed)) + return; + + if (!inode) + return; + + /* + * Only replace our own @dentry as someone else might've + * already cleared out @dentry and stashed their own + * dentry in there. + */ + cmpxchg(stashed, dentry, NULL); +} diff --git a/fs/nsfs.c b/fs/nsfs.c index 34e1e3e36733..7aaafb5cb9fc 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -27,26 +27,17 @@ static const struct file_operations ns_file_operations = { static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = d_inode(dentry); - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + struct ns_common *ns = inode->i_private; + const struct proc_ns_operations *ns_ops = ns->ops; return dynamic_dname(buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); } -static void ns_prune_dentry(struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - if (inode) { - struct ns_common *ns = inode->i_private; - atomic_long_set(&ns->stashed, 0); - } -} - -const struct dentry_operations ns_dentry_operations = -{ - .d_prune = ns_prune_dentry, +const struct dentry_operations ns_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = ns_dname, + .d_prune = stashed_dentry_prune, }; static void nsfs_evict(struct inode *inode) @@ -56,67 +47,16 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -static int __ns_get_path(struct path *path, struct ns_common *ns) -{ - struct vfsmount *mnt = nsfs_mnt; - struct dentry *dentry; - struct inode *inode; - unsigned long d; - - rcu_read_lock(); - d = atomic_long_read(&ns->stashed); - if (!d) - goto slow; - dentry = (struct dentry *)d; - if (!lockref_get_not_dead(&dentry->d_lockref)) - goto slow; - rcu_read_unlock(); - ns->ops->put(ns); -got_it: - path->mnt = mntget(mnt); - path->dentry = dentry; - return 0; -slow: - rcu_read_unlock(); - inode = new_inode_pseudo(mnt->mnt_sb); - if (!inode) { - ns->ops->put(ns); - return -ENOMEM; - } - inode->i_ino = ns->inum; - simple_inode_init_ts(inode); - inode->i_flags |= S_IMMUTABLE; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - inode->i_private = ns; - - dentry = d_make_root(inode); /* not the normal use, but... */ - if (!dentry) - return -ENOMEM; - dentry->d_fsdata = (void *)ns->ops; - d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); - if (d) { - d_delete(dentry); /* make sure ->d_prune() does nothing */ - dput(dentry); - cpu_relax(); - return -EAGAIN; - } - goto got_it; -} - int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { - int ret; + struct ns_common *ns; - do { - struct ns_common *ns = ns_get_cb(private_data); - if (!ns) - return -ENOENT; - ret = __ns_get_path(path, ns); - } while (ret == -EAGAIN); + ns = ns_get_cb(private_data); + if (!ns) + return -ENOENT; - return ret; + return path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, ns, path); } struct ns_get_path_task_args { @@ -146,6 +86,7 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct path path = {}; + struct ns_common *relative; struct file *f; int err; int fd; @@ -154,19 +95,15 @@ int open_related_ns(struct ns_common *ns, if (fd < 0) return fd; - do { - struct ns_common *relative; - - relative = get_ns(ns); - if (IS_ERR(relative)) { - put_unused_fd(fd); - return PTR_ERR(relative); - } - - err = __ns_get_path(&path, relative); - } while (err == -EAGAIN); + relative = get_ns(ns); + if (IS_ERR(relative)) { + put_unused_fd(fd); + return PTR_ERR(relative); + } - if (err) { + err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt, + relative, &path); + if (err < 0) { put_unused_fd(fd); return err; } @@ -249,7 +186,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + const struct ns_common *ns = inode->i_private; + const struct proc_ns_operations *ns_ops = ns->ops; seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); return 0; @@ -261,6 +199,24 @@ static const struct super_operations nsfs_ops = { .show_path = nsfs_show_path, }; +static void nsfs_init_inode(struct inode *inode, void *data) +{ + inode->i_private = data; + inode->i_mode |= S_IRUGO; + inode->i_fop = &ns_file_operations; +} + +static void nsfs_put_data(void *data) +{ + struct ns_common *ns = data; + ns->ops->put(ns); +} + +static const struct stashed_operations nsfs_stashed_ops = { + .init_inode = nsfs_init_inode, + .put_data = nsfs_put_data, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); @@ -268,6 +224,7 @@ static int nsfs_init_fs_context(struct fs_context *fc) return -ENOMEM; ctx->ops = &nsfs_ops; ctx->dops = &ns_dentry_operations; + fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; } diff --git a/fs/pidfs.c b/fs/pidfs.c new file mode 100644 index 000000000000..8fd71a00be9c --- /dev/null +++ b/fs/pidfs.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/magic.h> +#include <linux/mount.h> +#include <linux/pid.h> +#include <linux/pidfs.h> +#include <linux/pid_namespace.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/pseudo_fs.h> +#include <linux/seq_file.h> +#include <uapi/linux/pidfd.h> + +#include "internal.h" + +static int pidfd_release(struct inode *inode, struct file *file) +{ +#ifndef CONFIG_FS_PID + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); +#endif + return 0; +} + +#ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. + * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid of a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc/<pid>/status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. where no ancestral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid *pid = pidfd_pid(f); + struct pid_namespace *ns; + pid_t nr = -1; + + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)->i_sb); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); + +#ifdef CONFIG_PID_NS + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { + int i; + + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. + */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); + } +#endif + seq_putc(m, '\n'); +} +#endif + +/* + * Poll support for process exit notification. + */ +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct pid *pid = pidfd_pid(file); + bool thread = file->f_flags & PIDFD_THREAD; + struct task_struct *task; + __poll_t poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + /* + * Depending on PIDFD_THREAD, inform pollers when the thread + * or the whole thread-group exits. + */ + guard(rcu)(); + task = pid_task(pid, PIDTYPE_PID); + if (!task) + poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; + else if (task->exit_state && (thread || thread_group_empty(task))) + poll_flags = EPOLLIN | EPOLLRDNORM; + + return poll_flags; +} + +static const struct file_operations pidfs_file_operations = { + .release = pidfd_release, + .poll = pidfd_poll, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op != &pidfs_file_operations) + return ERR_PTR(-EBADF); +#ifdef CONFIG_FS_PID + return file_inode(file)->i_private; +#else + return file->private_data; +#endif +} + +#ifdef CONFIG_FS_PID +static struct vfsmount *pidfs_mnt __ro_after_init; + +/* + * The vfs falls back to simple_setattr() if i_op->setattr() isn't + * implemented. Let's reject it completely until we have a clean + * permission concept for pidfds. + */ +static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EOPNOTSUPP; +} + +static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + return 0; +} + +static const struct inode_operations pidfs_inode_operations = { + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, +}; + +static void pidfs_evict_inode(struct inode *inode) +{ + struct pid *pid = inode->i_private; + + clear_inode(inode); + put_pid(pid); +} + +static const struct super_operations pidfs_sops = { + .drop_inode = generic_delete_inode, + .evict_inode = pidfs_evict_inode, + .statfs = simple_statfs, +}; + +static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(buffer, buflen, "pidfd:[%lu]", + d_inode(dentry)->i_ino); +} + +static const struct dentry_operations pidfs_dentry_operations = { + .d_delete = always_delete_dentry, + .d_dname = pidfs_dname, + .d_prune = stashed_dentry_prune, +}; + +static void pidfs_init_inode(struct inode *inode, void *data) +{ + inode->i_private = data; + inode->i_flags |= S_PRIVATE; + inode->i_mode |= S_IRWXU; + inode->i_op = &pidfs_inode_operations; + inode->i_fop = &pidfs_file_operations; +} + +static void pidfs_put_data(void *data) +{ + struct pid *pid = data; + put_pid(pid); +} + +static const struct stashed_operations pidfs_stashed_ops = { + .init_inode = pidfs_init_inode, + .put_data = pidfs_put_data, +}; + +static int pidfs_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, PID_FS_MAGIC); + if (!ctx) + return -ENOMEM; + + ctx->ops = &pidfs_sops; + ctx->dops = &pidfs_dentry_operations; + fc->s_fs_info = (void *)&pidfs_stashed_ops; + return 0; +} + +static struct file_system_type pidfs_type = { + .name = "pidfs", + .init_fs_context = pidfs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + + struct file *pidfd_file; + struct path path; + int ret; + + /* + * Inode numbering for pidfs start at RESERVED_PIDS + 1. + * This avoids collisions with the root inode which is 1 + * for pseudo filesystems. + */ + ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt, + get_pid(pid), &path); + if (ret < 0) + return ERR_PTR(ret); + + pidfd_file = dentry_open(&path, flags, current_cred()); + path_put(&path); + return pidfd_file; +} + +void __init pidfs_init(void) +{ + pidfs_mnt = kern_mount(&pidfs_type); + if (IS_ERR(pidfs_mnt)) + panic("Failed to mount pidfs pseudo filesystem"); +} + +bool is_pidfs_sb(const struct super_block *sb) +{ + return sb == pidfs_mnt->mnt_sb; +} + +#else /* !CONFIG_FS_PID */ + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + struct file *pidfd_file; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid, + flags | O_RDWR); + if (IS_ERR(pidfd_file)) + return pidfd_file; + + get_pid(pid); + return pidfd_file; +} + +void __init pidfs_init(void) { } +bool is_pidfs_sb(const struct super_block *sb) +{ + return false; +} +#endif |