summaryrefslogtreecommitdiffstats
path: root/fs/pidfs.c
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2024-02-12 16:32:38 +0100
committerChristian Brauner <brauner@kernel.org>2024-03-01 12:23:37 +0100
commitcb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b (patch)
treee5e957643dfd8dff83de3eec242e69678463cdbd /fs/pidfs.c
parent50f4f2d197e194ec0356962b99ca2b72e9a37bc8 (diff)
downloadlinux-stable-cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b.tar.gz
linux-stable-cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b.tar.bz2
linux-stable-cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b.zip
pidfd: add pidfs
This moves pidfds from the anonymous inode infrastructure to a tiny pseudo filesystem. This has been on my todo for quite a while as it will unblock further work that we weren't able to do simply because of the very justified limitations of anonymous inodes. Moving pidfds to a tiny pseudo filesystem allows: * statx() on pidfds becomes useful for the first time. * pidfds can be compared simply via statx() and then comparing inode numbers. * pidfds have unique inode numbers for the system lifetime. * struct pid is now stashed in inode->i_private instead of file->private_data. This means it is now possible to introduce concepts that operate on a process once all file descriptors have been closed. A concrete example is kill-on-last-close. * file->private_data is freed up for per-file options for pidfds. * Each struct pid will refer to a different inode but the same struct pid will refer to the same inode if it's opened multiple times. In contrast to now where each struct pid refers to the same inode. Even if we were to move to anon_inode_create_getfile() which creates new inodes we'd still be associating the same struct pid with multiple different inodes. The tiny pseudo filesystem is not visible anywhere in userspace exactly like e.g., pipefs and sockfs. There's no lookup, there's no complex inode operations, nothing. Dentries and inodes are always deleted when the last pidfd is closed. We allocate a new inode for each struct pid and we reuse that inode for all pidfds. We use iget_locked() to find that inode again based on the inode number which isn't recycled. We allocate a new dentry for each pidfd that uses the same inode. That is similar to anonymous inodes which reuse the same inode for thousands of dentries. For pidfds we're talking way less than that. There usually won't be a lot of concurrent openers of the same struct pid. They can probably often be counted on two hands. I know that systemd does use separate pidfd for the same struct pid for various complex process tracking issues. So I think with that things actually become way simpler. Especially because we don't have to care about lookup. Dentries and inodes continue to be always deleted. The code is entirely optional and fairly small. If it's not selected we fallback to anonymous inodes. Heavily inspired by nsfs which uses a similar stashing mechanism just for namespaces. Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-2-f863f58cfce1@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'fs/pidfs.c')
-rw-r--r--fs/pidfs.c156
1 files changed, 153 insertions, 3 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c
index eccb291862a0..6c3f010074af 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -1,9 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
+#include <linux/pidfs.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
@@ -14,10 +16,12 @@
static int pidfd_release(struct inode *inode, struct file *file)
{
+#ifndef CONFIG_FS_PID
struct pid *pid = file->private_data;
file->private_data = NULL;
put_pid(pid);
+#endif
return 0;
}
@@ -59,7 +63,7 @@ static int pidfd_release(struct inode *inode, struct file *file)
*/
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
- struct pid *pid = f->private_data;
+ struct pid *pid = pidfd_pid(f);
struct pid_namespace *ns;
pid_t nr = -1;
@@ -93,7 +97,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
*/
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
- struct pid *pid = file->private_data;
+ struct pid *pid = pidfd_pid(file);
bool thread = file->f_flags & PIDFD_THREAD;
struct task_struct *task;
__poll_t poll_flags = 0;
@@ -113,10 +117,156 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
return poll_flags;
}
-const struct file_operations pidfd_fops = {
+static const struct file_operations pidfs_file_operations = {
.release = pidfd_release,
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
#endif
};
+
+struct pid *pidfd_pid(const struct file *file)
+{
+ if (file->f_op != &pidfs_file_operations)
+ return ERR_PTR(-EBADF);
+#ifdef CONFIG_FS_PID
+ return file_inode(file)->i_private;
+#else
+ return file->private_data;
+#endif
+}
+
+#ifdef CONFIG_FS_PID
+static struct vfsmount *pidfs_mnt __ro_after_init;
+static struct super_block *pidfs_sb __ro_after_init;
+
+/*
+ * The vfs falls back to simple_setattr() if i_op->setattr() isn't
+ * implemented. Let's reject it completely until we have a clean
+ * permission concept for pidfds.
+ */
+static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static const struct inode_operations pidfs_inode_operations = {
+ .getattr = pidfs_getattr,
+ .setattr = pidfs_setattr,
+};
+
+static void pidfs_evict_inode(struct inode *inode)
+{
+ struct pid *pid = inode->i_private;
+
+ clear_inode(inode);
+ put_pid(pid);
+}
+
+static const struct super_operations pidfs_sops = {
+ .drop_inode = generic_delete_inode,
+ .evict_inode = pidfs_evict_inode,
+ .statfs = simple_statfs,
+};
+
+static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ return dynamic_dname(buffer, buflen, "pidfd:[%lu]",
+ d_inode(dentry)->i_ino);
+}
+
+static const struct dentry_operations pidfs_dentry_operations = {
+ .d_delete = always_delete_dentry,
+ .d_dname = pidfs_dname,
+};
+
+static int pidfs_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, PID_FS_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ops = &pidfs_sops;
+ ctx->dops = &pidfs_dentry_operations;
+ return 0;
+}
+
+static struct file_system_type pidfs_type = {
+ .name = "pidfs",
+ .init_fs_context = pidfs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+
+ struct inode *inode;
+ struct file *pidfd_file;
+
+ inode = iget_locked(pidfs_sb, pid->ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (inode->i_state & I_NEW) {
+ /*
+ * Inode numbering for pidfs start at RESERVED_PIDS + 1.
+ * This avoids collisions with the root inode which is 1
+ * for pseudo filesystems.
+ */
+ inode->i_ino = pid->ino;
+ inode->i_mode = S_IFREG | S_IRUGO;
+ inode->i_op = &pidfs_inode_operations;
+ inode->i_fop = &pidfs_file_operations;
+ inode->i_flags |= S_IMMUTABLE;
+ inode->i_private = get_pid(pid);
+ simple_inode_init_ts(inode);
+ unlock_new_inode(inode);
+ }
+
+ pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags,
+ &pidfs_file_operations);
+ if (IS_ERR(pidfd_file))
+ iput(inode);
+
+ return pidfd_file;
+}
+
+void __init pidfs_init(void)
+{
+ pidfs_mnt = kern_mount(&pidfs_type);
+ if (IS_ERR(pidfs_mnt))
+ panic("Failed to mount pidfs pseudo filesystem");
+
+ pidfs_sb = pidfs_mnt->mnt_sb;
+}
+
+#else /* !CONFIG_FS_PID */
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+ struct file *pidfd_file;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid,
+ flags | O_RDWR);
+ if (IS_ERR(pidfd_file))
+ return pidfd_file;
+
+ get_pid(pid);
+ return pidfd_file;
+}
+
+void __init pidfs_init(void) { }
+#endif