From 3e1866410f11356a9fd869beb3e95983dc79c067 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 Aug 2014 01:33:38 -0700 Subject: mnt: Implicitly add MNT_NODEV on remount when it was implicitly added by mount Now that remount is properly enforcing the rule that you can't remove nodev at least sandstorm.io is breaking when performing a remount. It turns out that there is an easy intuitive solution implicitly add nodev on remount when nodev was implicitly added on mount. Tested-by: Cedric Bosdonnat Tested-by: Richard Weinberger Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 5b66b2b3624d..3a1a87dc33df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2098,7 +2098,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) { - return -EPERM; + /* Was the nodev implicitly added in mount? */ + if ((mnt->mnt_ns->user_ns != &init_user_ns) && + !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { + mnt_flags |= MNT_NODEV; + } else { + return -EPERM; + } } if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) { -- cgit v1.2.3 From b2f5d4dc38e034eecb7987e513255265ff9aa1cf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 4 Oct 2014 14:44:03 -0700 Subject: umount: Disallow unprivileged mount force Forced unmount affects not just the mount namespace but the underlying superblock as well. Restrict forced unmount to the global root user for now. Otherwise it becomes possible a user in a less privileged mount namespace to force the shutdown of a superblock of a filesystem in a more privileged mount namespace, allowing a DOS attack on root. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 3a1a87dc33df..43b16af8af30 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1544,6 +1544,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto dput_and_out; + retval = -EPERM; + if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + goto dput_and_out; retval = do_umount(mnt, flags); dput_and_out: -- cgit v1.2.3 From da362b09e42ee0bcaf0356afee6078b4f324baff Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Oct 2014 12:19:53 -0700 Subject: umount: Do not allow unmounting rootfs. Andrew Vagin writes: > #define _GNU_SOURCE > #include > #include > #include > #include > #include > #include > > int main(int argc, char **argv) > { > int fd; > > fd = open("/proc/self/ns/mnt", O_RDONLY); > if (fd < 0) > return 1; > while (1) { > if (umount2("/", MNT_DETACH) || > setns(fd, CLONE_NEWNS)) > break; > } > > return 0; > } > > root@ubuntu:/home/avagin# gcc -Wall nsenter.c -o nsenter > root@ubuntu:/home/avagin# strace ./nsenter > execve("./nsenter", ["./nsenter"], [/* 22 vars */]) = 0 > ... > open("/proc/self/ns/mnt", O_RDONLY) = 3 > umount("/", MNT_DETACH) = 0 > setns(3, 131072) = 0 > umount("/", MNT_DETACH > causes: > [ 260.548301] ------------[ cut here ]------------ > [ 260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372! > [ 260.552068] invalid opcode: 0000 [#1] SMP > [ 260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64 sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse floppy > [ 260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted 3.13.0-30-generic #55-Ubuntu > [ 260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > [ 260.552068] task: ffff8800376097f0 ti: ffff880074824000 task.ti: ffff880074824000 > [ 260.552068] RIP: 0010:[] [] propagate_umount+0x123/0x130 > [ 260.552068] RSP: 0018:ffff880074825e98 EFLAGS: 00010246 > [ 260.552068] RAX: ffff88007c741140 RBX: 0000000000000002 RCX: ffff88007c741190 > [ 260.552068] RDX: ffff88007c741190 RSI: ffff880074825ec0 RDI: ffff880074825ec0 > [ 260.552068] RBP: ffff880074825eb0 R08: 00000000000172e0 R09: ffff88007fc172e0 > [ 260.552068] R10: ffffffff811cc642 R11: ffffea0001d59000 R12: ffff88007c741140 > [ 260.552068] R13: ffff88007c741140 R14: ffff88007c741140 R15: 0000000000000000 > [ 260.552068] FS: 00007fd5c7e41740(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 > [ 260.552068] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 260.552068] CR2: 00007fd5c7968050 CR3: 0000000070124000 CR4: 00000000000406f0 > [ 260.552068] Stack: > [ 260.552068] 0000000000000002 0000000000000002 ffff88007c631000 ffff880074825ed8 > [ 260.552068] ffffffff811dcfac ffff88007c741140 0000000000000002 ffff88007c741160 > [ 260.552068] ffff880074825f38 ffffffff811dd12b ffffffff811cc642 0000000075640000 > [ 260.552068] Call Trace: > [ 260.552068] [] umount_tree+0x20c/0x260 > [ 260.552068] [] do_umount+0x12b/0x300 > [ 260.552068] [] ? final_putname+0x22/0x50 > [ 260.552068] [] ? putname+0x29/0x40 > [ 260.552068] [] SyS_umount+0xdc/0x100 > [ 260.552068] [] tracesys+0xe1/0xe6 > [ 260.552068] Code: 89 50 08 48 8b 50 08 48 89 02 49 89 45 08 e9 72 ff ff ff 0f 1f 44 00 00 4c 89 e6 4c 89 e7 e8 f5 f6 ff ff 48 89 c3 e9 39 ff ff ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 90 66 66 66 66 90 55 b8 01 > [ 260.552068] RIP [] propagate_umount+0x123/0x130 > [ 260.552068] RSP > [ 260.611451] ---[ end trace 11c33d85f1d4c652 ]-- Which in practice is totally uninteresting. Only the global root user can do it, and it is just a stupid thing to do. However that is no excuse to allow a silly way to oops the kernel. We can avoid this silly problem by setting MNT_LOCKED on the rootfs mount point and thus avoid needing any special cases in the unmount code. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 43b16af8af30..15d0328bd035 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3011,6 +3011,7 @@ static void __init init_mount_tree(void) root.mnt = mnt; root.dentry = mnt->mnt_root; + mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); -- cgit v1.2.3 From 8486a7882b5ba906992fd78bbfcefaae7fe285cc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Oct 2014 16:22:52 -0700 Subject: mnt: Move the clear of MNT_LOCKED from copy_tree to it's callers. Clear MNT_LOCKED in the callers of copy_tree except copy_mnt_ns, and collect_mounts. In copy_mnt_ns it is necessary to create an exact copy of a mount tree, so not clearing MNT_LOCKED is important. Similarly collect_mounts is used to take a snapshot of the mount tree for audit logging purposes and auditing using a faithful copy of the tree is important. This becomes particularly significant when we start setting MNT_LOCKED on rootfs to prevent it from being unmounted. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 1 - fs/pnode.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 15d0328bd035..e8d1ffa7f132 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1613,7 +1613,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, if (IS_ERR(q)) return q; - q->mnt.mnt_flags &= ~MNT_LOCKED; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; diff --git a/fs/pnode.c b/fs/pnode.c index aae331a5d03b..260ac8f898a4 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -242,6 +242,7 @@ static int propagate_one(struct mount *m) child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); + child->mnt.mnt_flags &= ~MNT_LOCKED; mnt_set_mountpoint(m, mp, child); last_dest = m; last_source = child; -- cgit v1.2.3 From 381cacb12c009864993a072eedcc0720315aedbd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Oct 2014 17:11:46 -0700 Subject: mnt: Carefully set CL_UNPRIVILEGED in clone_mnt old->mnt_expiry should be ignored unless CL_EXPIRE is set. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index e8d1ffa7f132..f87a90b98da2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -963,7 +963,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } /* Don't allow unprivileged users to reveal what is under a mount */ - if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) + if ((flag & CL_UNPRIVILEGED) && + (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire))) mnt->mnt.mnt_flags |= MNT_LOCKED; atomic_inc(&sb->s_active); -- cgit v1.2.3 From 4fed655c410cc56add64c7b1f7c85c7c56066ac2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 8 Oct 2014 10:42:57 -0700 Subject: mnt: Clear mnt_expire during pivot_root When inspecting the pivot_root and the current mount expiry logic I realized that pivot_root fails to clear like mount move does. Add the missing line in case someone does the interesting feat of moving an expirable submount. This gives a strong guarantee that root of the filesystem tree will never expire. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index f87a90b98da2..fe1c77145a78 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2967,6 +2967,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* mount new_root on / */ attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); + /* A moved mount should not expire automatically */ + list_del_init(&new_mnt->mnt_expire); unlock_mount_hash(); chroot_fs_refs(&root, &new); put_mountpoint(root_mp); -- cgit v1.2.3 From 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Dec 2014 12:27:26 -0600 Subject: userns: Add a knob to disable setgroups on a per user namespace basis - Expose the knob to user space through a proc file /proc//setgroups A value of "deny" means the setgroups system call is disabled in the current processes user namespace and can not be enabled in the future in this user namespace. A value of "allow" means the segtoups system call is enabled. - Descendant user namespaces inherit the value of setgroups from their parents. - A proc file is used (instead of a sysctl) as sysctls currently do not allow checking the permissions at open time. - Writing to the proc file is restricted to before the gid_map for the user namespace is set. This ensures that disabling setgroups at a user namespace level will never remove the ability to call setgroups from a process that already has that ability. A process may opt in to the setgroups disable for itself by creating, entering and configuring a user namespace or by calling setns on an existing user namespace with setgroups disabled. Processes without privileges already can not call setgroups so this is a noop. Prodcess with privilege become processes without privilege when entering a user namespace and as with any other path to dropping privilege they would not have the ability to call setgroups. So this remains within the bounds of what is possible without a knob to disable setgroups permanently in a user namespace. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- fs/proc/base.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 772efa45a452..7dc3ea89ef1a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2464,6 +2464,57 @@ static const struct file_operations proc_projid_map_operations = { .llseek = seq_lseek, .release = proc_id_map_release, }; + +static int proc_setgroups_open(struct inode *inode, struct file *file) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + int ret; + + ret = -ESRCH; + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + if (file->f_mode & FMODE_WRITE) { + ret = -EACCES; + if (!ns_capable(ns, CAP_SYS_ADMIN)) + goto err_put_ns; + } + + ret = single_open(file, &proc_setgroups_show, ns); + if (ret) + goto err_put_ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_setgroups_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + int ret = single_release(inode, file); + put_user_ns(ns); + return ret; +} + +static const struct file_operations proc_setgroups_operations = { + .open = proc_setgroups_open, + .write = proc_setgroups_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_setgroups_release, +}; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, @@ -2572,6 +2623,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), @@ -2913,6 +2965,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif }; -- cgit v1.2.3