From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Mar 2013 01:45:51 -0700
Subject: userns:  Don't allow creation if the user is chrooted

Guarantee that the policy of which files may be access that is
established by setting the root directory will not be violated
by user namespaces by verifying that the root directory points
to the root of the mount namespace at the time of user namespace
creation.

Changing the root is a privileged operation, and as a matter of policy
it serves to limit unprivileged processes to files below the current
root directory.

For reasons of simplicity and comprehensibility the privilege to
change the root directory is gated solely on the CAP_SYS_CHROOT
capability in the user namespace.  Therefore when creating a user
namespace we must ensure that the policy of which files may be access
can not be violated by changing the root directory.

Anyone who runs a processes in a chroot and would like to use user
namespace can setup the same view of filesystems with a mount
namespace instead.  With this result that this is not a practical
limitation for using user namespaces.

Cc: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 50ca17d3cb45..a3035223d421 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt)
 	return check_mnt(real_mount(mnt));
 }
 
+bool current_chrooted(void)
+{
+	/* Does the current process have a non-standard root */
+	struct path ns_root;
+	struct path fs_root;
+	bool chrooted;
+
+	/* Find the namespace root */
+	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
+	ns_root.dentry = ns_root.mnt->mnt_root;
+	path_get(&ns_root);
+	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
+		;
+
+	get_fs_root(current->fs, &fs_root);
+
+	chrooted = !path_equal(&fs_root, &ns_root);
+
+	path_put(&fs_root);
+	path_put(&ns_root);
+
+	return chrooted;
+}
+
 static void *mntns_get(struct task_struct *task)
 {
 	struct mnt_namespace *ns = NULL;
-- 
cgit v1.2.3


From 90563b198e4c6674c63672fae1923da467215f45 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 22 Mar 2013 03:10:15 -0700
Subject: vfs: Add a mount flag to lock read only bind mounts

When a read-only bind mount is copied from mount namespace in a higher
privileged user namespace to a mount namespace in a lesser privileged
user namespace, it should not be possible to remove the the read-only
restriction.

Add a MNT_LOCK_READONLY mount flag to indicate that a mount must
remain read-only.

CC: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index a3035223d421..8505b5ece5de 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1713,6 +1713,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
 	if (readonly_request == __mnt_is_readonly(mnt))
 		return 0;
 
+	if (mnt->mnt_flags & MNT_LOCK_READONLY)
+		return -EPERM;
+
 	if (readonly_request)
 		error = mnt_make_readonly(real_mount(mnt));
 	else
-- 
cgit v1.2.3


From 132c94e31b8bca8ea921f9f96a57d684fa4ae0a9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 22 Mar 2013 04:08:05 -0700
Subject: vfs: Carefully propogate mounts across user namespaces

As a matter of policy MNT_READONLY should not be changable if the
original mounter had more privileges than creator of the mount
namespace.

Add the flag CL_UNPRIVILEGED to note when we are copying a mount from
a mount namespace that requires more privileges to a mount namespace
that requires fewer privileges.

When the CL_UNPRIVILEGED flag is set cause clone_mnt to set MNT_NO_REMOUNT
if any of the mnt flags that should never be changed are set.

This protects both mount propagation and the initial creation of a less
privileged mount namespace.

Cc: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 6 +++++-
 fs/pnode.c     | 6 ++++++
 fs/pnode.h     | 1 +
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 8505b5ece5de..968d4c5eae03 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -798,6 +798,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	}
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
+	/* Don't allow unprivileged users to change mount flags */
+	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
+		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_sb = sb;
 	mnt->mnt.mnt_root = dget(root);
@@ -2342,7 +2346,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	/* First pass: copy the tree topology */
 	copy_flags = CL_COPY_ALL | CL_EXPIRE;
 	if (user_ns != mnt_ns->user_ns)
-		copy_flags |= CL_SHARED_TO_SLAVE;
+		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
 		up_write(&namespace_sem);
diff --git a/fs/pnode.c b/fs/pnode.c
index 3e000a51ac0d..8b29d2164da6 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -9,6 +9,7 @@
 #include <linux/mnt_namespace.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
+#include <linux/nsproxy.h>
 #include "internal.h"
 #include "pnode.h"
 
@@ -220,6 +221,7 @@ static struct mount *get_source(struct mount *dest,
 int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
 		    struct mount *source_mnt, struct list_head *tree_list)
 {
+	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	struct mount *m, *child;
 	int ret = 0;
 	struct mount *prev_dest_mnt = dest_mnt;
@@ -237,6 +239,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
 
 		source =  get_source(m, prev_dest_mnt, prev_src_mnt, &type);
 
+		/* Notice when we are propagating across user namespaces */
+		if (m->mnt_ns->user_ns != user_ns)
+			type |= CL_UNPRIVILEGED;
+
 		child = copy_tree(source, source->mnt.mnt_root, type);
 		if (IS_ERR(child)) {
 			ret = PTR_ERR(child);
diff --git a/fs/pnode.h b/fs/pnode.h
index 19b853a3445c..a0493d5ebfbf 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -23,6 +23,7 @@
 #define CL_MAKE_SHARED 		0x08
 #define CL_PRIVATE 		0x10
 #define CL_SHARED_TO_SLAVE	0x20
+#define CL_UNPRIVILEGED		0x40
 
 static inline void set_mnt_shared(struct mount *mnt)
 {
-- 
cgit v1.2.3


From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 24 Mar 2013 14:28:27 -0700
Subject: userns: Restrict when proc and sysfs can be mounted

Only allow unprivileged mounts of proc and sysfs if they are already
mounted when the user namespace is created.

proc and sysfs are interesting because they have content that is
per namespace, and so fresh mounts are needed when new namespaces
are created while at the same time proc and sysfs have content that
is shared between every instance.

Respect the policy of who may see the shared content of proc and sysfs
by only allowing new mounts if there was an existing mount at the time
the user namespace was created.

In practice there are only two interesting cases: proc and sysfs are
mounted at their usual places, proc and sysfs are not mounted at all
(some form of mount namespace jail).

Cc: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c   | 21 +++++++++++++++++++++
 fs/proc/root.c   |  4 ++++
 fs/sysfs/mount.c |  4 ++++
 3 files changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 968d4c5eae03..d581e45c0a9f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2763,6 +2763,27 @@ bool current_chrooted(void)
 	return chrooted;
 }
 
+void update_mnt_policy(struct user_namespace *userns)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct mount *mnt;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		switch (mnt->mnt.mnt_sb->s_magic) {
+		case SYSFS_MAGIC:
+			userns->may_mount_sysfs = true;
+			break;
+		case PROC_SUPER_MAGIC:
+			userns->may_mount_proc = true;
+			break;
+		}
+		if (userns->may_mount_sysfs && userns->may_mount_proc)
+			break;
+	}
+	up_read(&namespace_sem);
+}
+
 static void *mntns_get(struct task_struct *task)
 {
 	struct mnt_namespace *ns = NULL;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c6e9fac26bac..9c7fab1d23f0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
+#include <linux/user_namespace.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 #include <linux/parser.h>
@@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	} else {
 		ns = task_active_pid_ns(current);
 		options = data;
+
+		if (!current_user_ns()->may_mount_proc)
+			return ERR_PTR(-EPERM);
 	}
 
 	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 8d924b5ec733..afd83273e6ce 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/user_namespace.h>
 
 #include "sysfs.h"
 
@@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	struct super_block *sb;
 	int error;
 
+	if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs)
+		return ERR_PTR(-EPERM);
+
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3