From 386bc35a2d548c28a5083b2e162a20251b37cab5 Mon Sep 17 00:00:00 2001
From: Anna Leuschner <anna.m.leuschner@gmail.com>
Date: Mon, 22 Oct 2012 21:53:36 +0200
Subject: vfs: fix: don't increase bio_slab_max if krealloc() fails

Without the patch, bio_slab_max, representing bio_slabs capacity, is increased before krealloc() of bio_slabs. If krealloc() fails, bio_slab_max is too high. Fix that by only updating bio_slab_max if krealloc() is successful.

Signed-off-by: Anna Leuschner <anna.m.leuschner@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/bio.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 9298c65ad9c7..b96fc6ce4855 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -75,6 +75,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 	unsigned int sz = sizeof(struct bio) + extra_size;
 	struct kmem_cache *slab = NULL;
 	struct bio_slab *bslab, *new_bio_slabs;
+	unsigned int new_bio_slab_max;
 	unsigned int i, entry = -1;
 
 	mutex_lock(&bio_slab_lock);
@@ -97,12 +98,13 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 		goto out_unlock;
 
 	if (bio_slab_nr == bio_slab_max && entry == -1) {
-		bio_slab_max <<= 1;
+		new_bio_slab_max = bio_slab_max << 1;
 		new_bio_slabs = krealloc(bio_slabs,
-					 bio_slab_max * sizeof(struct bio_slab),
+					 new_bio_slab_max * sizeof(struct bio_slab),
 					 GFP_KERNEL);
 		if (!new_bio_slabs)
 			goto out_unlock;
+		bio_slab_max = new_bio_slab_max;
 		bio_slabs = new_bio_slabs;
 	}
 	if (entry == -1)
-- 
cgit v1.2.3


From 57911b8ba814fae01306376a0d02bc7cdc88dc94 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 19 Oct 2012 09:22:03 +0200
Subject: Btrfs: don't put removals from push_node_left into tree mod log twice

Independant of the check (push_items < src_items) tree_mod_log_eb_copy did
log the removal of the old data entries from the source buffer. Therefore,
we must not call tree_mod_log_eb_move if the check evaluates to true, as
that would log the removal twice, finally resulting in (rewinded) buffers
with wrong values for header_nritems.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b33436211000..44a7e25353a6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1225,6 +1225,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 	free_extent_buffer(eb);
 
 	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
+	WARN_ON(btrfs_header_nritems(eb_rewin) >
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
 
 	return eb_rewin;
 }
@@ -1280,6 +1282,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	else
 		WARN_ON(btrfs_header_level(eb) != 0);
 	extent_buffer_get(eb);
+	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
 
 	return eb;
 }
@@ -2970,8 +2973,10 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 			   push_items * sizeof(struct btrfs_key_ptr));
 
 	if (push_items < src_nritems) {
-		tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
-				     src_nritems - push_items);
+		/*
+		 * don't call tree_mod_log_eb_move here, key removal was already
+		 * fully logged by tree_mod_log_eb_copy above.
+		 */
 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
 				      btrfs_node_key_ptr_offset(push_items),
 				      (src_nritems - push_items) *
-- 
cgit v1.2.3


From ba1bfbd592c1adddd5d0005f587a6e308e25c949 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 22 Oct 2012 20:02:56 +0200
Subject: Btrfs: fix a tree mod logging issue for root replacement operations

Avoid the implicit free by tree_mod_log_set_root_pointer, which is wrong in
two places. Where needed, we call tree_mod_log_free_eb explicitly now.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 44a7e25353a6..e6b75ccd2850 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -647,8 +647,6 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	if (tree_mod_dont_log(fs_info, NULL))
 		return 0;
 
-	__tree_mod_log_free_eb(fs_info, old_root);
-
 	ret = tree_mod_alloc(fs_info, flags, &tm);
 	if (ret < 0)
 		goto out;
@@ -926,12 +924,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
 			BUG_ON(ret); /* -ENOMEM */
 		}
-		/*
-		 * don't log freeing in case we're freeing the root node, this
-		 * is done by tree_mod_log_set_root_pointer later
-		 */
-		if (buf != root->node && btrfs_header_level(buf) != 0)
-			tree_mod_log_free_eb(root->fs_info, buf);
+		tree_mod_log_free_eb(root->fs_info, buf);
 		clean_tree_block(trans, root, buf);
 		*last_ref = 1;
 	}
@@ -1728,6 +1721,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			goto enospc;
 		}
 
+		tree_mod_log_free_eb(root->fs_info, root->node);
 		tree_mod_log_set_root_pointer(root, child);
 		rcu_assign_pointer(root->node, child);
 
-- 
cgit v1.2.3


From 834328a8493079d15f30866ace42489463f52571 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Tue, 23 Oct 2012 11:27:33 +0200
Subject: Btrfs: tree mod log's old roots could still be part of the tree

Tree mod log treated old root buffers as always empty buffers when starting
the rewind operations. However, the old root may still be part of the
current tree at a lower level, with still some valid entries.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e6b75ccd2850..d9308c38a8f2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1239,6 +1239,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	struct tree_mod_root *old_root = NULL;
 	u64 old_generation = 0;
 	u64 logical;
+	u32 blocksize;
 
 	eb = btrfs_read_lock_root_node(root);
 	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
@@ -1254,12 +1255,28 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	}
 
 	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
-	if (old_root)
+	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+		btrfs_tree_read_unlock(root->node);
+		free_extent_buffer(root->node);
+		blocksize = btrfs_level_size(root, old_root->level);
+		eb = read_tree_block(root, logical, blocksize, 0);
+		if (!eb) {
+			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
+				logical);
+			WARN_ON(1);
+		} else {
+			eb = btrfs_clone_extent_buffer(eb);
+		}
+	} else if (old_root) {
+		btrfs_tree_read_unlock(root->node);
+		free_extent_buffer(root->node);
 		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
-	else
+	} else {
 		eb = btrfs_clone_extent_buffer(root->node);
-	btrfs_tree_read_unlock(root->node);
-	free_extent_buffer(root->node);
+		btrfs_tree_read_unlock(root->node);
+		free_extent_buffer(root->node);
+	}
+
 	if (!eb)
 		return NULL;
 	btrfs_tree_read_lock(eb);
-- 
cgit v1.2.3


From 5b6602e762cae17c8891d19698afea451e9c1d95 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Tue, 23 Oct 2012 11:28:27 +0200
Subject: Btrfs: determine level of old roots

In btrfs_find_all_roots' termination condition, we compare the level of the
old buffer we got from btrfs_search_old_slot to the level of the current
root node. We'd better compare it to the level of the rewinded root node.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/backref.c |  4 +---
 fs/btrfs/ctree.c   | 17 +++++++++++++++++
 fs/btrfs/ctree.h   |  1 +
 3 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f3187938e081..65608fbf2232 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -283,9 +283,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 		goto out;
 	}
 
-	rcu_read_lock();
-	root_level = btrfs_header_level(root->node);
-	rcu_read_unlock();
+	root_level = btrfs_old_root_level(root, time_seq);
 
 	if (root_level + 1 == level)
 		goto out;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d9308c38a8f2..ef68c33eefd9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1297,6 +1297,23 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	return eb;
 }
 
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+	struct tree_mod_elem *tm;
+	int level;
+
+	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
+		level = tm->old_root.level;
+	} else {
+		rcu_read_lock();
+		level = btrfs_header_level(root->node);
+		rcu_read_unlock();
+	}
+
+	return level;
+}
+
 static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1630be831210..34c5a442dd33 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3120,6 +3120,7 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 {
 	return atomic_inc_return(&fs_info->tree_mod_seq);
 }
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
-- 
cgit v1.2.3


From d638108484d186bf97a838d037f165d9b404e6ee Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Tue, 23 Oct 2012 14:21:05 +0200
Subject: Btrfs: fix extent buffer reference for tree mod log roots

In get_old_root we grab a lock on the extent buffer before we obtain a
reference on that buffer. That order is changed now.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ef68c33eefd9..f6739903d072 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1279,6 +1279,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 
 	if (!eb)
 		return NULL;
+	extent_buffer_get(eb);
 	btrfs_tree_read_lock(eb);
 	if (old_root) {
 		btrfs_set_header_bytenr(eb, eb->start);
@@ -1291,7 +1292,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		__tree_mod_log_rewind(eb, time_seq, tm);
 	else
 		WARN_ON(btrfs_header_level(eb) != 0);
-	extent_buffer_get(eb);
 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
 
 	return eb;
-- 
cgit v1.2.3


From 01763a2e37425ae3f37a3dc051e0703fdade5956 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Tue, 23 Oct 2012 15:02:12 +0200
Subject: Btrfs: comment for loop in tree_mod_log_insert_move

Emphasis the way tree_mod_log_insert_move avoids adding
MOD_LOG_KEY_REMOVE_WHILE_MOVING operations, depending on the direction of
the move operation.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ctree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f6739903d072..eba44b076829 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -596,6 +596,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	if (tree_mod_dont_log(fs_info, eb))
 		return 0;
 
+	/*
+	 * When we override something during the move, we log these removals.
+	 * This can only happen when we move towards the beginning of the
+	 * buffer, i.e. dst_slot < src_slot.
+	 */
 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
 					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);
-- 
cgit v1.2.3


From a4ee8d978e47e79d536226dccb48991f70091168 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 23 Oct 2012 13:51:58 -0400
Subject: LOCKD: fix races in nsm_client_get

Commit e9406db20fecbfcab646bad157b4cfdc7cadddfb (lockd: per-net
NSM client creation and destruction helpers introduced) contains
a nasty race on initialisation of the per-net NSM client because
it doesn't check whether or not the client is set after grabbing
the nsm_create_mutex.

Reported-by: Nix <nix@esperi.org.uk>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/lockd/mon.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4fb3ba5a58a..fe695603e395 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -85,29 +85,38 @@ static struct rpc_clnt *nsm_create(struct net *net)
 	return rpc_create(&args);
 }
 
+static struct rpc_clnt *nsm_client_set(struct lockd_net *ln,
+		struct rpc_clnt *clnt)
+{
+	spin_lock(&ln->nsm_clnt_lock);
+	if (ln->nsm_users == 0) {
+		if (clnt == NULL)
+			goto out;
+		ln->nsm_clnt = clnt;
+	}
+	clnt = ln->nsm_clnt;
+	ln->nsm_users++;
+out:
+	spin_unlock(&ln->nsm_clnt_lock);
+	return clnt;
+}
+
 static struct rpc_clnt *nsm_client_get(struct net *net)
 {
-	static DEFINE_MUTEX(nsm_create_mutex);
-	struct rpc_clnt	*clnt;
+	struct rpc_clnt	*clnt, *new;
 	struct lockd_net *ln = net_generic(net, lockd_net_id);
 
-	spin_lock(&ln->nsm_clnt_lock);
-	if (ln->nsm_users) {
-		ln->nsm_users++;
-		clnt = ln->nsm_clnt;
-		spin_unlock(&ln->nsm_clnt_lock);
+	clnt = nsm_client_set(ln, NULL);
+	if (clnt != NULL)
 		goto out;
-	}
-	spin_unlock(&ln->nsm_clnt_lock);
 
-	mutex_lock(&nsm_create_mutex);
-	clnt = nsm_create(net);
-	if (!IS_ERR(clnt)) {
-		ln->nsm_clnt = clnt;
-		smp_wmb();
-		ln->nsm_users = 1;
-	}
-	mutex_unlock(&nsm_create_mutex);
+	clnt = new = nsm_create(net);
+	if (IS_ERR(clnt))
+		goto out;
+
+	clnt = nsm_client_set(ln, new);
+	if (clnt != new)
+		rpc_shutdown_client(new);
 out:
 	return clnt;
 }
-- 
cgit v1.2.3


From e498daa81295d02f7359af313c2b7f87e1062207 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 24 Oct 2012 08:53:35 -0400
Subject: LOCKD: Clear ln->nsm_clnt only when ln->nsm_users is zero

The current code is clearing it in all cases _except_ when zero.

Reported-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/lockd/mon.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fe695603e395..3d7e09bcc0e9 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -124,18 +124,16 @@ out:
 static void nsm_client_put(struct net *net)
 {
 	struct lockd_net *ln = net_generic(net, lockd_net_id);
-	struct rpc_clnt	*clnt = ln->nsm_clnt;
-	int shutdown = 0;
+	struct rpc_clnt	*clnt = NULL;
 
 	spin_lock(&ln->nsm_clnt_lock);
-	if (ln->nsm_users) {
-		if (--ln->nsm_users)
-			ln->nsm_clnt = NULL;
-		shutdown = !ln->nsm_users;
+	ln->nsm_users--;
+	if (ln->nsm_users == 0) {
+		clnt = ln->nsm_clnt;
+		ln->nsm_clnt = NULL;
 	}
 	spin_unlock(&ln->nsm_clnt_lock);
-
-	if (shutdown)
+	if (clnt != NULL)
 		rpc_shutdown_client(clnt);
 }
 
-- 
cgit v1.2.3


From 66081a72517a131430dcf986775f3268aafcb546 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sat, 29 Sep 2012 22:23:19 +0200
Subject: sysfs: sysfs_pathname/sysfs_add_one: Use strlcat() instead of
 strcat()

The warning check for duplicate sysfs entries can cause a buffer overflow
when printing the warning, as strcat() doesn't check buffer sizes.
Use strlcat() instead.

Since strlcat() doesn't return a pointer to the passed buffer, unlike
strcat(), I had to convert the nested concatenation in sysfs_add_one() to
an admittedly more obscure comma operator construct, to avoid emitting code
for the concatenation if CONFIG_BUG is disabled.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 6b0bb00d4d2b..2fbdff6be25c 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -485,20 +485,18 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 /**
  *	sysfs_pathname - return full path to sysfs dirent
  *	@sd: sysfs_dirent whose path we want
- *	@path: caller allocated buffer
+ *	@path: caller allocated buffer of size PATH_MAX
  *
  *	Gives the name "/" to the sysfs_root entry; any path returned
  *	is relative to wherever sysfs is mounted.
- *
- *	XXX: does no error checking on @path size
  */
 static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
 {
 	if (sd->s_parent) {
 		sysfs_pathname(sd->s_parent, path);
-		strcat(path, "/");
+		strlcat(path, "/", PATH_MAX);
 	}
-	strcat(path, sd->s_name);
+	strlcat(path, sd->s_name, PATH_MAX);
 	return path;
 }
 
@@ -531,9 +529,11 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 		char *path = kzalloc(PATH_MAX, GFP_KERNEL);
 		WARN(1, KERN_WARNING
 		     "sysfs: cannot create duplicate filename '%s'\n",
-		     (path == NULL) ? sd->s_name :
-		     strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"),
-		            sd->s_name));
+		     (path == NULL) ? sd->s_name
+				    : (sysfs_pathname(acxt->parent_sd, path),
+				       strlcat(path, "/", PATH_MAX),
+				       strlcat(path, sd->s_name, PATH_MAX),
+				       path));
 		kfree(path);
 	}
 
-- 
cgit v1.2.3


From 661bec6ba884b86517ef5ea529aabb281a7198d9 Mon Sep 17 00:00:00 2001
From: Gabriel de Perthuis <g2p.code@gmail.com>
Date: Wed, 10 Oct 2012 08:50:47 -0600
Subject: Fix a sign bug causing invalid memory access in the ino_paths ioctl.

To see the problem, create many hardlinks to the same file (120 should do it),
then look up paths by inode with:

  ls -i
  btrfs inspect inode-resolve -v $ino /mnt/btrfs

I noticed the memory layout of the fspath->val data had some irregularities
(some unnecessary gaps that stop appearing about halfway),
so I'm not sure there aren't any bugs left in it.
---
 fs/btrfs/backref.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f3187938e081..2bcbea3f6308 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1186,7 +1186,7 @@ static char *ref_to_path(struct btrfs_root *fs_root,
 	int slot;
 	u64 next_inum;
 	int ret;
-	s64 bytes_left = size - 1;
+	s64 bytes_left = ((s64)size) - 1;
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
 	int leave_spinning = path->leave_spinning;
-- 
cgit v1.2.3


From 84167d190569eedcdb24bf2499bdda437e442962 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Thu, 11 Oct 2012 07:25:16 -0600
Subject: Btrfs: Fix wrong error handling code

gcc says "warning: comparison of unsigned expression >= 0 is always
true" because i is an unsigned long. And gcc is right this time.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/extent_io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 64dc93f64bc0..a32ebfeb91cf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4104,8 +4104,8 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
 
 	return eb;
 err:
-	for (i--; i >= 0; i--)
-		__free_page(eb->pages[i]);
+	for (; i > 0; i--)
+		__free_page(eb->pages[i - 1]);
 	__free_extent_buffer(eb);
 	return NULL;
 }
-- 
cgit v1.2.3


From 96b5bd777118bb673b458b41bbefc7f0f31d65c9 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 15 Oct 2012 08:30:45 +0000
Subject: Btrfs: extended inode refs support for send mechanism

This adds support for the new extended inode refs to btrfs send.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/backref.c |  22 +++++-----
 fs/btrfs/backref.h |   4 ++
 fs/btrfs/send.c    | 126 ++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 94 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2bcbea3f6308..b8b69266393a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1177,11 +1177,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	return ret;
 }
 
-static char *ref_to_path(struct btrfs_root *fs_root,
-			 struct btrfs_path *path,
-			 u32 name_len, unsigned long name_off,
-			 struct extent_buffer *eb_in, u64 parent,
-			 char *dest, u32 size)
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			u32 name_len, unsigned long name_off,
+			struct extent_buffer *eb_in, u64 parent,
+			char *dest, u32 size)
 {
 	int slot;
 	u64 next_inum;
@@ -1266,10 +1265,10 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root,
 			 struct extent_buffer *eb_in, u64 parent,
 			 char *dest, u32 size)
 {
-	return ref_to_path(fs_root, path,
-			   btrfs_inode_ref_name_len(eb_in, iref),
-			   (unsigned long)(iref + 1),
-			   eb_in, parent, dest, size);
+	return btrfs_ref_to_path(fs_root, path,
+				 btrfs_inode_ref_name_len(eb_in, iref),
+				 (unsigned long)(iref + 1),
+				 eb_in, parent, dest, size);
 }
 
 /*
@@ -1715,9 +1714,8 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
 					ipath->fspath->bytes_left - s_ptr : 0;
 
 	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-	fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
-			     name_off, eb, inum, fspath_min,
-			     bytes_left);
+	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+				   name_off, eb, inum, fspath_min, bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index e75533043a5f..d61feca79455 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -62,6 +62,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			 struct btrfs_inode_ref *iref, struct extent_buffer *eb,
 			 u64 parent, char *dest, u32 size);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			u32 name_len, unsigned long name_off,
+			struct extent_buffer *eb_in, u64 parent,
+			char *dest, u32 size);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c7beb543a4a8..4f20c49edd58 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -745,31 +745,36 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
 				   void *ctx);
 
 /*
- * Helper function to iterate the entries in ONE btrfs_inode_ref.
+ * Helper function to iterate the entries in ONE btrfs_inode_ref or
+ * btrfs_inode_extref.
  * The iterate callback may return a non zero value to stop iteration. This can
  * be a negative value for error codes or 1 to simply stop it.
  *
- * path must point to the INODE_REF when called.
+ * path must point to the INODE_REF or INODE_EXTREF when called.
  */
 static int iterate_inode_ref(struct send_ctx *sctx,
 			     struct btrfs_root *root, struct btrfs_path *path,
 			     struct btrfs_key *found_key, int resolve,
 			     iterate_inode_ref_t iterate, void *ctx)
 {
-	struct extent_buffer *eb;
+	struct extent_buffer *eb = path->nodes[0];
 	struct btrfs_item *item;
 	struct btrfs_inode_ref *iref;
+	struct btrfs_inode_extref *extref;
 	struct btrfs_path *tmp_path;
 	struct fs_path *p;
-	u32 cur;
-	u32 len;
+	u32 cur = 0;
 	u32 total;
-	int slot;
+	int slot = path->slots[0];
 	u32 name_len;
 	char *start;
 	int ret = 0;
-	int num;
+	int num = 0;
 	int index;
+	u64 dir;
+	unsigned long name_off;
+	unsigned long elem_size;
+	unsigned long ptr;
 
 	p = fs_path_alloc_reversed(sctx);
 	if (!p)
@@ -781,24 +786,40 @@ static int iterate_inode_ref(struct send_ctx *sctx,
 		return -ENOMEM;
 	}
 
-	eb = path->nodes[0];
-	slot = path->slots[0];
-	item = btrfs_item_nr(eb, slot);
-	iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
-	cur = 0;
-	len = 0;
-	total = btrfs_item_size(eb, item);
 
-	num = 0;
+	if (found_key->type == BTRFS_INODE_REF_KEY) {
+		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
+						    struct btrfs_inode_ref);
+		item = btrfs_item_nr(eb, slot);
+		total = btrfs_item_size(eb, item);
+		elem_size = sizeof(*iref);
+	} else {
+		ptr = btrfs_item_ptr_offset(eb, slot);
+		total = btrfs_item_size_nr(eb, slot);
+		elem_size = sizeof(*extref);
+	}
+
 	while (cur < total) {
 		fs_path_reset(p);
 
-		name_len = btrfs_inode_ref_name_len(eb, iref);
-		index = btrfs_inode_ref_index(eb, iref);
+		if (found_key->type == BTRFS_INODE_REF_KEY) {
+			iref = (struct btrfs_inode_ref *)(ptr + cur);
+			name_len = btrfs_inode_ref_name_len(eb, iref);
+			name_off = (unsigned long)(iref + 1);
+			index = btrfs_inode_ref_index(eb, iref);
+			dir = found_key->offset;
+		} else {
+			extref = (struct btrfs_inode_extref *)(ptr + cur);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			name_off = (unsigned long)&extref->name;
+			index = btrfs_inode_extref_index(eb, extref);
+			dir = btrfs_inode_extref_parent(eb, extref);
+		}
+
 		if (resolve) {
-			start = btrfs_iref_to_path(root, tmp_path, iref, eb,
-						found_key->offset, p->buf,
-						p->buf_len);
+			start = btrfs_ref_to_path(root, tmp_path, name_len,
+						  name_off, eb, dir,
+						  p->buf, p->buf_len);
 			if (IS_ERR(start)) {
 				ret = PTR_ERR(start);
 				goto out;
@@ -809,9 +830,10 @@ static int iterate_inode_ref(struct send_ctx *sctx,
 						p->buf_len + p->buf - start);
 				if (ret < 0)
 					goto out;
-				start = btrfs_iref_to_path(root, tmp_path, iref,
-						eb, found_key->offset, p->buf,
-						p->buf_len);
+				start = btrfs_ref_to_path(root, tmp_path,
+							  name_len, name_off,
+							  eb, dir,
+							  p->buf, p->buf_len);
 				if (IS_ERR(start)) {
 					ret = PTR_ERR(start);
 					goto out;
@@ -820,21 +842,16 @@ static int iterate_inode_ref(struct send_ctx *sctx,
 			}
 			p->start = start;
 		} else {
-			ret = fs_path_add_from_extent_buffer(p, eb,
-					(unsigned long)(iref + 1), name_len);
+			ret = fs_path_add_from_extent_buffer(p, eb, name_off,
+							     name_len);
 			if (ret < 0)
 				goto out;
 		}
 
-
-		len = sizeof(*iref) + name_len;
-		iref = (struct btrfs_inode_ref *)((char *)iref + len);
-		cur += len;
-
-		ret = iterate(num, found_key->offset, index, p, ctx);
+		cur += elem_size + name_len;
+		ret = iterate(num, dir, index, p, ctx);
 		if (ret)
 			goto out;
-
 		num++;
 	}
 
@@ -998,7 +1015,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
 	}
 	btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
 	if (found_key.objectid != ino ||
-		found_key.type != BTRFS_INODE_REF_KEY) {
+	    (found_key.type != BTRFS_INODE_REF_KEY &&
+	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -1551,8 +1569,8 @@ static int get_first_ref(struct send_ctx *sctx,
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_path *path;
-	struct btrfs_inode_ref *iref;
 	int len;
+	u64 parent_dir;
 
 	path = alloc_path_for_send();
 	if (!path)
@@ -1568,27 +1586,41 @@ static int get_first_ref(struct send_ctx *sctx,
 	if (!ret)
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				path->slots[0]);
-	if (ret || found_key.objectid != key.objectid ||
-	    found_key.type != key.type) {
+	if (ret || found_key.objectid != ino ||
+	    (found_key.type != BTRFS_INODE_REF_KEY &&
+	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
 		ret = -ENOENT;
 		goto out;
 	}
 
-	iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
-			struct btrfs_inode_ref);
-	len = btrfs_inode_ref_name_len(path->nodes[0], iref);
-	ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
-			(unsigned long)(iref + 1), len);
+	if (key.type == BTRFS_INODE_REF_KEY) {
+		struct btrfs_inode_ref *iref;
+		iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				      struct btrfs_inode_ref);
+		len = btrfs_inode_ref_name_len(path->nodes[0], iref);
+		ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+						     (unsigned long)(iref + 1),
+						     len);
+		parent_dir = found_key.offset;
+	} else {
+		struct btrfs_inode_extref *extref;
+		extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					struct btrfs_inode_extref);
+		len = btrfs_inode_extref_name_len(path->nodes[0], extref);
+		ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+					(unsigned long)&extref->name, len);
+		parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
+	}
 	if (ret < 0)
 		goto out;
 	btrfs_release_path(path);
 
-	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
+	ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL,
 			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
-	*dir = found_key.offset;
+	*dir = parent_dir;
 
 out:
 	btrfs_free_path(path);
@@ -3226,7 +3258,8 @@ static int process_all_refs(struct send_ctx *sctx,
 		btrfs_item_key_to_cpu(eb, &found_key, slot);
 
 		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type)
+		    (found_key.type != BTRFS_INODE_REF_KEY &&
+		     found_key.type != BTRFS_INODE_EXTREF_KEY))
 			break;
 
 		ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
@@ -3987,7 +4020,7 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
 	if (sctx->cur_ino == 0)
 		goto out;
 	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
-	    sctx->cmp_key->type <= BTRFS_INODE_REF_KEY)
+	    sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
 		goto out;
 	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
 		goto out;
@@ -4335,7 +4368,8 @@ static int changed_cb(struct btrfs_root *left_root,
 
 	if (key->type == BTRFS_INODE_ITEM_KEY)
 		ret = changed_inode(sctx, result);
-	else if (key->type == BTRFS_INODE_REF_KEY)
+	else if (key->type == BTRFS_INODE_REF_KEY ||
+		 key->type == BTRFS_INODE_EXTREF_KEY)
 		ret = changed_ref(sctx, result);
 	else if (key->type == BTRFS_XATTR_ITEM_KEY)
 		ret = changed_xattr(sctx, result);
-- 
cgit v1.2.3


From d79e50433b2bea09eb680ed5fae15e8a12356353 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 15 Oct 2012 18:28:46 +0000
Subject: Btrfs: send correct rdev and mode in btrfs-send

When sending a device file, the stream was missing the mode. Also the
rdev was encoded wrongly.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/send.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4f20c49edd58..97cc5399306b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2462,7 +2462,8 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
 	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
 		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
-		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
 	}
 
 	ret = send_cmd(sctx);
-- 
cgit v1.2.3


From 5b7ff5b3c4468780b15c6b20cd0567cd9f2aa626 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Tue, 16 Oct 2012 05:44:21 +0000
Subject: Btrfs: fix memory leak in btrfs_quota_enable()

We should free quota_root before returning from the error
handling code.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/qgroup.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5039686df6ae..fe9d02c45f8e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -790,8 +790,10 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 	}
 
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		ret = -ENOMEM;
+		goto out_free_root;
+	}
 
 	key.objectid = 0;
 	key.type = BTRFS_QGROUP_STATUS_KEY;
@@ -800,7 +802,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
 				      sizeof(*ptr));
 	if (ret)
-		goto out;
+		goto out_free_path;
 
 	leaf = path->nodes[0];
 	ptr = btrfs_item_ptr(leaf, path->slots[0],
@@ -818,8 +820,15 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 	fs_info->quota_root = quota_root;
 	fs_info->pending_quota_state = 1;
 	spin_unlock(&fs_info->qgroup_lock);
-out:
+out_free_path:
 	btrfs_free_path(path);
+out_free_root:
+	if (ret) {
+		free_extent_buffer(quota_root->node);
+		free_extent_buffer(quota_root->commit_root);
+		kfree(quota_root);
+	}
+out:
 	return ret;
 }
 
-- 
cgit v1.2.3


From e515c18bfef718a7900924d50198d968565dd60e Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 16 Oct 2012 09:34:36 +0000
Subject: btrfs: Return EINVAL when length to trim is less than FSB

Currently if len argument in btrfs_ioctl_fitrim() is smaller than
one FSB we will continue and finally return 0 bytes discarded.
However if the length to discard is smaller then file system block
we should really return EINVAL.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/btrfs/ioctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f5a2e6c4320a..da518ded34bd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -343,7 +343,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 		return -EOPNOTSUPP;
 	if (copy_from_user(&range, arg, sizeof(range)))
 		return -EFAULT;
-	if (range.start > total_bytes)
+	if (range.start > total_bytes ||
+	    range.len < fs_info->sb->s_blocksize)
 		return -EINVAL;
 
 	range.len = min(range.len, total_bytes - range.start);
-- 
cgit v1.2.3


From 671415b7db49f62896f0b6d50fc4f312a0512983 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 16 Oct 2012 11:26:46 +0000
Subject: Btrfs: fix deadlock caused by the nested chunk allocation

Steps to reproduce:
 # mkfs.btrfs -m raid1 <disk1> <disk2>
 # btrfstune -S 1 <disk1>
 # mount <disk1> <mnt>
 # btrfs device add <disk3> <disk4> <mnt>
 # mount -o remount,rw <mnt>
 # dd if=/dev/zero of=<mnt>/tmpfile bs=1M count=1
 Deadlock happened.

It is because of the nested chunk allocation. When we wrote the data
into the filesystem, we would allocate the data chunk because there was
no data chunk in the filesystem. At the end of the data chunk allocation,
we should insert the metadata of the data chunk into the extent tree, but
there was no raid1 chunk, so we tried to lock the chunk allocation mutex to
allocate the new chunk, but we had held the mutex, the deadlock happened.

By rights, we would allocate the raid1 chunk when we added the second device
because the profile of the seed filesystem is raid1 and we had two devices.
But we didn't do that in fact. It is because the last step of the first device
insertion didn't commit the transaction. So when we added the second device,
we didn't cow the tree, and just inserted the relative metadata into the leaves
which were generated by the first device insertion, and its profile was dup.

So, I fix this problem by commiting the transaction at the end of the first
device insertion.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/volumes.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 029b903a4ae3..0f5ebb72a5ea 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1819,6 +1819,13 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 				    "Failed to relocate sys chunks after "
 				    "device initialization. This can be fixed "
 				    "using the \"btrfs balance\" command.");
+		trans = btrfs_attach_transaction(root);
+		if (IS_ERR(trans)) {
+			if (PTR_ERR(trans) == -ENOENT)
+				return 0;
+			return PTR_ERR(trans);
+		}
+		ret = btrfs_commit_transaction(trans, root);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From e2d044fe77f8e845333bb1bd29587dc08a4346a0 Mon Sep 17 00:00:00 2001
From: Alex Lyakas <alex.btrfs@zadarastorage.com>
Date: Wed, 17 Oct 2012 13:52:47 +0000
Subject: Btrfs: Send: preserve ownership (uid and gid) also for symlinks.

This patch also requires a change in the user-space part of "receive".
We need to use "lchown" instead of "chown". We will do this in the
following patch.

Signed-off-by: Alex Lyakas <alex.btrfs@zadarastorage.com>

 	if (S_ISREG(sctx->cur_inode_mode)) {
---
 fs/btrfs/send.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 97cc5399306b..e78b297b0b00 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4067,22 +4067,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 	if (ret < 0)
 		goto out;
 
-	if (!S_ISLNK(sctx->cur_inode_mode)) {
-		if (!sctx->parent_root || sctx->cur_inode_new) {
+	if (!sctx->parent_root || sctx->cur_inode_new) {
+		need_chown = 1;
+		if (!S_ISLNK(sctx->cur_inode_mode))
 			need_chmod = 1;
-			need_chown = 1;
-		} else {
-			ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
-					NULL, NULL, &right_mode, &right_uid,
-					&right_gid, NULL);
-			if (ret < 0)
-				goto out;
+	} else {
+		ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
+				NULL, NULL, &right_mode, &right_uid,
+				&right_gid, NULL);
+		if (ret < 0)
+			goto out;
 
-			if (left_uid != right_uid || left_gid != right_gid)
-				need_chown = 1;
-			if (left_mode != right_mode)
-				need_chmod = 1;
-		}
+		if (left_uid != right_uid || left_gid != right_gid)
+			need_chown = 1;
+		if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
+			need_chmod = 1;
 	}
 
 	if (S_ISREG(sctx->cur_inode_mode)) {
-- 
cgit v1.2.3


From be6aef604920406b348acf3be6e6e8db55696386 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 22 Oct 2012 15:43:12 -0400
Subject: Btrfs: Use btrfs_update_inode_fallback when creating a snapshot

On a really full file system I was getting ENOSPC back from
btrfs_update_inode when trying to update the parent inode when creating a
snapshot.  Just use the fallback method so we can update the inode and not
have to worry about having a delayed ref.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ctree.h       | 2 ++
 fs/btrfs/inode.c       | 7 +++----
 fs/btrfs/transaction.c | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1630be831210..8a92ab1632a2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3338,6 +3338,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode);
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f92def2467a6..5fc09908f20f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -94,8 +94,6 @@ static noinline int cow_file_range(struct inode *inode,
 				   struct page *locked_page,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
-static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode);
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 				     struct inode *inode,  struct inode *dir,
@@ -2746,8 +2744,9 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	return btrfs_update_inode_item(trans, root, inode);
 }
 
-static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode)
+noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct inode *inode)
 {
 	int ret;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 77db875b5116..04bbfb1052eb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1200,7 +1200,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
 					 dentry->d_name.len * 2);
 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-	ret = btrfs_update_inode(trans, parent_root, parent_inode);
+	ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
 fail:
-- 
cgit v1.2.3


From 7bfdcf7fbad56c0f1fc3e2d26431bed72bdcce2d Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 25 Oct 2012 07:30:19 -0600
Subject: Btrfs: fix memory leak when cloning root's node

After cloning root's node, we forgot to dec the src's ref
which can lead to a memory leak.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eba44b076829..cdfb4c49a806 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1241,6 +1241,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 {
 	struct tree_mod_elem *tm;
 	struct extent_buffer *eb;
+	struct extent_buffer *old;
 	struct tree_mod_root *old_root = NULL;
 	u64 old_generation = 0;
 	u64 logical;
@@ -1264,13 +1265,14 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		btrfs_tree_read_unlock(root->node);
 		free_extent_buffer(root->node);
 		blocksize = btrfs_level_size(root, old_root->level);
-		eb = read_tree_block(root, logical, blocksize, 0);
-		if (!eb) {
+		old = read_tree_block(root, logical, blocksize, 0);
+		if (!old) {
 			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
 				logical);
 			WARN_ON(1);
 		} else {
-			eb = btrfs_clone_extent_buffer(eb);
+			eb = btrfs_clone_extent_buffer(old);
+			free_extent_buffer(old);
 		}
 	} else if (old_root) {
 		btrfs_tree_read_unlock(root->node);
-- 
cgit v1.2.3


From c37b2b6269ee4637fb7cdb5da0d1e47215d57ce2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 22 Oct 2012 15:51:44 -0400
Subject: Btrfs: do not bug when we fail to commit the transaction

We BUG if we fail to commit the transaction when creating a snapshot, which
is just obnoxious.  Remove the BUG_ON().  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ioctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index da518ded34bd..84bb4de1bb80 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -571,7 +571,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 		ret = btrfs_commit_transaction(trans,
 					       root->fs_info->extent_root);
 	}
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	ret = pending_snapshot->error;
 	if (ret)
-- 
cgit v1.2.3


From b40a79591ca918e7b91b0d9b6abd5d00f2e88c19 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 25 Oct 2012 22:28:12 +0200
Subject: freezer: exec should clear PF_NOFREEZE along with PF_KTHREAD

flush_old_exec() clears PF_KTHREAD but forgets about PF_NOFREEZE.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 fs/exec.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 8b9011b67041..0039055b1fc6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1083,7 +1083,8 @@ int flush_old_exec(struct linux_binprm * bprm)
 	bprm->mm = NULL;		/* We're using it now */
 
 	set_fs(USER_DS);
-	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);
+	current->flags &=
+		~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE);
 	flush_thread();
 	current->personality &= ~bprm->per_clear;
 
-- 
cgit v1.2.3


From 12176503366885edd542389eed3aaf94be163fdb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 25 Oct 2012 13:38:16 -0700
Subject: fs/compat_ioctl.c: VIDEO_SET_SPU_PALETTE missing error check

The compat ioctl for VIDEO_SET_SPU_PALETTE was missing an error check
while converting ioctl arguments.  This could lead to leaking kernel
stack contents into userspace.

Patch extracted from existing fix in grsecurity.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: David Miller <davem@davemloft.net>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: PaX Team <pageexec@freemail.hu>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f5054025f9da..4c6285fff598 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -210,6 +210,8 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
 
 	err  = get_user(palp, &up->palette);
 	err |= get_user(length, &up->length);
+	if (err)
+		return -EFAULT;
 
 	up_native = compat_alloc_user_space(sizeof(struct video_spu_palette));
 	err  = put_user(compat_ptr(palp), &up_native->palette);
-- 
cgit v1.2.3


From 98a1eebda3cb2a84ecf1f219bb3a95769033d1bf Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Wed, 10 Oct 2012 10:55:28 +0300
Subject: UBIFS: introduce categorized lprops counter

This commit is a preparation for a subsequent bugfix. We introduce a
counter for categorized lprops.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: stable@vger.kernel.org
---
 fs/ubifs/lprops.c | 6 ++++++
 fs/ubifs/ubifs.h  | 3 +++
 2 files changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index e5a2a35a46dc..46190a7c42a6 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
 	default:
 		ubifs_assert(0);
 	}
+
 	lprops->flags &= ~LPROPS_CAT_MASK;
 	lprops->flags |= cat;
+	c->in_a_category_cnt += 1;
+	ubifs_assert(c->in_a_category_cnt <= c->main_lebs);
 }
 
 /**
@@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c,
 	default:
 		ubifs_assert(0);
 	}
+
+	c->in_a_category_cnt -= 1;
+	ubifs_assert(c->in_a_category_cnt >= 0);
 }
 
 /**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 5486346d0a3f..d133c276fe05 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1183,6 +1183,8 @@ struct ubifs_debug_info;
  * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
  * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
  * @freeable_cnt: number of freeable LEBs in @freeable_list
+ * @in_a_category_cnt: count of lprops which are in a certain category, which
+ *                     basically meants that they were loaded from the flash
  *
  * @ltab_lnum: LEB number of LPT's own lprops table
  * @ltab_offs: offset of LPT's own lprops table
@@ -1412,6 +1414,7 @@ struct ubifs_info {
 	struct list_head freeable_list;
 	struct list_head frdi_idx_list;
 	int freeable_cnt;
+	int in_a_category_cnt;
 
 	int ltab_lnum;
 	int ltab_offs;
-- 
cgit v1.2.3


From a28ad42a4a0c6f302f488f26488b8b37c9b30024 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Tue, 9 Oct 2012 16:20:15 +0300
Subject: UBIFS: fix mounting problems after power cuts

This is a bugfix for a problem with the following symptoms:

1. A power cut happens
2. After reboot, we try to mount UBIFS
3. Mount fails with "No space left on device" error message

UBIFS complains like this:

UBIFS error (pid 28225): grab_empty_leb: could not find an empty LEB

The root cause of this problem is that when we mount, not all LEBs are
categorized. Only those which were read are. However, the
'ubifs_find_free_leb_for_idx()' function assumes that all LEBs were
categorized and 'c->freeable_cnt' is valid, which is a false assumption.

This patch fixes the problem by teaching 'ubifs_find_free_leb_for_idx()'
to always fall back to LPT scanning if no freeable LEBs were found.

This problem was reported by few people in the past, but Brent Taylor
was able to reproduce it and send me a flash image which cannot be mounted,
which made it easy to hunt the bug. Kudos to Brent.

Reported-by: Brent Taylor <motobud@gmail.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: stable@vger.kernel.org
---
 fs/ubifs/find.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 28ec13af28d9..2dcf3d473fec 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
 	if (!lprops) {
 		lprops = ubifs_fast_find_freeable(c);
 		if (!lprops) {
-			ubifs_assert(c->freeable_cnt == 0);
-			if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+			/*
+			 * The first condition means the following: go scan the
+			 * LPT if there are uncategorized lprops, which means
+			 * there may be freeable LEBs there (UBIFS does not
+			 * store the information about freeable LEBs in the
+			 * master node).
+			 */
+			if (c->in_a_category_cnt != c->main_lebs ||
+			    c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+				ubifs_assert(c->freeable_cnt == 0);
 				lprops = scan_for_leb_for_idx(c);
 				if (IS_ERR(lprops)) {
 					err = PTR_ERR(lprops);
-- 
cgit v1.2.3


From 561ec64ae67ef25cac8d72bb9c4bfc955edfd415 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 26 Oct 2012 10:05:07 -0700
Subject: VFS: don't do protected {sym,hard}links by default

In commit 800179c9b8a1 ("This adds symlink and hardlink restrictions to
the Linux VFS"), the new link protections were enabled by default, in
the hope that no actual application would care, despite it being
technically against legacy UNIX (and documented POSIX) behavior.

However, it does turn out to break some applications.  It's rare, and
it's unfortunate, but it's unacceptable to break existing systems, so
we'll have to default to legacy behavior.

In particular, it has broken the way AFD distributes files, see

  http://www.dwd.de/AFD/

along with some legacy scripts.

Distributions can end up setting this at initrd time or in system
scripts: if you have security problems due to link attacks during your
early boot sequence, you have bigger problems than some kernel sysctl
setting. Do:

	echo 1 > /proc/sys/fs/protected_symlinks
	echo 1 > /proc/sys/fs/protected_hardlinks

to re-enable the link protections.

Alternatively, we may at some point introduce a kernel config option
that sets these kinds of "more secure but not traditional" behavioural
options automatically.

Reported-by: Nick Bowler <nbowler@elliptictech.com>
Reported-by: Holger Kiehl <Holger.Kiehl@dwd.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org # v3.6
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d1895f308156..937f9d50c84b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -705,8 +705,8 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki
 	path_put(link);
 }
 
-int sysctl_protected_symlinks __read_mostly = 1;
-int sysctl_protected_hardlinks __read_mostly = 1;
+int sysctl_protected_symlinks __read_mostly = 0;
+int sysctl_protected_hardlinks __read_mostly = 0;
 
 /**
  * may_follow_link - Check symlink following for unsafe situations
-- 
cgit v1.2.3


From 1a25b1c4ce189e3926f2981f3302352a930086db Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 15 Oct 2012 17:20:17 -0400
Subject: Lock splice_read and splice_write functions

Functions generic_file_splice_read and generic_file_splice_write access
the pagecache directly. For block devices these functions must be locked
so that block size is not changed while they are in progress.

This patch is an additional fix for commit b87570f5d349 ("Fix a crash
when block device is read and block size is changed at the same time")
that locked aio_read, aio_write and mmap against block size change.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b3c1d3dae77d..1a1e5e3b1eaf 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1661,6 +1661,39 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 	return ret;
 }
 
+static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
+				  struct pipe_inode_info *pipe, size_t len,
+				  unsigned int flags)
+{
+	ssize_t ret;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+
+	percpu_down_read(&bdev->bd_block_size_semaphore);
+
+	ret = generic_file_splice_read(file, ppos, pipe, len, flags);
+
+	percpu_up_read(&bdev->bd_block_size_semaphore);
+
+	return ret;
+}
+
+static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
+				   struct file *file, loff_t *ppos, size_t len,
+				   unsigned int flags)
+{
+	ssize_t ret;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+
+	percpu_down_read(&bdev->bd_block_size_semaphore);
+
+	ret = generic_file_splice_write(pipe, file, ppos, len, flags);
+
+	percpu_up_read(&bdev->bd_block_size_semaphore);
+
+	return ret;
+}
+
+
 /*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
@@ -1699,8 +1732,8 @@ const struct file_operations def_blk_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_read	= blkdev_splice_read,
+	.splice_write	= blkdev_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
-- 
cgit v1.2.3


From ffb5387e85d528fb6d0d924abfa3fbf0fc484071 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 28 Oct 2012 22:24:57 -0400
Subject: ext4: fix unjournaled inode bitmap modification

commit 119c0d4460b001e44b41dcf73dc6ee794b98bd31 changed
ext4_new_inode() such that the inode bitmap was being modified
outside a transaction, which could lead to corruption, and was
discovered when journal_checksum found a bad checksum in the
journal during log replay.

Nix ran into this when using the journal_async_commit mount
option, which enables journal checksumming.  The ensuing
journal replay failures due to the bad checksums led to
filesystem corruption reported as the now infamous
"Apparent serious progressive ext4 data corruption bug"

[ Changed by tytso to only call ext4_journal_get_write_access() only
  when we're fairly certain that we're going to allocate the inode. ]

I've tested this by mounting with journal_checksum and
running fsstress then dropping power; I've also tested by
hacking DM to create snapshots w/o first quiescing, which
allows me to test journal replay repeatedly w/o actually
power-cycling the box.  Without the patch I hit a journal
checksum error every time.  With this fix it survives
many iterations.

Reported-by: Nix <nix@esperi.org.uk>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/ialloc.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4facdd29a350..3a100e7a62a8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -725,6 +725,10 @@ repeat_in_this_group:
 				   "inode=%lu", ino + 1);
 			continue;
 		}
+		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+		if (err)
+			goto fail;
 		ext4_lock_group(sb, group);
 		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
 		ext4_unlock_group(sb, group);
@@ -738,6 +742,11 @@ repeat_in_this_group:
 	goto out;
 
 got:
+	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (ext4_has_group_desc_csum(sb) &&
 	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -771,11 +780,6 @@ got:
 			goto fail;
 	}
 
-	BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
-	if (err)
-		goto fail;
-
 	BUFFER_TRACE(group_desc_bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, group_desc_bh);
 	if (err)
@@ -823,11 +827,6 @@ got:
 	}
 	ext4_unlock_group(sb, group);
 
-	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
-	if (err)
-		goto fail;
-
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
 	if (err)
-- 
cgit v1.2.3


From 52eb5a900a9863a8b77a895f770e5d825c8e02c6 Mon Sep 17 00:00:00 2001
From: David Zafman <david.zafman@inktank.com>
Date: Thu, 18 Oct 2012 14:01:43 -0700
Subject: ceph: fix dentry reference leak in encode_fh()

Call to d_find_alias() needs a corresponding dput()

This fixes http://tracker.newdream.net/issues/3271

Signed-off-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/export.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8e1b60e557b6..862887004d20 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -90,6 +90,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 		*max_len = handle_length;
 		type = 255;
 	}
+	if (dentry)
+		dput(dentry);
 	return type;
 }
 
-- 
cgit v1.2.3


From 08f05c49749ee655bef921d12160960a273aad47 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 31 Oct 2012 03:37:48 +0000
Subject: Return the right error value when dup[23]() newfd argument is too
 large

Jack Lin reports that the error return from dup3() for the RLIMIT_NOFILE
case changed incorrectly after 3.6.

The culprit is commit f33ff9927f42 ("take rlimit check to callers of
expand_files()") which when it moved the "return -EMFILE" out to the
caller, didn't notice that the dup3() had special code to turn the
EMFILE return into EBADF.

The replace_fd() helper that got added later then inherited the bug too.

Reported-by: Jack Lin <linliangjie@huawei.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
[ Noted more bugs, wrote proper changelog, fixed up typos - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index d3b5fa80b71b..708d997a7748 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -900,7 +900,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
 		return __close_fd(files, fd);
 
 	if (fd >= rlimit(RLIMIT_NOFILE))
-		return -EMFILE;
+		return -EBADF;
 
 	spin_lock(&files->file_lock);
 	err = expand_files(files, fd);
@@ -926,7 +926,7 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
 		return -EINVAL;
 
 	if (newfd >= rlimit(RLIMIT_NOFILE))
-		return -EMFILE;
+		return -EBADF;
 
 	spin_lock(&files->file_lock);
 	err = expand_files(files, newfd);
-- 
cgit v1.2.3


From 399f11c3d872bd748e1575574de265a6304c7c43 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Tue, 30 Oct 2012 16:06:35 -0400
Subject: NFS: Wait for session recovery to finish before returning

Currently, we will schedule session recovery and then return to the
caller of nfs4_handle_exception.  This works for most cases, but causes
a hang on the following test case:

	Client				Server
	------				------
	Open file over NFS v4.1
	Write to file
					Expire client
	Try to lock file

The server will return NFS4ERR_BADSESSION, prompting the client to
schedule recovery.  However, the client will continue placing lock
attempts and the open recovery never seems to be scheduled.  The
simplest solution is to wait for session recovery to run before retrying
the lock.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/nfs4proc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 68b21d81b7ac..d5fbf1f49d5f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -339,8 +339,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 			dprintk("%s ERROR: %d Reset session\n", __func__,
 				errorcode);
 			nfs4_schedule_session_recovery(clp->cl_session, errorcode);
-			exception->retry = 1;
-			break;
+			goto wait_on_recovery;
 #endif /* defined(CONFIG_NFS_V4_1) */
 		case -NFS4ERR_FILE_OPEN:
 			if (exception->timeout > HZ) {
-- 
cgit v1.2.3


From 2240a9e2d013d8269ea425b73e1d7a54c7bc141f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 29 Oct 2012 18:37:40 -0400
Subject: NFSv4.1: We must release the sequence id when we fail to get a
 session slot

If we do not release the sequence id in cases where we fail to get a
session slot, then we can deadlock if we hit a recovery scenario.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/nfs4proc.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d5fbf1f49d5f..e0423bb5a880 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1571,9 +1571,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->o_arg.server,
 				&data->o_arg.seq_args,
-				&data->o_res.seq_res, task))
-		return;
-	rpc_call_start(task);
+				&data->o_res.seq_res,
+				task) != 0)
+		nfs_release_seqid(data->o_arg.seqid);
+	else
+		rpc_call_start(task);
 	return;
 unlock_no_action:
 	rcu_read_unlock();
@@ -2295,9 +2297,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	if (nfs4_setup_sequence(NFS_SERVER(inode),
 				&calldata->arg.seq_args,
 				&calldata->res.seq_res,
-				task))
-		goto out;
-	rpc_call_start(task);
+				task) != 0)
+		nfs_release_seqid(calldata->arg.seqid);
+	else
+		rpc_call_start(task);
 out:
 	dprintk("%s: done!\n", __func__);
 }
@@ -4544,9 +4547,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence(calldata->server,
 				&calldata->arg.seq_args,
-				&calldata->res.seq_res, task))
-		return;
-	rpc_call_start(task);
+				&calldata->res.seq_res,
+				task) != 0)
+		nfs_release_seqid(calldata->arg.seqid);
+	else
+		rpc_call_start(task);
 }
 
 static const struct rpc_call_ops nfs4_locku_ops = {
@@ -4691,7 +4696,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	/* Do we need to do an open_to_lock_owner? */
 	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
 		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
-			return;
+			goto out_release_lock_seqid;
 		data->arg.open_stateid = &state->stateid;
 		data->arg.new_lock_owner = 1;
 		data->res.open_seqid = data->arg.open_seqid;
@@ -4700,10 +4705,15 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->server,
 				&data->arg.seq_args,
-				&data->res.seq_res, task))
+				&data->res.seq_res,
+				task) == 0) {
+		rpc_call_start(task);
 		return;
-	rpc_call_start(task);
-	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
+	}
+	nfs_release_seqid(data->arg.open_seqid);
+out_release_lock_seqid:
+	nfs_release_seqid(data->arg.lock_seqid);
+	dprintk("%s: done!, ret = %d\n", __func__, task->tk_status);
 }
 
 static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
-- 
cgit v1.2.3


From 2b1bc308f492589f7d49012ed24561534ea2be8c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 29 Oct 2012 18:53:23 -0400
Subject: NFSv4: nfs4_locku_done must release the sequence id

If the state recovery machinery is triggered by the call to
nfs4_async_handle_error() then we can deadlock.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/nfs4proc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e0423bb5a880..1465364501ba 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4531,6 +4531,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
 				rpc_restart_call_prepare(task);
 	}
+	nfs_release_seqid(calldata->arg.seqid);
 }
 
 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
-- 
cgit v1.2.3


From 8d96b10639fb402357b75b055b1e82a65ff95050 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 31 Oct 2012 12:16:01 +1100
Subject: NFS: fix bug in legacy DNS resolver.

The DNS resolver's use of the sunrpc cache involves a 'ttl' number
(relative) rather that a timeout (absolute).  This confused me when
I wrote
  commit c5b29f885afe890f953f7f23424045cdad31d3e4
     "sunrpc: use seconds since boot in expiry cache"

and I managed to break it.  The effect is that any TTL is interpreted
as 0, and nothing useful gets into the cache.

This patch removes the use of get_expiry() - which really expects an
expiry time - and uses get_uint() instead, treating the int correctly
as a ttl.

This fixes a regression that has been present since 2.6.37, causing
certain NFS accesses in certain environments to incorrectly fail.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Chuck Lever <chuck.lever@oracle.com>
Cc: stable@vger.kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dns_resolve.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 31c26c4dcc23..ca4b11ec87a2 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -217,7 +217,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
 {
 	char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
 	struct nfs_dns_ent key, *item;
-	unsigned long ttl;
+	unsigned int ttl;
 	ssize_t len;
 	int ret = -EINVAL;
 
@@ -240,7 +240,8 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
 	key.namelen = len;
 	memset(&key.h, 0, sizeof(key.h));
 
-	ttl = get_expiry(&buf);
+	if (get_uint(&buf, &ttl) < 0)
+		goto out;
 	if (ttl == 0)
 		goto out;
 	key.h.expiry_time = ttl + seconds_since_boot();
-- 
cgit v1.2.3


From 7175fe90153e6375082d65884fbb41ab3bbb4901 Mon Sep 17 00:00:00 2001
From: Yanchuan Nian <ycnian@gmail.com>
Date: Wed, 31 Oct 2012 16:05:48 +0800
Subject: nfs: Check whether a layout pointer is NULL before free it

The new layout pointer in pnfs_find_alloc_layout() may be NULL because of
out of memory. we must do some check work, otherwise pnfs_free_layout_hdr()
will go wrong because it can not deal with a NULL pointer.

Signed-off-by: Yanchuan Nian <ycnian@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index fe624c91bd00..2878f97bd78d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -925,8 +925,8 @@ pnfs_find_alloc_layout(struct inode *ino,
 	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
 		nfsi->layout = new;
 		return new;
-	}
-	pnfs_free_layout_hdr(new);
+	} else if (new != NULL)
+		pnfs_free_layout_hdr(new);
 out_existing:
 	pnfs_get_layout_hdr(nfsi->layout);
 	return nfsi->layout;
-- 
cgit v1.2.3


From acce94e68a0f346115fd41cdc298197d2d5a59ad Mon Sep 17 00:00:00 2001
From: Scott Mayhew <smayhew@redhat.com>
Date: Tue, 16 Oct 2012 13:22:19 -0400
Subject: nfsv3: Make v3 mounts fail with ETIMEDOUTs instead EIO on mountd
 timeouts

In very busy v3 environment, rpc.mountd can respond to the NULL
procedure but not the MNT procedure in a timely manner causing
the MNT procedure to time out. The problem is the mount system
call returns EIO which causes the mount to fail, instead of
ETIMEDOUT, which would cause the mount to be retried.

This patch sets the RPC_TASK_SOFT|RPC_TASK_TIMEOUT flags to
the rpc_call_sync() call in nfs_mount() which causes
ETIMEDOUT to be returned on timed out connections.

Signed-off-by: Steve Dickson <steved@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/mount_clnt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 8e65c7f1f87c..015f71f8f62c 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -181,7 +181,7 @@ int nfs_mount(struct nfs_mount_request *info)
 	else
 		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
 
-	status = rpc_call_sync(mnt_clnt, &msg, 0);
+	status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT);
 	rpc_shutdown_client(mnt_clnt);
 
 	if (status < 0)
-- 
cgit v1.2.3


From 97a54868262da1629a3e65121e65b8e8c4419d9f Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Sun, 21 Oct 2012 19:23:52 +0100
Subject: nfs: Show original device name verbatim in /proc/*/mount{s,info}

Since commit c7f404b ('vfs: new superblock methods to override
/proc/*/mount{s,info}'), nfs_path() is used to generate the mounted
device name reported back to userland.

nfs_path() always generates a trailing slash when the given dentry is
the root of an NFS mount, but userland may expect the original device
name to be returned verbatim (as it used to be).  Make this
canonicalisation optional and change the callers accordingly.

[jrnieder@gmail.com: use flag instead of bool argument]
Reported-and-tested-by: Chris Hiestand <chiestand@salk.edu>
Reference: http://bugs.debian.org/669314
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Cc: <stable@vger.kernel.org> # v2.6.39+
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h      |  5 +++--
 fs/nfs/namespace.c     | 19 ++++++++++++++-----
 fs/nfs/nfs4namespace.c |  3 ++-
 fs/nfs/super.c         |  2 +-
 4 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 59b133c5d652..a54fe51c1dfb 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -353,8 +353,9 @@ extern void nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
 
 /* namespace.c */
+#define NFS_PATH_CANONICAL 1
 extern char *nfs_path(char **p, struct dentry *dentry,
-		      char *buffer, ssize_t buflen);
+		      char *buffer, ssize_t buflen, unsigned flags);
 extern struct vfsmount *nfs_d_automount(struct path *path);
 struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *,
 			      struct nfs_fh *, struct nfs_fattr *);
@@ -498,7 +499,7 @@ static inline char *nfs_devname(struct dentry *dentry,
 				char *buffer, ssize_t buflen)
 {
 	char *dummy;
-	return nfs_path(&dummy, dentry, buffer, buflen);
+	return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL);
 }
 
 /*
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 655925373b91..dd057bc6b65b 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -33,6 +33,7 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
  * @dentry - pointer to dentry
  * @buffer - result buffer
  * @buflen - length of buffer
+ * @flags - options (see below)
  *
  * Helper function for constructing the server pathname
  * by arbitrary hashed dentry.
@@ -40,8 +41,14 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
  * This is mainly for use in figuring out the path on the
  * server side when automounting on top of an existing partition
  * and in generating /proc/mounts and friends.
+ *
+ * Supported flags:
+ * NFS_PATH_CANONICAL: ensure there is exactly one slash after
+ *		       the original device (export) name
+ *		       (if unset, the original name is returned verbatim)
  */
-char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
+char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen,
+	       unsigned flags)
 {
 	char *end;
 	int namelen;
@@ -74,7 +81,7 @@ rename_retry:
 		rcu_read_unlock();
 		goto rename_retry;
 	}
-	if (*end != '/') {
+	if ((flags & NFS_PATH_CANONICAL) && *end != '/') {
 		if (--buflen < 0) {
 			spin_unlock(&dentry->d_lock);
 			rcu_read_unlock();
@@ -91,9 +98,11 @@ rename_retry:
 		return end;
 	}
 	namelen = strlen(base);
-	/* Strip off excess slashes in base string */
-	while (namelen > 0 && base[namelen - 1] == '/')
-		namelen--;
+	if (flags & NFS_PATH_CANONICAL) {
+		/* Strip off excess slashes in base string */
+		while (namelen > 0 && base[namelen - 1] == '/')
+			namelen--;
+	}
 	buflen -= namelen;
 	if (buflen < 0) {
 		spin_unlock(&dentry->d_lock);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 79fbb61ce202..1e09eb78543b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -81,7 +81,8 @@ static char *nfs_path_component(const char *nfspath, const char *end)
 static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
 {
 	char *limit;
-	char *path = nfs_path(&limit, dentry, buffer, buflen);
+	char *path = nfs_path(&limit, dentry, buffer, buflen,
+			      NFS_PATH_CANONICAL);
 	if (!IS_ERR(path)) {
 		char *path_component = nfs_path_component(path, limit);
 		if (path_component)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e831bce49766..13c2a5be4765 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -771,7 +771,7 @@ int nfs_show_devname(struct seq_file *m, struct dentry *root)
 	int err = 0;
 	if (!page)
 		return -ENOMEM;
-	devname = nfs_path(&dummy, root, page, PAGE_SIZE);
+	devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0);
 	if (IS_ERR(devname))
 		err = PTR_ERR(devname);
 	else
-- 
cgit v1.2.3


From 324d003b0cd82151adbaecefef57b73f7959a469 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Tue, 30 Oct 2012 17:01:39 -0400
Subject: NFS: add nfs_sb_deactive_async to avoid deadlock

Use nfs_sb_deactive_async instead of nfs_sb_deactive when in a workqueue
context.  This avoids a deadlock where rpc_shutdown_client loops forever
in a workqueue kworker context, trying to kill all RPC tasks associated with
the client, while one or more of these tasks have already been assigned to the
same kworker (and will never run rpc_exit_task).

This approach is needed because RPC tasks that have already been assigned
to a kworker by queue_work cannot be canceled, as explained in the comment
for workqueue.c:insert_wq_barrier.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
[Trond: add module_get/put.]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c    |  5 ++++-
 fs/nfs/internal.h |  1 +
 fs/nfs/nfs4proc.c |  2 +-
 fs/nfs/super.c    | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/unlink.c   |  2 +-
 5 files changed, 56 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5c7325c5c5e6..6fa01aea2488 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -685,7 +685,10 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	dput(ctx->dentry);
-	nfs_sb_deactive(sb);
+	if (is_sync)
+		nfs_sb_deactive(sb);
+	else
+		nfs_sb_deactive_async(sb);
 	kfree(ctx->mdsthreshold);
 	kfree(ctx);
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index a54fe51c1dfb..05521cadac2e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -351,6 +351,7 @@ extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
 extern void nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
+extern void nfs_sb_deactive_async(struct super_block *sb);
 
 /* namespace.c */
 #define NFS_PATH_CANONICAL 1
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1465364501ba..8cfbac1a8d5e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2197,7 +2197,7 @@ static void nfs4_free_closedata(void *data)
 	nfs4_put_open_state(calldata->state);
 	nfs_free_seqid(calldata->arg.seqid);
 	nfs4_put_state_owner(sp);
-	nfs_sb_deactive(sb);
+	nfs_sb_deactive_async(sb);
 	kfree(calldata);
 }
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 13c2a5be4765..652d3f7176a9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -54,6 +54,7 @@
 #include <linux/parser.h>
 #include <linux/nsproxy.h>
 #include <linux/rcupdate.h>
+#include <linux/kthread.h>
 
 #include <asm/uaccess.h>
 
@@ -415,6 +416,54 @@ void nfs_sb_deactive(struct super_block *sb)
 }
 EXPORT_SYMBOL_GPL(nfs_sb_deactive);
 
+static int nfs_deactivate_super_async_work(void *ptr)
+{
+	struct super_block *sb = ptr;
+
+	deactivate_super(sb);
+	module_put_and_exit(0);
+	return 0;
+}
+
+/*
+ * same effect as deactivate_super, but will do final unmount in kthread
+ * context
+ */
+static void nfs_deactivate_super_async(struct super_block *sb)
+{
+	struct task_struct *task;
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_client *clp = server->nfs_client;
+
+	if (!atomic_add_unless(&sb->s_active, -1, 1)) {
+		rcu_read_lock();
+		snprintf(buf, sizeof(buf),
+			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+		rcu_read_unlock();
+
+		__module_get(THIS_MODULE);
+		task = kthread_run(nfs_deactivate_super_async_work, sb,
+				"%s-deactivate-super", buf);
+		if (IS_ERR(task)) {
+			pr_err("%s: kthread_run: %ld\n",
+				__func__, PTR_ERR(task));
+			/* make synchronous call and hope for the best */
+			deactivate_super(sb);
+			module_put(THIS_MODULE);
+		}
+	}
+}
+
+void nfs_sb_deactive_async(struct super_block *sb)
+{
+	struct nfs_server *server = NFS_SB(sb);
+
+	if (atomic_dec_and_test(&server->active))
+		nfs_deactivate_super_async(sb);
+}
+EXPORT_SYMBOL_GPL(nfs_sb_deactive_async);
+
 /*
  * Deliver file system statistics to userspace
  */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 13cea637eff8..3f79c77153b8 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata)
 
 	nfs_dec_sillycount(data->dir);
 	nfs_free_unlinkdata(data);
-	nfs_sb_deactive(sb);
+	nfs_sb_deactive_async(sb);
 }
 
 static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
-- 
cgit v1.2.3


From f9b1ef5f06d65a01952169b67d474f7f0dcb0206 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 29 Oct 2012 16:48:40 -0400
Subject: NFSv4: Initialise the NFSv4.1 slot table highest_used_slotid
 correctly

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8cfbac1a8d5e..091baab3eccf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5677,7 +5677,7 @@ static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
 		tbl->slots = new;
 		tbl->max_slots = max_slots;
 	}
-	tbl->highest_used_slotid = -1;	/* no slot is currently used */
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
 	for (i = 0; i < tbl->max_slots; i++)
 		tbl->slots[i].seq_nr = ivalue;
 	spin_unlock(&tbl->slot_tbl_lock);
-- 
cgit v1.2.3


From 998f40b550f257e436485291802fa938e4cf580f Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 2 Nov 2012 18:00:56 -0400
Subject: NFS4: nfs4_opendata_access should return errno

Return errno - not an NFS4ERR_. This worked because NFS4ERR_ACCESS == EACCES.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 091baab3eccf..5eec4429970c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1749,7 +1749,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
 
 	/* even though OPEN succeeded, access is denied. Close the file */
 	nfs4_close_state(state, fmode);
-	return -NFS4ERR_ACCESS;
+	return -EACCES;
 }
 
 /*
-- 
cgit v1.2.3


From 36960e440ccf94349c09fb944930d3bfe4bc473f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 3 Nov 2012 09:37:28 -0400
Subject: cifs: fix potential buffer overrun in cifs.idmap handling code

The userspace cifs.idmap program generally works with the wbclient libs
to generate binary SIDs in userspace. That program defines the struct
that holds these values as having a max of 15 subauthorities. The kernel
idmapping code however limits that value to 5.

When the kernel copies those values around though, it doesn't sanity
check the num_subauths value handed back from userspace or from the
server. It's possible therefore for userspace to hand us back a bogus
num_subauths value (or one that's valid, but greater than 5) that could
cause the kernel to walk off the end of the cifs_sid->sub_auths array.

Fix this by defining a new routine for copying sids and using that in
all of the places that copy it. If we end up with a sid that's longer
than expected then this approach will just lop off the "extra" subauths,
but that's basically what the code does today already. Better approaches
might be to fix this code to reject SIDs with >5 subauths, or fix it
to handle the subauths array dynamically.

At the same time, change the kernel to check the length of the data
returned by userspace. If it's shorter than struct cifs_sid, reject it
and return -EIO. If that happens we'll end up with fields that are
basically uninitialized.

Long term, it might make sense to redefine cifs_sid using a flexarray at
the end, to allow for variable-length subauth lists, and teach the code
to handle the case where the subauths array being passed in from
userspace is shorter than 5 elements.

Note too, that I don't consider this a security issue since you'd need
a compromised cifs.idmap program. If you have that, you can do all sorts
of nefarious stuff. Still, this is probably reasonable for stable.

Cc: stable@kernel.org
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/cifs/cifsacl.c | 49 ++++++++++++++++++++-----------------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fc783e264420..0fb15bbbe43c 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -224,6 +224,13 @@ sid_to_str(struct cifs_sid *sidptr, char *sidstr)
 	}
 }
 
+static void
+cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+	dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS);
+}
+
 static void
 id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr,
 		struct cifs_sid_id **psidid, char *typestr)
@@ -248,7 +255,7 @@ id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr,
 		}
 	}
 
-	memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid));
+	cifs_copy_sid(&(*psidid)->sid, sidptr);
 	(*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
 	(*psidid)->refcount = 0;
 
@@ -354,7 +361,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid)
 	 * any fields of the node after a reference is put .
 	 */
 	if (test_bit(SID_ID_MAPPED, &psidid->state)) {
-		memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
+		cifs_copy_sid(ssid, &psidid->sid);
 		psidid->time = jiffies; /* update ts for accessing */
 		goto id_sid_out;
 	}
@@ -370,14 +377,14 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid)
 		if (IS_ERR(sidkey)) {
 			rc = -EINVAL;
 			cFYI(1, "%s: Can't map and id to a SID", __func__);
+		} else if (sidkey->datalen < sizeof(struct cifs_sid)) {
+			rc = -EIO;
+			cFYI(1, "%s: Downcall contained malformed key "
+				"(datalen=%hu)", __func__, sidkey->datalen);
 		} else {
 			lsid = (struct cifs_sid *)sidkey->payload.data;
-			memcpy(&psidid->sid, lsid,
-				sidkey->datalen < sizeof(struct cifs_sid) ?
-				sidkey->datalen : sizeof(struct cifs_sid));
-			memcpy(ssid, &psidid->sid,
-				sidkey->datalen < sizeof(struct cifs_sid) ?
-				sidkey->datalen : sizeof(struct cifs_sid));
+			cifs_copy_sid(&psidid->sid, lsid);
+			cifs_copy_sid(ssid, &psidid->sid);
 			set_bit(SID_ID_MAPPED, &psidid->state);
 			key_put(sidkey);
 			kfree(psidid->sidstr);
@@ -396,7 +403,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid)
 			return rc;
 		}
 		if (test_bit(SID_ID_MAPPED, &psidid->state))
-			memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
+			cifs_copy_sid(ssid, &psidid->sid);
 		else
 			rc = -EINVAL;
 	}
@@ -675,8 +682,6 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
 static void copy_sec_desc(const struct cifs_ntsd *pntsd,
 				struct cifs_ntsd *pnntsd, __u32 sidsoffset)
 {
-	int i;
-
 	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
 	struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
 
@@ -692,26 +697,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
 	owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
 				le32_to_cpu(pntsd->osidoffset));
 	nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
-
-	nowner_sid_ptr->revision = owner_sid_ptr->revision;
-	nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
-	for (i = 0; i < 6; i++)
-		nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
-	for (i = 0; i < 5; i++)
-		nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
+	cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
 
 	/* copy group sid */
 	group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
 				le32_to_cpu(pntsd->gsidoffset));
 	ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
 					sizeof(struct cifs_sid));
-
-	ngroup_sid_ptr->revision = group_sid_ptr->revision;
-	ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
-	for (i = 0; i < 6; i++)
-		ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
-	for (i = 0; i < 5; i++)
-		ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
+	cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
 
 	return;
 }
@@ -1120,8 +1113,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 				kfree(nowner_sid_ptr);
 				return rc;
 			}
-			memcpy(owner_sid_ptr, nowner_sid_ptr,
-					sizeof(struct cifs_sid));
+			cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
 			kfree(nowner_sid_ptr);
 			*aclflag = CIFS_ACL_OWNER;
 		}
@@ -1139,8 +1131,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 				kfree(ngroup_sid_ptr);
 				return rc;
 			}
-			memcpy(group_sid_ptr, ngroup_sid_ptr,
-					sizeof(struct cifs_sid));
+			cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
 			kfree(ngroup_sid_ptr);
 			*aclflag = CIFS_ACL_GROUP;
 		}
-- 
cgit v1.2.3


From 3798f47aa276b332c30da499cb4df4577e2f8872 Mon Sep 17 00:00:00 2001
From: Sachin Prabhu <sprabhu@redhat.com>
Date: Mon, 5 Nov 2012 11:39:32 +0000
Subject: cifs: Do not lookup hashed negative dentry in cifs_atomic_open

We do not need to lookup a hashed negative directory since we have
already revalidated it before and have found it to be fine.

This also prevents a crash in cifs_lookup() when it attempts to rehash
the already hashed negative lookup dentry.

The patch has been tested using the reproducer at
https://bugzilla.redhat.com/show_bug.cgi?id=867344#c28

Cc: <stable@kernel.org> # 3.6.x
Reported-by: Vit Zahradka <vit.zahradka@tiscali.cz>
Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
---
 fs/cifs/dir.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 7c0a81283645..d3671f2acb29 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -398,7 +398,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 	 * in network traffic in the other paths.
 	 */
 	if (!(oflags & O_CREAT)) {
-		struct dentry *res = cifs_lookup(inode, direntry, 0);
+		struct dentry *res;
+
+		/*
+		 * Check for hashed negative dentry. We have already revalidated
+		 * the dentry and it is fine. No need to perform another lookup.
+		 */
+		if (!d_unhashed(direntry))
+			return -ENOENT;
+
+		res = cifs_lookup(inode, direntry, 0);
 		if (IS_ERR(res))
 			return PTR_ERR(res);
 
-- 
cgit v1.2.3


From aaaf68c5629108f6078ab458d34a661143ea6857 Mon Sep 17 00:00:00 2001
From: Andrew Price <anprice@redhat.com>
Date: Fri, 12 Oct 2012 16:45:08 +0100
Subject: GFS2: Fix an unchecked error from gfs2_rs_alloc

Check the return value of gfs2_rs_alloc(ip) and avoid a possible null
pointer dereference.

Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/quota.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 40c4b0d42fa8..c5af8e18f27a 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -497,8 +497,11 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 	struct gfs2_quota_data **qd;
 	int error;
 
-	if (ip->i_res == NULL)
-		gfs2_rs_alloc(ip);
+	if (ip->i_res == NULL) {
+		error = gfs2_rs_alloc(ip);
+		if (error)
+			return error;
+	}
 
 	qd = ip->i_res->rs_qa_qd;
 
-- 
cgit v1.2.3


From cd0ed19fb614cb1315c0a510ec6c163d8324fd82 Mon Sep 17 00:00:00 2001
From: Andrew Price <anprice@redhat.com>
Date: Fri, 12 Oct 2012 16:45:09 +0100
Subject: GFS2: Fix possible null pointer deref in gfs2_rs_alloc

Despite the return value from kmem_cache_zalloc() being checked, the
error wasn't being returned until after a possible null pointer
dereference. This patch returns the error immediately, allowing the
removal of the error variable.

Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3cc402ce6fea..43d1a20bdbe4 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -553,7 +553,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
  */
 int gfs2_rs_alloc(struct gfs2_inode *ip)
 {
-	int error = 0;
 	struct gfs2_blkreserv *res;
 
 	if (ip->i_res)
@@ -561,7 +560,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
 
 	res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
 	if (!res)
-		error = -ENOMEM;
+		return -ENOMEM;
 
 	RB_CLEAR_NODE(&res->rs_node);
 
@@ -571,7 +570,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
 	else
 		ip->i_res = res;
 	up_write(&ip->i_rw_mutex);
-	return error;
+	return 0;
 }
 
 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
-- 
cgit v1.2.3


From 73738a77f42c2d7f53fd61f73272c9dd6f520897 Mon Sep 17 00:00:00 2001
From: Andrew Price <anprice@redhat.com>
Date: Fri, 12 Oct 2012 16:45:10 +0100
Subject: GFS2: Clean up some unused assignments

Cleans up two cases where variables were assigned values but then never
used again.

Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c | 2 --
 fs/gfs2/lops.c | 2 --
 2 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 0def0504afc1..377a68dbd066 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -677,10 +677,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	size_t writesize = iov_length(iov, nr_segs);
 	struct dentry *dentry = file->f_dentry;
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	struct gfs2_sbd *sdp;
 	int ret;
 
-	sdp = GFS2_SB(file->f_mapping->host);
 	ret = gfs2_rs_alloc(ip);
 	if (ret)
 		return ret;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8ff95a2d54ee..01e444b5b2bd 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -621,7 +621,6 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 
 static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 {
-	struct gfs2_log_descriptor *ld;
 	struct gfs2_meta_header *mh;
 	unsigned int offset;
 	struct list_head *head = &sdp->sd_log_le_revoke;
@@ -634,7 +633,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 
 	length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64));
 	page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
-	ld = page_address(page);
 	offset = sizeof(struct gfs2_log_descriptor);
 
 	list_for_each_entry(bd, head, bd_list) {
-- 
cgit v1.2.3


From 3a238adefb8c5b8cb8cde0ce689d513306176ff4 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 16 Oct 2012 11:39:07 +0200
Subject: GFS2: Require user to provide argument for FITRIM

When the fstrim_range argument is not provided by user in FITRIM ioctl
we should just return EFAULT and not promoting bad behaviour by filling
the structure in kernel. Let the user deal with it.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 43d1a20bdbe4..b6bbf718d6c3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1270,11 +1270,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
 
-	if (argp == NULL) {
-		r.start = 0;
-		r.len = ULLONG_MAX;
-		r.minlen = 0;
-	} else if (copy_from_user(&r, argp, sizeof(r)))
+	if (copy_from_user(&r, argp, sizeof(r)))
 		return -EFAULT;
 
 	ret = gfs2_rindex_update(sdp);
@@ -1323,7 +1319,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 
 out:
 	r.len = trimmed << 9;
-	if (argp && copy_to_user(argp, &r, sizeof(r)))
+	if (copy_to_user(argp, &r, sizeof(r)))
 		return -EFAULT;
 
 	return ret;
-- 
cgit v1.2.3


From 076f0faa764ab3a5a32fc726ae05e2de0e66151d Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 16 Oct 2012 11:39:08 +0200
Subject: GFS2: Fix FITRIM argument handling

Currently implementation in gfs2 uses FITRIM arguments as it were in
file system blocks units which is wrong. The FITRIM arguments
(fstrim_range.start, fstrim_range.len and fstrim_range.minlen) are
actually in bytes.

Moreover, check for start argument beyond the end of file system, len
argument being smaller than file system block and minlen argument being
bigger than biggest resource group were missing.

This commit converts the code to convert FITRIM argument to file system
blocks and also adds appropriate checks mentioned above.

All the problems were recognised by xfstests 251 and 260.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index b6bbf718d6c3..38fe18f2f055 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1262,7 +1262,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	int ret = 0;
 	u64 amt;
 	u64 trimmed = 0;
+	u64 start, end, minlen;
 	unsigned int x;
+	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1277,8 +1279,18 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	if (ret)
 		return ret;
 
-	rgd = gfs2_blk2rgrpd(sdp, r.start, 0);
-	rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0);
+	start = r.start >> bs_shift;
+	end = start + (r.len >> bs_shift);
+	minlen = max_t(u64, r.minlen,
+		       q->limits.discard_granularity) >> bs_shift;
+
+	rgd = gfs2_blk2rgrpd(sdp, start, 0);
+	rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+
+	if (end <= start ||
+	    minlen > sdp->sd_max_rg_data ||
+	    start > rgd_end->rd_data0 + rgd_end->rd_data)
+		return -EINVAL;
 
 	while (1) {
 
@@ -1290,7 +1302,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 			/* Trim each bitmap in the rgrp */
 			for (x = 0; x < rgd->rd_length; x++) {
 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
-				ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt);
+				ret = gfs2_rgrp_send_discards(sdp,
+						rgd->rd_data0, NULL, bi, minlen,
+						&amt);
 				if (ret) {
 					gfs2_glock_dq_uninit(&gh);
 					goto out;
-- 
cgit v1.2.3


From 3d1626889a64bd5a661544d582036a0a02104a60 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 6 Nov 2012 00:49:28 -0600
Subject: GFS2: Don't call file_accessed() with a shared glock

file_accessed() was being called by gfs2_mmap() with a shared glock. If it
needed to update the atime, it was crashing because it dirtied the inode in
gfs2_dirty_inode() without holding an exclusive lock. gfs2_dirty_inode()
checked if the caller was already holding a glock, but it didn't make sure that
the glock was in the exclusive state. Now, instead of calling file_accessed()
while holding the shared lock in gfs2_mmap(), file_accessed() is called after
grabbing and releasing the glock to update the inode.  If file_accessed() needs
to update the atime, it will grab an exclusive lock in gfs2_dirty_inode().

gfs2_dirty_inode() now also checks to make sure that if the calling process has
already locked the glock, it has an exclusive lock.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c  | 12 +++++-------
 fs/gfs2/super.c |  3 ++-
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 377a68dbd066..e056b4ce4877 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -516,15 +516,13 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 		struct gfs2_holder i_gh;
 		int error;
 
-		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-		error = gfs2_glock_nq(&i_gh);
-		if (error == 0) {
-			file_accessed(file);
-			gfs2_glock_dq(&i_gh);
-		}
-		gfs2_holder_uninit(&i_gh);
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
 		if (error)
 			return error;
+		/* grab lock to update inode */
+		gfs2_glock_dq_uninit(&i_gh);
+		file_accessed(file);
 	}
 	vma->vm_ops = &gfs2_vm_ops;
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bc737261f234..d6488674d916 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -810,7 +810,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
 			return;
 		}
 		need_unlock = 1;
-	}
+	} else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
+		return;
 
 	if (current->journal_info == NULL) {
 		ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
-- 
cgit v1.2.3


From 96e5d1d3adf56f1c7eeb07258f6a1a0a7ae9c489 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Wed, 7 Nov 2012 00:38:06 -0600
Subject: GFS2: Test bufdata with buffer locked and gfs2_log_lock held

In gfs2_trans_add_bh(), gfs2 was testing if a there was a bd attached to the
buffer without having the gfs2_log_lock held. It was then assuming it would
stay attached for the rest of the function. However, without either the log
lock being held of the buffer locked, __gfs2_ail_flush() could detach bd at any
time.  This patch moves the locking before the test.  If there isn't a bd
already attached, gfs2 can safely allocate one and attach it before locking.
There is no way that the newly allocated bd could be on the ail list,
and thus no way for __gfs2_ail_flush() to detach it.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c  | 14 ++------------
 fs/gfs2/trans.c |  8 ++++++++
 2 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 01e444b5b2bd..9ceccb1595a3 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -393,12 +393,10 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	struct gfs2_meta_header *mh;
 	struct gfs2_trans *tr;
 
-	lock_buffer(bd->bd_bh);
-	gfs2_log_lock(sdp);
 	tr = current->journal_info;
 	tr->tr_touched = 1;
 	if (!list_empty(&bd->bd_list))
-		goto out;
+		return;
 	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
 	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
@@ -414,9 +412,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	sdp->sd_log_num_buf++;
 	list_add(&bd->bd_list, &sdp->sd_log_le_buf);
 	tr->tr_num_buf_new++;
-out:
-	gfs2_log_unlock(sdp);
-	unlock_buffer(bd->bd_bh);
 }
 
 static void gfs2_check_magic(struct buffer_head *bh)
@@ -775,12 +770,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	struct address_space *mapping = bd->bd_bh->b_page->mapping;
 	struct gfs2_inode *ip = GFS2_I(mapping->host);
 
-	lock_buffer(bd->bd_bh);
-	gfs2_log_lock(sdp);
 	if (tr)
 		tr->tr_touched = 1;
 	if (!list_empty(&bd->bd_list))
-		goto out;
+		return;
 	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
 	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	if (gfs2_is_jdata(ip)) {
@@ -791,9 +784,6 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	} else {
 		list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
 	}
-out:
-	gfs2_log_unlock(sdp);
-	unlock_buffer(bd->bd_bh);
 }
 
 /**
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index adbd27875ef9..413627072f36 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -155,14 +155,22 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_bufdata *bd;
 
+	lock_buffer(bh);
+	gfs2_log_lock(sdp);
 	bd = bh->b_private;
 	if (bd)
 		gfs2_assert(sdp, bd->bd_gl == gl);
 	else {
+		gfs2_log_unlock(sdp);
+		unlock_buffer(bh);
 		gfs2_attach_bufdata(gl, bh, meta);
 		bd = bh->b_private;
+		lock_buffer(bh);
+		gfs2_log_lock(sdp);
 	}
 	lops_add(sdp, bd);
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
 }
 
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-- 
cgit v1.2.3


From 7e9620f21d8c9e389fd6845487e07d5df898a2e4 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:12 +1100
Subject: xfs: only update the last_sync_lsn when a transaction completes

The log write code stamps each iclog with the current tail LSN in
the iclog header so that recovery knows where to find the tail of
thelog once it has found the head. Normally this is taken from the
first item on the AIL - the log item that corresponds to the oldest
active item in the log.

The problem is that when the AIL is empty, the tail lsn is dervied
from the the l_last_sync_lsn, which is the LSN of the last iclog to
be written to the log. In most cases this doesn't happen, because
the AIL is rarely empty on an active filesystem. However, when it
does, it opens up an interesting case when the transaction being
committed to the iclog spans multiple iclogs.

That is, the first iclog is stamped with the l_last_sync_lsn, and IO
is issued. Then the next iclog is setup, the changes copied into the
iclog (takes some time), and then the l_last_sync_lsn is stamped
into the header and IO is issued. This is still the same
transaction, so the tail lsn of both iclogs must be the same for log
recovery to find the entire transaction to be able to replay it.

The problem arises in that the iclog buffer IO completion updates
the l_last_sync_lsn with it's own LSN. Therefore, If the first iclog
completes it's IO before the second iclog is filled and has the tail
lsn stamped in it, it will stamp the LSN of the first iclog into
it's tail lsn field. If the system fails at this point, log recovery
will not see a complete transaction, so the transaction will no be
replayed.

The fix is simple - the l_last_sync_lsn is updated when a iclog
buffer IO completes, and this is incorrect. The l_last_sync_lsn
shoul dbe updated when a transaction is completed by a iclog buffer
IO. That is, only iclog buffers that have transaction commit
callbacks attached to them should update the l_last_sync_lsn. This
means that the last_sync_lsn will only move forward when a commit
record it written, not in the middle of a large transaction that is
rolling through multiple iclog buffers.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7f4f9370d0e7..4dad756962d0 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2387,14 +2387,27 @@ xlog_state_do_callback(
 
 
 				/*
-				 * update the last_sync_lsn before we drop the
+				 * Completion of a iclog IO does not imply that
+				 * a transaction has completed, as transactions
+				 * can be large enough to span many iclogs. We
+				 * cannot change the tail of the log half way
+				 * through a transaction as this may be the only
+				 * transaction in the log and moving th etail to
+				 * point to the middle of it will prevent
+				 * recovery from finding the start of the
+				 * transaction. Hence we should only update the
+				 * last_sync_lsn if this iclog contains
+				 * transaction completion callbacks on it.
+				 *
+				 * We have to do this before we drop the
 				 * icloglock to ensure we are the only one that
 				 * can update it.
 				 */
 				ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
 					be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
-				atomic64_set(&log->l_last_sync_lsn,
-					be64_to_cpu(iclog->ic_header.h_lsn));
+				if (iclog->ic_callback)
+					atomic64_set(&log->l_last_sync_lsn,
+						be64_to_cpu(iclog->ic_header.h_lsn));
 
 			} else
 				ioerrors++;
-- 
cgit v1.2.3


From 408cc4e97a3ccd172d2d676e4b585badf439271b Mon Sep 17 00:00:00 2001
From: Mark Tinguely <tinguely@sgi.com>
Date: Thu, 20 Sep 2012 13:16:45 -0500
Subject: xfs: zero allocation_args on the kernel stack

Zero the kernel stack space that makes up the xfs_alloc_arg structures.

Signed-off-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c  | 1 +
 fs/xfs/xfs_bmap.c   | 3 +++
 fs/xfs/xfs_ialloc.c | 1 +
 3 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4f33c32affe3..0287f3b1b503 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1866,6 +1866,7 @@ xfs_alloc_fix_freelist(
 	/*
 	 * Initialize the args structure.
 	 */
+	memset(&targs, 0, sizeof(targs));
 	targs.tp = tp;
 	targs.mp = mp;
 	targs.agbp = agbp;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 848ffa77707b..e1545ec2f7d2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2437,6 +2437,7 @@ xfs_bmap_btalloc(
 	 * Normal allocation, done through xfs_alloc_vextent.
 	 */
 	tryagain = isaligned = 0;
+	memset(&args, 0, sizeof(args));
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->blkno;
@@ -3082,6 +3083,7 @@ xfs_bmap_extents_to_btree(
 	 * Convert to a btree with two levels, one record in root.
 	 */
 	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	memset(&args, 0, sizeof(args));
 	args.tp = tp;
 	args.mp = mp;
 	args.firstblock = *firstblock;
@@ -3237,6 +3239,7 @@ xfs_bmap_local_to_extents(
 		xfs_buf_t	*bp;	/* buffer for extent block */
 		xfs_bmbt_rec_host_t *ep;/* extent record pointer */
 
+		memset(&args, 0, sizeof(args));
 		args.tp = tp;
 		args.mp = ip->i_mount;
 		args.firstblock = *firstblock;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 445bf1aef31c..c5c4ef4f2bdb 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -250,6 +250,7 @@ xfs_ialloc_ag_alloc(
 					/* boundary */
 	struct xfs_perag *pag;
 
+	memset(&args, 0, sizeof(args));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
 
-- 
cgit v1.2.3


From 326c03555b914ff153ba5b40df87fd6e28e7e367 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 5 Oct 2012 11:06:58 +1000
Subject: xfs: introduce XFS_BMAPI_STACK_SWITCH

Certain allocation paths through xfs_bmapi_write() are in situations
where we have limited stack available. These are almost always in
the buffered IO writeback path when convertion delayed allocation
extents to real extents.

The current stack switch occurs for userdata allocations, which
means we also do stack switches for preallocation, direct IO and
unwritten extent conversion, even those these call chains have never
been implicated in a stack overrun.

Hence, let's target just the single stack overun offended for stack
switches. To do that, introduce a XFS_BMAPI_STACK_SWITCH flag that
the caller can pass xfs_bmapi_write() to indicate it should switch
stacks if it needs to do allocation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c | 2 +-
 fs/xfs/xfs_alloc.h | 1 +
 fs/xfs/xfs_bmap.c  | 4 ++++
 fs/xfs/xfs_bmap.h  | 5 ++++-
 fs/xfs/xfs_iomap.c | 4 +++-
 5 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 0287f3b1b503..43f791bcd8b1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2447,7 +2447,7 @@ xfs_alloc_vextent(
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 
-	if (!args->userdata)
+	if (!args->stack_switch)
 		return __xfs_alloc_vextent(args);
 
 
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 93be4a667ca1..ef7d4885dc2d 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -123,6 +123,7 @@ typedef struct xfs_alloc_arg {
 	struct completion *done;
 	struct work_struct work;
 	int		result;
+	char		stack_switch;
 } xfs_alloc_arg_t;
 
 /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index e1545ec2f7d2..91259554df8b 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2441,6 +2441,7 @@ xfs_bmap_btalloc(
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->blkno;
+	args.stack_switch = ap->stack_switch;
 
 	/* Trim the allocation back to the maximum an AG can fit. */
 	args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
@@ -4675,6 +4676,9 @@ xfs_bmapi_allocate(
 			return error;
 	}
 
+	if (flags & XFS_BMAPI_STACK_SWITCH)
+		bma->stack_switch = 1;
+
 	error = xfs_bmap_alloc(bma);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 803b56d7ce16..b68c598034c1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,6 +77,7 @@ typedef	struct xfs_bmap_free
  * from written to unwritten, otherwise convert from unwritten to written.
  */
 #define XFS_BMAPI_CONVERT	0x040
+#define XFS_BMAPI_STACK_SWITCH	0x080
 
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
@@ -85,7 +86,8 @@ typedef	struct xfs_bmap_free
 	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
 	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
 	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
-	{ XFS_BMAPI_CONVERT,	"CONVERT" }
+	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
+	{ XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
 
 
 static inline int xfs_bmapi_aflag(int w)
@@ -133,6 +135,7 @@ typedef struct xfs_bmalloca {
 	char			userdata;/* set if is user data */
 	char			aeof;	/* allocated space at eof */
 	char			conv;	/* overwriting unwritten extents */
+	char			stack_switch;
 } xfs_bmalloca_t;
 
 /*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6ad935..7f537663365b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -584,7 +584,9 @@ xfs_iomap_write_allocate(
 			 * pointer that the caller gave to us.
 			 */
 			error = xfs_bmapi_write(tp, ip, map_start_fsb,
-						count_fsb, 0, &first_block, 1,
+						count_fsb,
+						XFS_BMAPI_STACK_SWITCH,
+						&first_block, 1,
 						imap, &nimaps, &free_list);
 			if (error)
 				goto trans_cancel;
-- 
cgit v1.2.3


From 1f3c785c3adb7d2b109ec7c8f10081d1294b03d3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 5 Oct 2012 11:06:59 +1000
Subject: xfs: move allocation stack switch up to xfs_bmapi_allocate

Switching stacks are xfs_alloc_vextent can cause deadlocks when we
run out of worker threads on the allocation workqueue. This can
occur because xfs_bmap_btalloc can make multiple calls to
xfs_alloc_vextent() and even if xfs_alloc_vextent() fails it can
return with the AGF locked in the current allocation transaction.

If we then need to make another allocation, and all the allocation
worker contexts are exhausted because the are blocked waiting for
the AGF lock, holder of the AGF cannot get it's xfs-alloc_vextent
work completed to release the AGF.  Hence allocation effectively
deadlocks.

To avoid this, move the stack switch one layer up to
xfs_bmapi_allocate() so that all of the allocation attempts in a
single switched stack transaction occur in a single worker context.
This avoids the problem of an allocation being blocked waiting for
a worker thread whilst holding the AGF.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c | 42 +-------------------------------------
 fs/xfs/xfs_alloc.h |  4 ----
 fs/xfs/xfs_bmap.c  | 60 ++++++++++++++++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_bmap.h  |  4 ++++
 4 files changed, 54 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 43f791bcd8b1..335206a9c698 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2208,7 +2208,7 @@ xfs_alloc_read_agf(
  * group or loop over the allocation groups to find the result.
  */
 int				/* error */
-__xfs_alloc_vextent(
+xfs_alloc_vextent(
 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
 {
 	xfs_agblock_t	agsize;	/* allocation group size */
@@ -2418,46 +2418,6 @@ error0:
 	return error;
 }
 
-static void
-xfs_alloc_vextent_worker(
-	struct work_struct	*work)
-{
-	struct xfs_alloc_arg	*args = container_of(work,
-						struct xfs_alloc_arg, work);
-	unsigned long		pflags;
-
-	/* we are in a transaction context here */
-	current_set_flags_nested(&pflags, PF_FSTRANS);
-
-	args->result = __xfs_alloc_vextent(args);
-	complete(args->done);
-
-	current_restore_flags_nested(&pflags, PF_FSTRANS);
-}
-
-/*
- * Data allocation requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. Metadata
- * requests, OTOH, are generally from low stack usage paths, so avoid the
- * context switch overhead here.
- */
-int
-xfs_alloc_vextent(
-	struct xfs_alloc_arg	*args)
-{
-	DECLARE_COMPLETION_ONSTACK(done);
-
-	if (!args->stack_switch)
-		return __xfs_alloc_vextent(args);
-
-
-	args->done = &done;
-	INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker);
-	queue_work(xfs_alloc_wq, &args->work);
-	wait_for_completion(&done);
-	return args->result;
-}
-
 /*
  * Free an extent.
  * Just break up the extent address and hand off to xfs_free_ag_extent
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index ef7d4885dc2d..feacb061bab7 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -120,10 +120,6 @@ typedef struct xfs_alloc_arg {
 	char		isfl;		/* set if is freelist blocks - !acctg */
 	char		userdata;	/* set if this is user data */
 	xfs_fsblock_t	firstblock;	/* io first block allocated */
-	struct completion *done;
-	struct work_struct work;
-	int		result;
-	char		stack_switch;
 } xfs_alloc_arg_t;
 
 /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 91259554df8b..83d0cf3df930 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2441,7 +2441,6 @@ xfs_bmap_btalloc(
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->blkno;
-	args.stack_switch = ap->stack_switch;
 
 	/* Trim the allocation back to the maximum an AG can fit. */
 	args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
@@ -4620,12 +4619,11 @@ xfs_bmapi_delay(
 
 
 STATIC int
-xfs_bmapi_allocate(
-	struct xfs_bmalloca	*bma,
-	int			flags)
+__xfs_bmapi_allocate(
+	struct xfs_bmalloca	*bma)
 {
 	struct xfs_mount	*mp = bma->ip->i_mount;
-	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+	int			whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
 						XFS_ATTR_FORK : XFS_DATA_FORK;
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 	int			tmp_logflags = 0;
@@ -4658,25 +4656,25 @@ xfs_bmapi_allocate(
 	 * Indicate if this is the first user data in the file, or just any
 	 * user data.
 	 */
-	if (!(flags & XFS_BMAPI_METADATA)) {
+	if (!(bma->flags & XFS_BMAPI_METADATA)) {
 		bma->userdata = (bma->offset == 0) ?
 			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
 	}
 
-	bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
 
 	/*
 	 * Only want to do the alignment at the eof if it is userdata and
 	 * allocation length is larger than a stripe unit.
 	 */
 	if (mp->m_dalign && bma->length >= mp->m_dalign &&
-	    !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+	    !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
 		error = xfs_bmap_isaeof(bma, whichfork);
 		if (error)
 			return error;
 	}
 
-	if (flags & XFS_BMAPI_STACK_SWITCH)
+	if (bma->flags & XFS_BMAPI_STACK_SWITCH)
 		bma->stack_switch = 1;
 
 	error = xfs_bmap_alloc(bma);
@@ -4713,7 +4711,7 @@ xfs_bmapi_allocate(
 	 * A wasdelay extent has been initialized, so shouldn't be flagged
 	 * as unwritten.
 	 */
-	if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) &&
+	if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
 	    xfs_sb_version_hasextflgbit(&mp->m_sb))
 		bma->got.br_state = XFS_EXT_UNWRITTEN;
 
@@ -4741,6 +4739,45 @@ xfs_bmapi_allocate(
 	return 0;
 }
 
+static void
+xfs_bmapi_allocate_worker(
+	struct work_struct	*work)
+{
+	struct xfs_bmalloca	*args = container_of(work,
+						struct xfs_bmalloca, work);
+	unsigned long		pflags;
+
+	/* we are in a transaction context here */
+	current_set_flags_nested(&pflags, PF_FSTRANS);
+
+	args->result = __xfs_bmapi_allocate(args);
+	complete(args->done);
+
+	current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+/*
+ * Some allocation requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. Otherwise just
+ * call directly to avoid the context switch overhead here.
+ */
+int
+xfs_bmapi_allocate(
+	struct xfs_bmalloca	*args)
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	if (!args->stack_switch)
+		return __xfs_bmapi_allocate(args);
+
+
+	args->done = &done;
+	INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
+	queue_work(xfs_alloc_wq, &args->work);
+	wait_for_completion(&done);
+	return args->result;
+}
+
 STATIC int
 xfs_bmapi_convert_unwritten(
 	struct xfs_bmalloca	*bma,
@@ -4926,6 +4963,7 @@ xfs_bmapi_write(
 			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
 			bma.wasdel = wasdelay;
 			bma.offset = bno;
+			bma.flags = flags;
 
 			/*
 			 * There's a 32/64 bit type mismatch between the
@@ -4941,7 +4979,7 @@ xfs_bmapi_write(
 
 			ASSERT(len > 0);
 			ASSERT(bma.length > 0);
-			error = xfs_bmapi_allocate(&bma, flags);
+			error = xfs_bmapi_allocate(&bma);
 			if (error)
 				goto error0;
 			if (bma.blkno == NULLFSBLOCK)
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b68c598034c1..5f469c3516eb 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -136,6 +136,10 @@ typedef struct xfs_bmalloca {
 	char			aeof;	/* allocated space at eof */
 	char			conv;	/* overwriting unwritten extents */
 	char			stack_switch;
+	int			flags;
+	struct completion	*done;
+	struct work_struct	work;
+	int			result;
 } xfs_bmalloca_t;
 
 /*
-- 
cgit v1.2.3


From eaef854335ce09956e930fe4a193327417edc6c9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 9 Oct 2012 14:50:52 +1100
Subject: xfs: growfs: don't read garbage for new secondary superblocks

When updating new secondary superblocks in a growfs operation, the
superblock buffer is read from the newly grown region of the
underlying device. This is not guaranteed to be zero, so violates
the underlying assumption that the unused parts of superblocks are
zero filled. Get a new buffer for these secondary superblocks to
ensure that the unused regions are zero filled correctly.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fsops.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c25b094efbf7..4beaede43277 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -399,9 +399,26 @@ xfs_growfs_data_private(
 
 	/* update secondary superblocks. */
 	for (agno = 1; agno < nagcount; agno++) {
-		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+		error = 0;
+		/*
+		 * new secondary superblocks need to be zeroed, not read from
+		 * disk as the contents of the new area we are growing into is
+		 * completely unknown.
+		 */
+		if (agno < oagcount) {
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
 				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
+		} else {
+			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0);
+			if (bp)
+				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+			else
+				error = ENOMEM;
+		}
+
 		if (error) {
 			xfs_warn(mp,
 		"error %d reading secondary superblock for ag %d",
@@ -423,7 +440,7 @@ xfs_growfs_data_private(
 			break; /* no point in continuing */
 		}
 	}
-	return 0;
+	return error;
 
  error0:
 	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-- 
cgit v1.2.3


From 1e7acbb7bc1ae7c1c62fd1310b3176a820225056 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 25 Oct 2012 17:22:30 +1100
Subject: xfs: silence uninitialised f.file warning.

Uninitialised variable build warning introduced by 2903ff0 ("switch
simple cases of fget_light to fdget"), gcc is not smart enough to
work out that the variable is not used uninitialised, and the commit
removed the initialisation at declaration that the old variable had.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8305f2ac6773..c1df3c623de2 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -70,7 +70,7 @@ xfs_find_handle(
 	int			hsize;
 	xfs_handle_t		handle;
 	struct inode		*inode;
-	struct fd		f;
+	struct fd		f = {0};
 	struct path		path;
 	int			error;
 	struct xfs_inode	*ip;
-- 
cgit v1.2.3


From ca250b1b3d711936d7dae9e97871f2261347f82d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 2 Nov 2012 11:38:41 +1100
Subject: xfs: invalidate allocbt blocks moved to the free list

When we free a block from the alloc btree tree, we move it to the
freelist held in the AGFL and mark it busy in the busy extent tree.
This typically happens when we merge btree blocks.

Once the transaction is committed and checkpointed, the block can
remain on the free list for an indefinite amount of time.  Now, this
isn't the end of the world at this point - if the free list is
shortened, the buffer is invalidated in the transaction that moves
it back to free space. If the buffer is allocated as metadata from
the free list, then all the modifications getted logged, and we have
no issues, either. And if it gets allocated as userdata direct from
the freelist, it gets invalidated and so will never get written.

However, during the time it sits on the free list, pressure on the
log can cause the AIL to be pushed and the buffer that covers the
block gets pushed for write. IOWs, we end up writing a freed
metadata block to disk. Again, this isn't the end of the world
because we know from the above we are only writing to free space.

The problem, however, is for validation callbacks. If the block was
on old btree root block, then the level of the block is going to be
higher than the current tree root, and so will fail validation.
There may be other inconsistencies in the block as well, and
currently we don't care because the block is in free space. Shutting
down the filesystem because a freed block doesn't pass write
validation, OTOH, is rather unfriendly.

So, make sure we always invalidate buffers as they move from the
free space trees to the free list so that we guarantee they never
get written to disk while on the free list.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Phil White <pwhite@sgi.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc_btree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f1647caace8f..f7876c6d6165 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -121,6 +121,8 @@ xfs_allocbt_free_block(
 	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
 			      XFS_EXTENT_BUSY_SKIP_DISCARD);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+	xfs_trans_binval(cur->bc_tp, bp);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4b62acfe99e158fb7812982d1cf90a075710a92c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 2 Nov 2012 11:38:42 +1100
Subject: xfs: don't vmap inode cluster buffers during free

Inode buffers do not need to be mapped as inodes are read or written
directly from/to the pages underlying the buffer. This fixes a
regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the
default behaviour").

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2778258fcfa2..1938b41ee9f5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1509,7 +1509,8 @@ xfs_ifree_cluster(
 		 * to mark all the active inodes on the buffer stale.
 		 */
 		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
-					mp->m_bsize * blks_per_cluster, 0);
+					mp->m_bsize * blks_per_cluster,
+					XBF_UNMAPPED);
 
 		if (!bp)
 			return ENOMEM;
-- 
cgit v1.2.3


From 03b1293edad462ad1ad62bcc5160c76758e450d5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Fri, 2 Nov 2012 14:23:12 +1100
Subject: xfs: fix buffer shudown reference count mismatch

When we shut down the filesystem, we have to unpin and free all the
buffers currently active in the CIL. To do this we unpin and remove
them in one operation as a result of a failed iclogbuf write. For
buffers, we do this removal via a simultated IO completion of after
marking the buffer stale.

At the time we do this, we have two references to the buffer - the
active LRU reference and the buf log item.  The LRU reference is
removed by marking the buffer stale, and the active CIL reference is
by the xfs_buf_iodone() callback that is run by
xfs_buf_do_callbacks() during ioend processing (via the bp->b_iodone
callback).

However, ioend processing requires one more reference - that of the
IO that it is completing. We don't have this reference, so we free
the buffer prematurely and use it after it is freed. For buffers
marked with XBF_ASYNC, this leads to assert failures in
xfs_buf_rele() on debug kernels because the b_hold count is zero.

Fix this by making sure we take the necessary IO reference before
starting IO completion processing on the stale buffer, and set the
XBF_ASYNC flag to ensure that IO completion processing removes all
the active references from the buffer to ensure it is fully torn
down.

Cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_buf_item.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a8d0ed911196..becf4a97efc6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -526,7 +526,25 @@ xfs_buf_item_unpin(
 		}
 		xfs_buf_relse(bp);
 	} else if (freed && remove) {
+		/*
+		 * There are currently two references to the buffer - the active
+		 * LRU reference and the buf log item. What we are about to do
+		 * here - simulate a failed IO completion - requires 3
+		 * references.
+		 *
+		 * The LRU reference is removed by the xfs_buf_stale() call. The
+		 * buf item reference is removed by the xfs_buf_iodone()
+		 * callback that is run by xfs_buf_do_callbacks() during ioend
+		 * processing (via the bp->b_iodone callback), and then finally
+		 * the ioend processing will drop the IO reference if the buffer
+		 * is marked XBF_ASYNC.
+		 *
+		 * Hence we need to take an additional reference here so that IO
+		 * completion processing doesn't free the buffer prematurely.
+		 */
 		xfs_buf_lock(bp);
+		xfs_buf_hold(bp);
+		bp->b_flags |= XBF_ASYNC;
 		xfs_buf_ioerror(bp, EIO);
 		XFS_BUF_UNDONE(bp);
 		xfs_buf_stale(bp);
-- 
cgit v1.2.3


From 6ce377afd1755eae5c93410ca9a1121dfead7b87 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 2 Nov 2012 11:38:44 +1100
Subject: xfs: fix reading of wrapped log data

Commit 4439647 ("xfs: reset buffer pointers before freeing them") in
3.0-rc1 introduced a regression when recovering log buffers that
wrapped around the end of log. The second part of the log buffer at
the start of the physical log was being read into the header buffer
rather than the data buffer, and hence recovery was seeing garbage
in the data buffer when it got to the region of the log buffer that
was incorrectly read.

Cc: <stable@vger.kernel.org> # 3.0.x, 3.2.x, 3.4.x 3.6.x
Reported-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5da3ace352bf..d308749fabf1 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3541,7 +3541,7 @@ xlog_do_recovery_pass(
 				 *   - order is important.
 				 */
 				error = xlog_bread_offset(log, 0,
-						bblks - split_bblks, hbp,
+						bblks - split_bblks, dbp,
 						offset + BBTOB(split_bblks));
 				if (error)
 					goto bread_err2;
-- 
cgit v1.2.3


From a80a6b85b428e6ce12a8363bb1f08d44c50f3252 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 8 Nov 2012 15:53:35 -0800
Subject: revert "epoll: support for disabling items, and a self-test app"

Revert commit 03a7beb55b9f ("epoll: support for disabling items, and a
self-test app") pending resolution of the issues identified by Michael
Kerrisk, copied below.

We'll revisit this for 3.8.

: I've taken a look at this patch as it currently stands in 3.7-rc1, and
: done a bit of testing. (By the way, the test program
: tools/testing/selftests/epoll/test_epoll.c does not compile...)
:
: There are one or two places where the behavior seems a little strange,
: so I have a question or two at the end of this mail. But other than
: that, I want to check my understanding so that the interface can be
: correctly documented.
:
: Just to go though my understanding, the problem is the following
: scenario in a multithreaded application:
:
: 1. Multiple threads are performing epoll_wait() operations,
:    and maintaining a user-space cache that contains information
:    corresponding to each file descriptor being monitored by
:    epoll_wait().
:
: 2. At some point, a thread wants to delete (EPOLL_CTL_DEL)
:    a file descriptor from the epoll interest list, and
:    delete the corresponding record from the user-space cache.
:
: 3. The problem with (2) is that some other thread may have
:    previously done an epoll_wait() that retrieved information
:    about the fd in question, and may be in the middle of using
:    information in the cache that relates to that fd. Thus,
:    there is a potential race.
:
: 4. The race can't solved purely in user space, because doing
:    so would require applying a mutex across the epoll_wait()
:    call, which would of course blow thread concurrency.
:
: Right?
:
: Your solution is the EPOLL_CTL_DISABLE operation. I want to
: confirm my understanding about how to use this flag, since
: the description that has accompanied the patches so far
: has been a bit sparse
:
: 0. In the scenario you're concerned about, deleting a file
:    descriptor means (safely) doing the following:
:    (a) Deleting the file descriptor from the epoll interest list
:        using EPOLL_CTL_DEL
:    (b) Deleting the corresponding record in the user-space cache
:
: 1. It's only meaningful to use this EPOLL_CTL_DISABLE in
:    conjunction with EPOLLONESHOT.
:
: 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in
:    conjunction is a logical error.
:
: 3. The correct way to code multithreaded applications using
:    EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows:
:
:    a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should
:       should EPOLLONESHOT.
:
:    b. When a thread wants to delete a file descriptor, it
:       should do the following:
:
:       [1] Call epoll_ctl(EPOLL_CTL_DISABLE)
:       [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE)
:           was zero, then the file descriptor can be safely
:           deleted by the thread that made this call.
:       [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY,
:           then the descriptor is in use. In this case, the calling
:           thread should set a flag in the user-space cache to
:           indicate that the thread that is using the descriptor
:           should perform the deletion operation.
:
: Is all of the above correct?
:
: The implementation depends on checking on whether
: (events & ~EP_PRIVATE_BITS) == 0
: This replies on the fact that EPOLL_CTL_AD and EPOLL_CTL_MOD always
: set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT
: causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be
: cleared.
:
: A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE
: is only useful in conjunction with EPOLLONESHOT. However, as things
: stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does
: not have EPOLLONESHOT set in 'events' This results in the following
: (slightly surprising) behavior:
:
: (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0
:     (the indicator that the file descriptor can be safely deleted).
: (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY.
:
: This doesn't seem particularly useful, and in fact is probably an
: indication that the user made a logic error: they should only be using
: epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which
: EPOLLONESHOT was set in 'events'. If that is correct, then would it
: not make sense to return an error to user space for this case?

Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: "Paton J. Lewis" <palewis@adobe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 38 +++-----------------------------------
 1 file changed, 3 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index da72250ddc1c..cd96649bfe62 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-	return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
+	return op != EPOLL_CTL_DEL;
 }
 
 /* Initialize the poll safe wake up structure */
@@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	return 0;
 }
 
-/*
- * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
- * had no event flags set, indicating that another thread may be currently
- * handling that item's events (in the case that EPOLLONESHOT was being
- * used). Otherwise a zero result indicates that the item has been disabled
- * from receiving events. A disabled item may be re-enabled via
- * EPOLL_CTL_MOD. Must be called with "mtx" held.
- */
-static int ep_disable(struct eventpoll *ep, struct epitem *epi)
-{
-	int result = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ep->lock, flags);
-	if (epi->event.events & ~EP_PRIVATE_BITS) {
-		if (ep_is_linked(&epi->rdllink))
-			list_del_init(&epi->rdllink);
-		/* Ensure ep_poll_callback will not add epi back onto ready
-		   list: */
-		epi->event.events &= EP_PRIVATE_BITS;
-		}
-	else
-		result = -EBUSY;
-	spin_unlock_irqrestore(&ep->lock, flags);
-
-	return result;
-}
-
 static void ep_free(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
@@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 	rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
+
+
 #define PATH_ARR_SIZE 5
 /*
  * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		} else
 			error = -ENOENT;
 		break;
-	case EPOLL_CTL_DISABLE:
-		if (epi)
-			error = ep_disable(ep, epi);
-		else
-			error = -ENOENT;
-		break;
 	}
 	mutex_unlock(&ep->mtx);
 
-- 
cgit v1.2.3


From 848561d368751a1c0f679b9f045a02944506a801 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 8 Nov 2012 15:53:37 -0800
Subject: fanotify: fix missing break

Anders Blomdell noted in 2010 that Fanotify lost events and provided a
test case.  Eric Paris confirmed it was a bug and posted a fix to the
list

  https://groups.google.com/forum/?fromgroups=#!topic/linux.kernel/RrJfTfyW2BE

but never applied it.  Repeated attempts over time to actually get him
to apply it have never had a reply from anyone who has raised it

So apply it anyway

Signed-off-by: Alan Cox <alan@linux.intel.com>
Reported-by: Anders Blomdell <anders.blomdell@control.lth.se>
Cc: Eric Paris <eparis@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fanotify/fanotify.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f35794b97e8e..a50636025364 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 			if ((old->path.mnt == new->path.mnt) &&
 			    (old->path.dentry == new->path.dentry))
 				return true;
+			break;
 		case (FSNOTIFY_EVENT_NONE):
 			return true;
 		default:
-- 
cgit v1.2.3


From 5ffd3412ae5536a4c57469cb8ea31887121dcb2e Mon Sep 17 00:00:00 2001
From: Thomas Betker <thomas.betker@freenet.de>
Date: Wed, 17 Oct 2012 22:59:30 +0200
Subject: jffs2: Fix lock acquisition order bug in jffs2_write_begin

jffs2_write_begin() first acquires the page lock, then f->sem. This
causes an AB-BA deadlock with jffs2_garbage_collect_live(), which first
acquires f->sem, then the page lock:

jffs2_garbage_collect_live
    mutex_lock(&f->sem)                         (A)
    jffs2_garbage_collect_dnode
        jffs2_gc_fetch_page
            read_cache_page_async
                do_read_cache_page
                    lock_page(page)             (B)

jffs2_write_begin
    grab_cache_page_write_begin
        find_lock_page
            lock_page(page)                     (B)
    mutex_lock(&f->sem)                         (A)

We fix this by restructuring jffs2_write_begin() to take f->sem before
the page lock. However, we make sure that f->sem is not held when
calling jffs2_reserve_space(), as this is not permitted by the locking
rules.

The deadlock above was observed multiple times on an SoC with a dual
ARMv7 (Cortex-A9), running the long-term 3.4.11 kernel; it occurred
when using scp to copy files from a host system to the ARM target
system. The fix was heavily tested on the same target system.

Cc: stable@vger.kernel.org
Signed-off-by: Thomas Betker <thomas.betker@rohde-schwarz.com>
Acked-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
---
 fs/jffs2/file.c | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 60ef3fb707ff..1506673c087e 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -138,33 +138,39 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	struct page *pg;
 	struct inode *inode = mapping->host;
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_raw_inode ri;
+	uint32_t alloc_len = 0;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	uint32_t pageofs = index << PAGE_CACHE_SHIFT;
 	int ret = 0;
 
+	jffs2_dbg(1, "%s()\n", __func__);
+
+	if (pageofs > inode->i_size) {
+		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+		if (ret)
+			return ret;
+	}
+
+	mutex_lock(&f->sem);
 	pg = grab_cache_page_write_begin(mapping, index, flags);
-	if (!pg)
+	if (!pg) {
+		if (alloc_len)
+			jffs2_complete_reservation(c);
+		mutex_unlock(&f->sem);
 		return -ENOMEM;
+	}
 	*pagep = pg;
 
-	jffs2_dbg(1, "%s()\n", __func__);
-
-	if (pageofs > inode->i_size) {
+	if (alloc_len) {
 		/* Make new hole frag from old EOF to new page */
-		struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
-		struct jffs2_raw_inode ri;
 		struct jffs2_full_dnode *fn;
-		uint32_t alloc_len;
 
 		jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
 			  (unsigned int)inode->i_size, pageofs);
 
-		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
-					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
-		if (ret)
-			goto out_page;
-
-		mutex_lock(&f->sem);
 		memset(&ri, 0, sizeof(ri));
 
 		ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -191,7 +197,6 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 		if (IS_ERR(fn)) {
 			ret = PTR_ERR(fn);
 			jffs2_complete_reservation(c);
-			mutex_unlock(&f->sem);
 			goto out_page;
 		}
 		ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -206,12 +211,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 			jffs2_mark_node_obsolete(c, fn->raw);
 			jffs2_free_full_dnode(fn);
 			jffs2_complete_reservation(c);
-			mutex_unlock(&f->sem);
 			goto out_page;
 		}
 		jffs2_complete_reservation(c);
 		inode->i_size = pageofs;
-		mutex_unlock(&f->sem);
 	}
 
 	/*
@@ -220,18 +223,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	 * case of a short-copy.
 	 */
 	if (!PageUptodate(pg)) {
-		mutex_lock(&f->sem);
 		ret = jffs2_do_readpage_nolock(inode, pg);
-		mutex_unlock(&f->sem);
 		if (ret)
 			goto out_page;
 	}
+	mutex_unlock(&f->sem);
 	jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
 	return ret;
 
 out_page:
 	unlock_page(pg);
 	page_cache_release(pg);
+	mutex_unlock(&f->sem);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 5a8477660d9ddc090203736d7271137265cb25bb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 12 Nov 2012 01:19:02 -0500
Subject: kill bogus BUG_ON() in do_close_on_exec()

It can be legitimately triggered via procfs access.  Now, at least
2 of 3 of get_files_struct() callers in procfs are useless, but
when and if we get rid of those we can always add WARN_ON() here.
BUG_ON() at that spot is simply wrong.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index d3b5fa80b71b..331e7d24d9d3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -685,7 +685,6 @@ void do_close_on_exec(struct files_struct *files)
 	struct fdtable *fdt;
 
 	/* exec unshares first */
-	BUG_ON(atomic_read(&files->count) != 1);
 	spin_lock(&files->file_lock);
 	for (i = 0; ; i++) {
 		unsigned long set;
-- 
cgit v1.2.3


From 70a6f46d7b0ec03653b9ab3f8063a9717a4a53ef Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 14 Nov 2012 11:49:53 +0000
Subject: pstore: Fix NULL pointer dereference in console writes

Passing a NULL id causes a NULL pointer deference in writers such as
erst_writer and efi_pstore_write because they expect to update this id.
Pass a dummy id instead.

This avoids a cascade of oopses caused when the initial
pstore_console_write passes a null which in turn causes writes to the
console causing further oopses in subsequent pstore_console_write calls.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: stable@vger.kernel.org
Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
---
 fs/pstore/platform.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index a40da07e93d6..947fbe06c3b1 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -161,6 +161,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 
 	while (s < e) {
 		unsigned long flags;
+		u64 id;
 
 		if (c > psinfo->bufsize)
 			c = psinfo->bufsize;
@@ -172,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 			spin_lock_irqsave(&psinfo->buf_lock, flags);
 		}
 		memcpy(psinfo->buf, s, c);
-		psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo);
+		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, c, psinfo);
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 		s += c;
 		c = e - s;
-- 
cgit v1.2.3


From fa0cbbf145aabbf29c6f28f8a11935c0b0fd86fc Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 12 Nov 2012 17:53:04 -0800
Subject: mm, oom: reintroduce /proc/pid/oom_adj

This is mostly a revert of 01dc52ebdf47 ("oom: remove deprecated oom_adj")
from Davidlohr Bueso.

It reintroduces /proc/pid/oom_adj for backwards compatibility with earlier
kernels.  It simply scales the value linearly when /proc/pid/oom_score_adj
is written.

The major difference is that its scheduled removal is no longer included
in Documentation/feature-removal-schedule.txt.  We do warn users with a
single printk, though, to suggest the more powerful and supported
/proc/pid/oom_score_adj interface.

Reported-by: Artem S. Tashkinov <t.artem@lycos.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 144a96732dd7..3c231adf8450 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,6 +873,113 @@ static const struct file_operations proc_environ_operations = {
 	.release	= mem_release,
 };
 
+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_adj = OOM_ADJUST_MIN;
+	size_t len;
+	unsigned long flags;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+			oom_adj = OOM_ADJUST_MAX;
+		else
+			oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+				  OOM_SCORE_ADJ_MAX;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	int oom_adj;
+	unsigned long flags;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+	if (err)
+		goto out;
+	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+	     oom_adj != OOM_DISABLE) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
+	if (!lock_task_sighand(task, &flags)) {
+		err = -ESRCH;
+		goto err_task_lock;
+	}
+
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (oom_adj == OOM_ADJUST_MAX)
+		oom_adj = OOM_SCORE_ADJ_MAX;
+	else
+		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+	if (oom_adj < task->signal->oom_score_adj &&
+	    !capable(CAP_SYS_RESOURCE)) {
+		err = -EACCES;
+		goto err_sighand;
+	}
+
+	/*
+	 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+	 * /proc/pid/oom_score_adj instead.
+	 */
+	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+		  current->comm, task_pid_nr(current), task_pid_nr(task),
+		  task_pid_nr(task));
+
+	task->signal->oom_score_adj = oom_adj;
+	trace_oom_score_adj_update(task);
+err_sighand:
+	unlock_task_sighand(task, &flags);
+err_task_lock:
+	task_unlock(task);
+	put_task_struct(task);
+out:
+	return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_adj_operations = {
+	.read		= oom_adj_read,
+	.write		= oom_adj_write,
+	.llseek		= generic_file_llseek,
+};
+
 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 					size_t count, loff_t *ppos)
 {
@@ -2598,6 +2705,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
 	INF("oom_score",  S_IRUGO, proc_oom_score),
+	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2964,6 +3072,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
+	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
-- 
cgit v1.2.3


From 42e2976f131d65555d5c1d6c3d47facc63577814 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:09:44 +1100
Subject: xfs: fix attr tree double split corruption

In certain circumstances, a double split of an attribute tree is
needed to insert or replace an attribute. In rare situations, this
can go wrong, leaving the attribute tree corrupted. In this case,
the attr being replaced is the last attr in a leaf node, and the
replacement is larger so doesn't fit in the same leaf node.
When we have the initial condition of a node format attribute
btree with two leaves at index 1 and 2. Call them L1 and L2.  The
leaf L1 is completely full, there is not a single byte of free space
in it. L2 is mostly empty.  The attribute being replaced - call it X
- is the last attribute in L1.

The way an attribute replace is executed is that the replacement
attribute - call it Y - is first inserted into the tree, but has an
INCOMPLETE flag set on it so that list traversals ignore it. Once
this transaction is committed, a second transaction it run to
atomically mark Y as COMPLETE and X as INCOMPLETE, so that a
traversal will now find Y and skip X. Once that transaction is
committed, attribute X is then removed.

So, the initial condition is:

     +--------+     +--------+
     |   L1   |     |   L2   |
     | fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |
     | fsp: 0 |     | fsp: N |
     |--------|     |--------|
     | attr A |     | attr 1 |
     |--------|     |--------|
     | attr B |     | attr 2 |
     |--------|     |--------|
     ..........     ..........
     |--------|     |--------|
     | attr X |     | attr n |
     +--------+     +--------+

So now we go to replace X, and see that L1:fsp = 0 - it is full so
we can't insert Y in the same leaf. So we record the the location of
attribute X so we can track it for later use, then we split L1 into
L1 and L3 and reblance across the two leafs. We end with:

     +--------+     +--------+     +--------+
     |   L1   |     |   L3   |     |   L2   |
     | fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|
     | attr A |     | attr X |     | attr 1 |
     |--------|     +--------+     |--------|
     | attr B |                    | attr 2 |
     |--------|                    |--------|
     ..........                    ..........
     |--------|                    |--------|
     | attr W |                    | attr n |
     +--------+                    +--------+

And we track that the original attribute is now at L3:0.

We then try to insert Y into L1 again, and find that there isn't
enough room because the new attribute is larger than the old one.
Hence we have to split again to make room for Y. We end up with
this:

     +--------+     +--------+     +--------+     +--------+
     |   L1   |     |   L4   |     |   L3   |     |   L2   |
     | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|     |--------|
     | attr A |     | attr Y |     | attr X |     | attr 1 |
     |--------|     + INCOMP +     +--------+     |--------|
     | attr B |     +--------+                    | attr 2 |
     |--------|                                   |--------|
     ..........                                   ..........
     |--------|                                   |--------|
     | attr W |                                   | attr n |
     +--------+                                   +--------+

And now we have the new (incomplete) attribute @ L4:0, and the
original attribute at L3:0. At this point, the first transaction is
committed, and we move to the flipping of the flags.

This is where we are supposed to end up with this:

     +--------+     +--------+     +--------+     +--------+
     |   L1   |     |   L4   |     |   L3   |     |   L2   |
     | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|     |--------|
     | attr A |     | attr Y |     | attr X |     | attr 1 |
     |--------|     +--------+     + INCOMP +     |--------|
     | attr B |                    +--------+     | attr 2 |
     |--------|                                   |--------|
     ..........                                   ..........
     |--------|                                   |--------|
     | attr W |                                   | attr n |
     +--------+                                   +--------+

But that doesn't happen properly - the attribute tracking indexes
are not pointing to the right locations. What we end up with is both
the old attribute to be removed pointing at L4:0 and the new
attribute at L4:1.  On a debug kernel, this assert fails like so:

XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725

because the new attribute location does not exist. On a production
kernel, this goes unnoticed and the code proceeds ahead merrily and
removes L4 because it thinks that is the block that is no longer
needed. This leaves the hash index node pointing to entries
L1, L4 and L2, but only blocks L1, L3 and L2 to exist. Further, the
leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free
space, and so everything is busted. This corruption is caused by the
removal of the old attribute triggering a join - it joins everything
correctly but then frees the wrong block.

xfs_repair will report something like:

bad sibling back pointer for block 4 in attribute fork for inode 131
problem with attribute contents in inode 131
would clear attr fork
bad nblocks 8 for inode 131, would reset to 3
bad anextents 4 for inode 131, would reset to 0

The problem lies in the assignment of the old/new blocks for
tracking purposes when the double leaf split occurs. The first split
tries to place the new attribute inside the current leaf (i.e.
"inleaf == true") and moves the old attribute (X) to the new block.
This sets up the old block/index to L1:X, and newly allocated
block to L3:0. It then moves attr X to the new block and tries to
insert attr Y at the old index. That fails, so it splits again.

With the second split, the rebalance ends up placing the new attr in
the second new block - L4:0 - and this is where the code goes wrong.
What is does is it sets both the new and old block index to the
second new block. Hence it inserts attr Y at the right place (L4:0)
but overwrites the current location of the attr to replace that is
held in the new block index (currently L3:0). It over writes it with
L4:1 - the index we later assert fail on.

Hopefully this table will show this in a foramt that is a bit easier
to understand:

Split		old attr index		new attr index
		vanilla	patched		vanilla	patched
before 1st	L1:26	L1:26		N/A	N/A
after 1st	L3:0	L3:0		L1:26	L1:26
after 2nd	L4:0	L3:0		L4:1	L4:0
                ^^^^			^^^^
		wrong			wrong

The fix is surprisingly simple, for all this analysis - just stop
the rebalance on the out-of leaf case from overwriting the new attr
index - it's already correct for the double split case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr_leaf.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d330111ca738..70eec1829776 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1291,6 +1291,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	leaf2 = blk2->bp->b_addr;
 	ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+	ASSERT(leaf2->hdr.count == 0);
 	args = state->args;
 
 	trace_xfs_attr_leaf_rebalance(args);
@@ -1361,6 +1362,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		 * I assert that since all callers pass in an empty
 		 * second buffer, this code should never execute.
 		 */
+		ASSERT(0);
 
 		/*
 		 * Figure the total bytes to be added to the destination leaf.
@@ -1422,10 +1424,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 			args->index2 = 0;
 			args->blkno2 = blk2->blkno;
 		} else {
+			/*
+			 * On a double leaf split, the original attr location
+			 * is already stored in blkno2/index2, so don't
+			 * overwrite it overwise we corrupt the tree.
+			 */
 			blk2->index = blk1->index
 				    - be16_to_cpu(leaf1->hdr.count);
-			args->index = args->index2 = blk2->index;
-			args->blkno = args->blkno2 = blk2->blkno;
+			args->index = blk2->index;
+			args->blkno = blk2->blkno;
+			if (!state->extravalid) {
+				/*
+				 * set the new attr location to match the old
+				 * one and let the higher level split code
+				 * decide where in the leaf to place it.
+				 */
+				args->index2 = blk2->index;
+				args->blkno2 = blk2->blkno;
+			}
 		}
 	} else {
 		ASSERT(state->inleaf == 1);
-- 
cgit v1.2.3


From 3daed8bc3e49b9695ae931b9f472b5b90d1965b3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:09:45 +1100
Subject: xfs: fix broken error handling in xfs_vm_writepage

When we shut down the filesystem, it might first be detected in
writeback when we are allocating a inode size transaction. This
happens after we have moved all the pages into the writeback state
and unlocked them. Unfortunately, if we fail to set up the
transaction we then abort writeback and try to invalidate the
current page. This then triggers are BUG() in block_invalidatepage()
because we are trying to invalidate an unlocked page.

Fixing this is a bit of a chicken and egg problem - we can't
allocate the transaction until we've clustered all the pages into
the IO and we know the size of it (i.e. whether the last block of
the IO is beyond the current EOF or not). However, we don't want to
hold pages locked for long periods of time, especially while we lock
other pages to cluster them into the write.

To fix this, we need to make a clear delineation in writeback where
errors can only be handled by IO completion processing. That is,
once we have marked a page for writeback and unlocked it, we have to
report errors via IO completion because we've already started the
IO. We may not have submitted any IO, but we've changed the page
state to indicate that it is under IO so we must now use the IO
completion path to report errors.

To do this, add an error field to xfs_submit_ioend() to pass it the
error that occurred during the building on the ioend chain. When
this is non-zero, mark each ioend with the error and call
xfs_finish_ioend() directly rather than building bios. This will
immediately push the ioends through completion processing with the
error that has occurred.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_aops.c | 54 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e562dd43f41f..e57e2daa357c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -481,11 +481,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
  *
  * The fix is two passes across the ioend list - one to start writeback on the
  * buffer_heads, and then submit them for I/O on the second pass.
+ *
+ * If @fail is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked paged for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
  */
 STATIC void
 xfs_submit_ioend(
 	struct writeback_control *wbc,
-	xfs_ioend_t		*ioend)
+	xfs_ioend_t		*ioend,
+	int			fail)
 {
 	xfs_ioend_t		*head = ioend;
 	xfs_ioend_t		*next;
@@ -506,6 +512,18 @@ xfs_submit_ioend(
 		next = ioend->io_list;
 		bio = NULL;
 
+		/*
+		 * If we are failing the IO now, just mark the ioend with an
+		 * error and finish it. This will run IO completion immediately
+		 * as there is only one reference to the ioend at this point in
+		 * time.
+		 */
+		if (fail) {
+			ioend->io_error = -fail;
+			xfs_finish_ioend(ioend);
+			continue;
+		}
+
 		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 
 			if (!bio) {
@@ -1060,7 +1078,18 @@ xfs_vm_writepage(
 
 	xfs_start_page_writeback(page, 1, count);
 
-	if (ioend && imap_valid) {
+	/* if there is no IO to be submitted for this page, we are done */
+	if (!ioend)
+		return 0;
+
+	ASSERT(iohead);
+
+	/*
+	 * Any errors from this point onwards need tobe reported through the IO
+	 * completion path as we have marked the initial page as under writeback
+	 * and unlocked it.
+	 */
+	if (imap_valid) {
 		xfs_off_t		end_index;
 
 		end_index = imap.br_startoff + imap.br_blockcount;
@@ -1079,20 +1108,15 @@ xfs_vm_writepage(
 				  wbc, end_index);
 	}
 
-	if (iohead) {
-		/*
-		 * Reserve log space if we might write beyond the on-disk
-		 * inode size.
-		 */
-		if (ioend->io_type != XFS_IO_UNWRITTEN &&
-		    xfs_ioend_is_append(ioend)) {
-			err = xfs_setfilesize_trans_alloc(ioend);
-			if (err)
-				goto error;
-		}
 
-		xfs_submit_ioend(wbc, iohead);
-	}
+	/*
+	 * Reserve log space if we might write beyond the on-disk inode size.
+	 */
+	err = 0;
+	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		err = xfs_setfilesize_trans_alloc(ioend);
+
+	xfs_submit_ioend(wbc, iohead, err);
 
 	return 0;
 
-- 
cgit v1.2.3


From d69043c42d8c6414fa28ad18d99973aa6c1c2e24 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:09:46 +1100
Subject: xfs: drop buffer io reference when a bad bio is built

Error handling in xfs_buf_ioapply_map() does not handle IO reference
counts correctly. We increment the b_io_remaining count before
building the bio, but then fail to decrement it in the failure case.
This leads to the buffer never running IO completion and releasing
the reference that the IO holds, so at unmount we can leak the
buffer. This leak is captured by this assert failure during unmount:

XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/xfs_mount.c, line: 273

This is not a new bug - the b_io_remaining accounting has had this
problem for a long, long time - it's just very hard to get a
zero length bio being built by this code...

Further, the buffer IO error can be overwritten on a multi-segment
buffer by subsequent bio completions for partial sections of the
buffer. Hence we should only set the buffer error status if the
buffer is not already carrying an error status. This ensures that a
partial IO error on a multi-segment buffer will not be lost. This
part of the problem is a regression, however.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_buf.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 933b7930b863..4b0b8dd1b7b0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1197,9 +1197,14 @@ xfs_buf_bio_end_io(
 {
 	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
 
-	xfs_buf_ioerror(bp, -error);
+	/*
+	 * don't overwrite existing errors - otherwise we can lose errors on
+	 * buffers that require multiple bios to complete.
+	 */
+	if (!bp->b_error)
+		xfs_buf_ioerror(bp, -error);
 
-	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
 	_xfs_buf_ioend(bp, 1);
@@ -1279,6 +1284,11 @@ next_chunk:
 		if (size)
 			goto next_chunk;
 	} else {
+		/*
+		 * This is guaranteed not to be the last io reference count
+		 * because the caller (xfs_buf_iorequest) holds a count itself.
+		 */
+		atomic_dec(&bp->b_io_remaining);
 		xfs_buf_ioerror(bp, EIO);
 		bio_put(bio);
 	}
-- 
cgit v1.2.3


From 3587b1b097d70c2eb9fee95ea7995d13c05f66e5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 18 Nov 2012 19:19:00 +0000
Subject: fanotify: fix FAN_Q_OVERFLOW case of fanotify_read()

If the FAN_Q_OVERFLOW bit set in event->mask, the fanotify event
metadata will not contain a valid file descriptor, but
copy_event_to_user() didn't check for that, and unconditionally does a
fd_install() on the file descriptor.

Which in turn will cause a BUG_ON() in __fd_install().

Introduced by commit 352e3b249284 ("fanotify: sanitize failure exits in
copy_event_to_user()")

Mea culpa - missed that path ;-/

Reported-by: Alex Shi <lkml.alex@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fanotify/fanotify_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 721d692fa8d4..6fcaeb8c902e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -258,7 +258,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (ret)
 		goto out_close_fd;
 
-	fd_install(fd, f);
+	if (fd != FAN_NOFD)
+		fd_install(fd, f);
 	return fanotify_event_metadata.event_len;
 
 out_close_fd:
-- 
cgit v1.2.3


From 3bb3e1fc47aca554e7e2cc4deeddc24750987ac2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 13 Nov 2012 14:55:52 +0100
Subject: reiserfs: Fix lock ordering during remount

When remounting reiserfs dquot_suspend() or dquot_resume() can be called.
These functions take dqonoff_mutex which ranks above write lock so we have
to drop it before calling into quota code.

CC: stable@vger.kernel.org # >= 3.0
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/reiserfs/super.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1078ae179993..5372980ec458 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1335,7 +1335,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 				kfree(qf_names[i]);
 #endif
 		err = -EINVAL;
-		goto out_err;
+		goto out_unlock;
 	}
 #ifdef CONFIG_QUOTA
 	handle_quota_files(s, qf_names, &qfmt);
@@ -1379,7 +1379,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	if (blocks) {
 		err = reiserfs_resize(s, blocks);
 		if (err != 0)
-			goto out_err;
+			goto out_unlock;
 	}
 
 	if (*mount_flags & MS_RDONLY) {
@@ -1389,9 +1389,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 			/* it is read-only already */
 			goto out_ok;
 
+		/*
+		 * Drop write lock. Quota will retake it when needed and lock
+		 * ordering requires calling dquot_suspend() without it.
+		 */
+		reiserfs_write_unlock(s);
 		err = dquot_suspend(s, -1);
 		if (err < 0)
 			goto out_err;
+		reiserfs_write_lock(s);
 
 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
@@ -1401,7 +1407,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 		err = journal_begin(&th, s, 10);
 		if (err)
-			goto out_err;
+			goto out_unlock;
 
 		/* Mounting a rw partition read-only. */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1416,7 +1422,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 		if (reiserfs_is_journal_aborted(journal)) {
 			err = journal->j_errno;
-			goto out_err;
+			goto out_unlock;
 		}
 
 		handle_data_mode(s, mount_options);
@@ -1425,7 +1431,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		s->s_flags &= ~MS_RDONLY;	/* now it is safe to call journal_begin */
 		err = journal_begin(&th, s, 10);
 		if (err)
-			goto out_err;
+			goto out_unlock;
 
 		/* Mount a partition which is read-only, read-write */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1442,10 +1448,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	SB_JOURNAL(s)->j_must_wait = 1;
 	err = journal_end(&th, s, 10);
 	if (err)
-		goto out_err;
+		goto out_unlock;
 
 	if (!(*mount_flags & MS_RDONLY)) {
+		/*
+		 * Drop write lock. Quota will retake it when needed and lock
+		 * ordering requires calling dquot_resume() without it.
+		 */
+		reiserfs_write_unlock(s);
 		dquot_resume(s, -1);
+		reiserfs_write_lock(s);
 		finish_unfinished(s);
 		reiserfs_xattr_init(s, *mount_flags);
 	}
@@ -1455,9 +1467,10 @@ out_ok:
 	reiserfs_write_unlock(s);
 	return 0;
 
+out_unlock:
+	reiserfs_write_unlock(s);
 out_err:
 	kfree(new_opts);
-	reiserfs_write_unlock(s);
 	return err;
 }
 
-- 
cgit v1.2.3


From b9e06ef2e8706fe669b51f4364e3aeed58639eb2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 13 Nov 2012 16:34:17 +0100
Subject: reiserfs: Protect reiserfs_quota_on() with write lock

In reiserfs_quota_on() we do quite some work - for example unpacking
tail of a quota file. Thus we have to hold write lock until a moment
we call back into the quota code.

CC: stable@vger.kernel.org # >= 3.0
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/reiserfs/super.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 5372980ec458..e59d6ddcc69f 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2216,8 +2216,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	struct reiserfs_transaction_handle th;
 	int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
 
-	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt)))
-		return -EINVAL;
+	reiserfs_write_lock(sb);
+	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	/* Quotafile not on the same filesystem? */
 	if (path->dentry->d_sb != sb) {
@@ -2259,8 +2262,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = dquot_quota_on(sb, type, format_id, path);
+	reiserfs_write_unlock(sb);
+	return dquot_quota_on(sb, type, format_id, path);
 out:
+	reiserfs_write_unlock(sb);
 	return err;
 }
 
-- 
cgit v1.2.3


From 361d94a338a3fd0cee6a4ea32bbc427ba228e628 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 13 Nov 2012 18:25:38 +0100
Subject: reiserfs: Protect reiserfs_quota_write() with write lock

Calls into reiserfs journalling code and reiserfs_get_block() need to
be protected with write lock. We remove write lock around calls to high
level quota code in the next patch so these paths would suddently become
unprotected.

CC: stable@vger.kernel.org # >= 3.0
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/reiserfs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e59d6ddcc69f..c101704ece48 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2338,7 +2338,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 		tocopy = sb->s_blocksize - offset < towrite ?
 		    sb->s_blocksize - offset : towrite;
 		tmp_bh.b_state = 0;
+		reiserfs_write_lock(sb);
 		err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
+		reiserfs_write_unlock(sb);
 		if (err)
 			goto out;
 		if (offset || tocopy != sb->s_blocksize)
@@ -2354,10 +2356,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 		flush_dcache_page(bh->b_page);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
+		reiserfs_write_lock(sb);
 		reiserfs_prepare_for_journal(sb, bh, 1);
 		journal_mark_dirty(current->journal_info, sb, bh);
 		if (!journal_quota)
 			reiserfs_add_ordered_list(inode, bh);
+		reiserfs_write_unlock(sb);
 		brelse(bh);
 		offset = 0;
 		towrite -= tocopy;
-- 
cgit v1.2.3


From 7af11686933726e99af22901d622f9e161404e6b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 13 Nov 2012 17:05:14 +0100
Subject: reiserfs: Move quota calls out of write lock

Calls into highlevel quota code cannot happen under the write lock. These
calls take dqio_mutex which ranks above write lock. So drop write lock
before calling back into quota code.

CC: stable@vger.kernel.org # >= 3.0
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/reiserfs/inode.c | 10 +++++++---
 fs/reiserfs/stree.c |  4 ++++
 fs/reiserfs/super.c | 18 ++++++++++++++----
 3 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f27f01a98aa2..d83736fbc26c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	BUG_ON(!th->t_trans_id);
 
-	dquot_initialize(inode);
+	reiserfs_write_unlock(inode->i_sb);
 	err = dquot_alloc_inode(inode);
+	reiserfs_write_lock(inode->i_sb);
 	if (err)
 		goto out_end_trans;
 	if (!dir->i_nlink) {
@@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
       out_end_trans:
 	journal_end(th, th->t_super, th->t_blocks_allocated);
+	reiserfs_write_unlock(inode->i_sb);
 	/* Drop can be outside and it needs more credits so it's better to have it outside */
 	dquot_drop(inode);
+	reiserfs_write_lock(inode->i_sb);
 	inode->i_flags |= S_NOQUOTA;
 	make_bad_inode(inode);
 
@@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	/* must be turned off for recursive notify_change calls */
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
-	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
-
+	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* version 2 items will be caught by the s_maxbytes check
 		 ** done for us in vmtruncate
@@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 		error = journal_begin(&th, inode->i_sb, jbegin_count);
 		if (error)
 			goto out;
+		reiserfs_write_unlock_once(inode->i_sb, depth);
 		error = dquot_transfer(inode, attr);
+		depth = reiserfs_write_lock_once(inode->i_sb);
 		if (error) {
 			journal_end(&th, inode->i_sb, jbegin_count);
 			goto out;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index f8afa4b162b8..2f40a4c70a4d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 		       key2type(&(key->on_disk_key)));
 #endif
 
+	reiserfs_write_unlock(inode->i_sb);
 	retval = dquot_alloc_space_nodirty(inode, pasted_size);
+	reiserfs_write_lock(inode->i_sb);
 	if (retval) {
 		pathrelse(search_path);
 		return retval;
@@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
 			       "reiserquota insert_item(): allocating %u id=%u type=%c",
 			       quota_bytes, inode->i_uid, head2type(ih));
 #endif
+		reiserfs_write_unlock(inode->i_sb);
 		/* We can't dirty inode here. It would be immediately written but
 		 * appropriate stat item isn't inserted yet... */
 		retval = dquot_alloc_space_nodirty(inode, quota_bytes);
+		reiserfs_write_lock(inode->i_sb);
 		if (retval) {
 			pathrelse(path);
 			return retval;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c101704ece48..418bdc3a57da 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s)
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
+		reiserfs_write_unlock(s);
 		dquot_initialize(inode);
+		reiserfs_write_lock(s);
 
 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
@@ -2108,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot)
 			  REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(dquot->dq_sb);
 	ret = dquot_commit(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(dquot->dq_sb);
 	return ret;
 }
@@ -2130,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
 			  REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(dquot->dq_sb);
 	ret = dquot_acquire(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(dquot->dq_sb);
 	return ret;
 }
@@ -2150,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot)
 	ret =
 	    journal_begin(&th, dquot->dq_sb,
 			  REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	reiserfs_write_unlock(dquot->dq_sb);
 	if (ret) {
 		/* Release dquot anyway to avoid endless cycle in dqput() */
 		dquot_release(dquot);
 		goto out;
 	}
 	ret = dquot_release(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
 	reiserfs_write_unlock(dquot->dq_sb);
+out:
 	return ret;
 }
 
@@ -2187,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type)
 	ret = journal_begin(&th, sb, 2);
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(sb);
 	ret = dquot_commit_info(sb, type);
+	reiserfs_write_lock(sb);
 	err = journal_end(&th, sb, 2);
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(sb);
 	return ret;
 }
-- 
cgit v1.2.3


From ae49eeec785025373e28dc24c8351c6bba688d99 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Thu, 11 Oct 2012 12:28:38 +0200
Subject: ext3: Avoid underflow of in ext3_trim_fs()

Currently if len argument in ext3_trim_fs() is smaller than one block,
the 'end' variable underflow. Avoid that by returning EINVAL if len is
smaller than file system block.

Also remove useless unlikely().

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/balloc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 7320a66e958f..22548f56197b 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	end = start + (range->len >> sb->s_blocksize_bits) - 1;
 	minlen = range->minlen >> sb->s_blocksize_bits;
 
-	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) ||
-	    unlikely(start >= max_blks))
+	if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
+	    start >= max_blks ||
+	    range->len < sb->s_blocksize)
 		return -EINVAL;
 	if (end >= max_blks)
 		end = max_blks - 1;
-- 
cgit v1.2.3


From 25389bb207987b5774182f763b9fb65ff08761c8 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 23 Nov 2012 14:03:04 +0100
Subject: jbd: Fix lock ordering bug in journal_unmap_buffer()

Commit 09e05d48 introduced a wait for transaction commit into
journal_unmap_buffer() in the case we are truncating a buffer undergoing commit
in the page stradding i_size on a filesystem with blocksize < pagesize. Sadly
we forgot to drop buffer lock before waiting for transaction commit and thus
deadlock is possible when kjournald wants to lock the buffer.

Fix the problem by dropping the buffer lock before waiting for transaction
commit. Since we are still holding page lock (and that is OK), buffer cannot
disappear under us.

CC: stable@vger.kernel.org # Wherever commit 09e05d48 was taken
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 78b7f84241d4..7f5120bf0ec2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1961,7 +1961,9 @@ retry:
 			spin_unlock(&journal->j_list_lock);
 			jbd_unlock_bh_state(bh);
 			spin_unlock(&journal->j_state_lock);
+			unlock_buffer(bh);
 			log_wait_commit(journal, tid);
+			lock_buffer(bh);
 			goto retry;
 		}
 		/*
-- 
cgit v1.2.3


From 05f564849d49499ced97913a0914b5950577d07d Mon Sep 17 00:00:00 2001
From: Stanislav Kinsbursky <skinsbursky@parallels.com>
Date: Mon, 26 Nov 2012 16:29:42 -0800
Subject: proc: check vma->vm_file before dereferencing

Commit 7b540d0646ce ("proc_map_files_readdir(): don't bother with
grabbing files") switched proc_map_files_readdir() to use @f_mode
directly instead of grabbing @file reference, but same time the test for
@vm_file presence was lost leading to nil dereference.  The patch brings
the test back.

The all proc_map_files feature is CONFIG_CHECKPOINT_RESTORE wrapped
(which is set to 'n' by default) so the bug doesn't affect regular
kernels.

The regression is 3.7-rc1 only as far as I can tell.

[gorcunov@openvz.org: provided changelog]
Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3c231adf8450..9e28356a959a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1877,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
 	if (!vma)
 		goto out_no_vma;
 
-	result = proc_map_files_instantiate(dir, dentry, task,
-			(void *)(unsigned long)vma->vm_file->f_mode);
+	if (vma->vm_file)
+		result = proc_map_files_instantiate(dir, dentry, task,
+				(void *)(unsigned long)vma->vm_file->f_mode);
 
 out_no_vma:
 	up_read(&mm->mmap_sem);
-- 
cgit v1.2.3


From 4eff96dd5283a102e0c1cac95247090be74a38ed Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 26 Nov 2012 16:29:51 -0800
Subject: writeback: put unused inodes to LRU after writeback completion

Commit 169ebd90131b ("writeback: Avoid iput() from flusher thread")
removed iget-iput pair from inode writeback.  As a side effect, inodes
that are dirty during iput_final() call won't be ever added to inode LRU
(iput_final() doesn't add dirty inodes to LRU and later when the inode
is cleaned there's noone to add the inode there).  Thus inodes are
effectively unreclaimable until someone looks them up again.

The practical effect of this bug is limited by the fact that inodes are
pinned by a dentry for long enough that the inode gets cleaned.  But
still the bug can have nasty consequences leading up to OOM conditions
under certain circumstances.  Following can easily reproduce the
problem:

  for (( i = 0; i < 1000; i++ )); do
    mkdir $i
    for (( j = 0; j < 1000; j++ )); do
      touch $i/$j
      echo 2 > /proc/sys/vm/drop_caches
    done
  done

then one needs to run 'sync; ls -lR' to make inodes reclaimable again.

We fix the issue by inserting unused clean inodes into the LRU after
writeback finishes in inode_sync_complete().

Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: <stable@vger.kernel.org>		[3.5+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c |  2 ++
 fs/inode.c        | 16 ++++++++++++++--
 fs/internal.h     |  1 +
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 51ea267d444c..3e3422f7f0a4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 static void inode_sync_complete(struct inode *inode)
 {
 	inode->i_state &= ~I_SYNC;
+	/* If inode is clean an unused, put it into LRU now... */
+	inode_add_lru(inode);
 	/* Waiters must see I_SYNC cleared before being woken up */
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_SYNC);
diff --git a/fs/inode.c b/fs/inode.c
index b03c71957246..64999f144153 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode)
 	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
+/*
+ * Add inode to LRU if needed (inode is unused and clean).
+ *
+ * Needs inode->i_lock held.
+ */
+void inode_add_lru(struct inode *inode)
+{
+	if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
+	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
+		inode_lru_list_add(inode);
+}
+
+
 static void inode_lru_list_del(struct inode *inode)
 {
 	spin_lock(&inode->i_sb->s_inode_lru_lock);
@@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode)
 
 	if (!drop && (sb->s_flags & MS_ACTIVE)) {
 		inode->i_state |= I_REFERENCED;
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			inode_lru_list_add(inode);
+		inode_add_lru(inode);
 		spin_unlock(&inode->i_lock);
 		return;
 	}
diff --git a/fs/internal.h b/fs/internal.h
index 916b7cbf3e3e..2f6af7f645eb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f);
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
+extern void inode_add_lru(struct inode *inode);
 
 /*
  * fs-writeback.c
-- 
cgit v1.2.3


From 3a98b8614312026d489e56c1d0e294a68e2aad77 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 26 Nov 2012 09:48:41 -0500
Subject: cifs: fix writeback race with file that is growing

Commit eddb079deb4 created a regression in the writepages codepath.
Previously, whenever it needed to check the size of the file, it did so
by consulting the inode->i_size field directly. With that patch, the
i_size was fetched once on entry into the writepages code and that value
was used henceforth.

If the file is changing size though (for instance, if someone is writing
to it or has truncated it), then that value is likely to be wrong. This
can lead to data corruption. Pages past the EOF at the time that the
writepages call was issued may be silently dropped and ignored because
cifs_writepages wrongly assumes that the file must have been truncated
in the interim.

Fix cifs_writepages to properly fetch the size from the inode->i_size
field instead to properly account for this possibility.

Original bug report is here:

    https://bugzilla.kernel.org/show_bug.cgi?id=50991

Reported-and-Tested-by: Maxim Britov <ungifted01@gmail.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/file.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index edb25b4bbb95..70b6f4c3a0c1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1794,7 +1794,6 @@ static int cifs_writepages(struct address_space *mapping,
 	struct TCP_Server_Info *server;
 	struct page *page;
 	int rc = 0;
-	loff_t isize = i_size_read(mapping->host);
 
 	/*
 	 * If wsize is smaller than the page cache size, default to writing
@@ -1899,7 +1898,7 @@ retry:
 			 */
 			set_page_writeback(page);
 
-			if (page_offset(page) >= isize) {
+			if (page_offset(page) >= i_size_read(mapping->host)) {
 				done = true;
 				unlock_page(page);
 				end_page_writeback(page);
@@ -1932,7 +1931,8 @@ retry:
 		wdata->offset = page_offset(wdata->pages[0]);
 		wdata->pagesz = PAGE_CACHE_SIZE;
 		wdata->tailsz =
-			min(isize - page_offset(wdata->pages[nr_pages - 1]),
+			min(i_size_read(mapping->host) -
+			    page_offset(wdata->pages[nr_pages - 1]),
 			    (loff_t)PAGE_CACHE_SIZE);
 		wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
 					wdata->tailsz;
-- 
cgit v1.2.3


From c772aa92b6deb2857d4b39a5cc3bd3679cc5f4a6 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastry@etersoft.ru>
Date: Wed, 28 Nov 2012 15:27:54 +0400
Subject: CIFS: Fix wrong buffer pointer usage in smb_set_file_info

Commit 6bdf6dbd662176c0da5c3ac8ed10ac94e7776c85 caused a regression
in setattr codepath that leads to files with wrong attributes.

Signed-off-by: Pavel Shilovsky <piastry@etersoft.ru>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb1ops.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 56cc4be87807..34cea2798333 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -766,7 +766,6 @@ smb_set_file_info(struct inode *inode, const char *full_path,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink = NULL;
 	struct cifs_tcon *tcon;
-	FILE_BASIC_INFO info_buf;
 
 	/* if the file is already open for write, just use that fileid */
 	open_file = find_writable_file(cinode, true);
@@ -817,7 +816,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
 	netpid = current->tgid;
 
 set_via_filehandle:
-	rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid);
+	rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid);
 	if (!rc)
 		cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
 
-- 
cgit v1.2.3


From 45bce8f3e3436bbe2e03dd2b076abdce79ffabb7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 29 Nov 2012 10:21:43 -0800
Subject: fs/buffer.c: make block-size be per-page and protected by the page
 lock

This makes the buffer size handling be a per-page thing, which allows us
to not have to worry about locking too much when changing the buffer
size.  If a page doesn't have buffers, we still need to read the block
size from the inode, but we can do that with ACCESS_ONCE(), so that even
if the size is changing, we get a consistent value.

This doesn't convert all functions - many of the buffer functions are
used purely by filesystems, which in turn results in the buffer size
being fixed at mount-time.  So they don't have the same consistency
issues that the raw device access can have.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 79 +++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index b5f044283edb..28a74ff5324b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1552,6 +1552,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
 }
 EXPORT_SYMBOL(unmap_underlying_metadata);
 
+/*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+	return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+	return page_buffers(page);
+}
+
 /*
  * NOTE! All mapped/uptodate combinations are valid:
  *
@@ -1589,19 +1611,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	sector_t block;
 	sector_t last_block;
 	struct buffer_head *bh, *head;
-	const unsigned blocksize = 1 << inode->i_blkbits;
+	unsigned int blocksize, bbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
 			WRITE_SYNC : WRITE);
 
-	BUG_ON(!PageLocked(page));
-
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, blocksize,
+	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
 
 	/*
 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
@@ -1613,9 +1629,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	 * handle that here by just cleaning them.
 	 */
 
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	head = page_buffers(page);
 	bh = head;
+	blocksize = bh->b_size;
+	bbits = block_size_bits(blocksize);
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	last_block = (i_size_read(inode) - 1) >> bbits;
 
 	/*
 	 * Get all the dirty buffers mapped to disk addresses and
@@ -1806,12 +1825,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	BUG_ON(from > to);
 
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	bbits = inode->i_blkbits;
 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
 
 	for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1881,11 +1898,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 	unsigned blocksize;
 	struct buffer_head *bh, *head;
 
-	blocksize = 1 << inode->i_blkbits;
+	bh = head = page_buffers(page);
+	blocksize = bh->b_size;
 
-	for(bh = head = page_buffers(page), block_start = 0;
-	    bh != head || !block_start;
-	    block_start=block_end, bh = bh->b_this_page) {
+	block_start = 0;
+	do {
 		block_end = block_start + blocksize;
 		if (block_end <= from || block_start >= to) {
 			if (!buffer_uptodate(bh))
@@ -1895,7 +1912,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			mark_buffer_dirty(bh);
 		}
 		clear_buffer_new(bh);
-	}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
 
 	/*
 	 * If this is a partial write which happened to make all buffers
@@ -2020,7 +2040,6 @@ EXPORT_SYMBOL(generic_write_end);
 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 					unsigned long from)
 {
-	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end, blocksize;
 	unsigned to;
 	struct buffer_head *bh, *head;
@@ -2029,13 +2048,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 	if (!page_has_buffers(page))
 		return 0;
 
-	blocksize = 1 << inode->i_blkbits;
+	head = page_buffers(page);
+	blocksize = head->b_size;
 	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
 	to = from + to;
 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
 		return 0;
 
-	head = page_buffers(page);
 	bh = head;
 	block_start = 0;
 	do {
@@ -2068,18 +2087,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 	struct inode *inode = page->mapping->host;
 	sector_t iblock, lblock;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	unsigned int blocksize;
+	unsigned int blocksize, bbits;
 	int nr, i;
 	int fully_mapped = 1;
 
-	BUG_ON(!PageLocked(page));
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
 	bh = head;
 	nr = 0;
 	i = 0;
-- 
cgit v1.2.3


From 1e8b33328a5407b447ff80953655a47014a6dcb9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 29 Nov 2012 10:49:50 -0800
Subject: blockdev: remove bd_block_size_semaphore again

This reverts the block-device direct access code to the previous
unlocked code, now that fs/buffer.c no longer needs external locking.

With this, fs/block_dev.c is back to the original version, apart from a
whitespace cleanup that I didn't want to revert.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 105 +++------------------------------------------------------
 1 file changed, 4 insertions(+), 101 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a1e5e3b1eaf..47a949d8a07e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -116,8 +116,6 @@ EXPORT_SYMBOL(invalidate_bdev);
 
 int set_blocksize(struct block_device *bdev, int size)
 {
-	struct address_space *mapping;
-
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
@@ -126,19 +124,6 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
-	/* Prevent starting I/O or mapping the device */
-	percpu_down_write(&bdev->bd_block_size_semaphore);
-
-	/* Check that the block device is not memory mapped */
-	mapping = bdev->bd_inode->i_mapping;
-	mutex_lock(&mapping->i_mmap_mutex);
-	if (mapping_mapped(mapping)) {
-		mutex_unlock(&mapping->i_mmap_mutex);
-		percpu_up_write(&bdev->bd_block_size_semaphore);
-		return -EBUSY;
-	}
-	mutex_unlock(&mapping->i_mmap_mutex);
-
 	/* Don't change the size if it is same as current */
 	if (bdev->bd_block_size != size) {
 		sync_blockdev(bdev);
@@ -146,9 +131,6 @@ int set_blocksize(struct block_device *bdev, int size)
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
-
-	percpu_up_write(&bdev->bd_block_size_semaphore);
-
 	return 0;
 }
 
@@ -459,12 +441,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-
-	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
-		kmem_cache_free(bdev_cachep, ei);
-		return NULL;
-	}
-
 	return &ei->vfs_inode;
 }
 
@@ -473,8 +449,6 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
-
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -1593,22 +1567,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
-ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkdev_aio_read);
-
 /*
  * Write data to the block device.  Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -1620,16 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	blk_start_plug(&plug);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1638,62 +1592,11 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
 	blk_finish_plug(&plug);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
 
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	int ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_mmap(file, vma);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
-				  struct pipe_inode_info *pipe, size_t len,
-				  unsigned int flags)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_read(file, ppos, pipe, len, flags);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
-				   struct file *file, loff_t *ppos, size_t len,
-				   unsigned int flags)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_write(pipe, file, ppos, len, flags);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-
 /*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
@@ -1724,16 +1627,16 @@ const struct file_operations def_blk_fops = {
 	.llseek		= block_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-  	.aio_read	= blkdev_aio_read,
+	.aio_read	= generic_file_aio_read,
 	.aio_write	= blkdev_aio_write,
-	.mmap		= blkdev_mmap,
+	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.splice_read	= blkdev_splice_read,
-	.splice_write	= blkdev_splice_write,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
-- 
cgit v1.2.3


From ab73857e354ab9e317613cba7db714e2c12c6547 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 29 Nov 2012 12:27:00 -0800
Subject: direct-io: don't read inode->i_blkbits multiple times

Since directio can work on a raw block device, and the block size of the
device can change under it, we need to do the same thing that
fs/buffer.c now does: read the block size a single time, using
ACCESS_ONCE().

Reading it multiple times can get different results, which will then
confuse the code because it actually encodes the i_blksize in
relationship to the underlying logical blocksize.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/direct-io.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index f86c720dba0e..cf5b44b10c67 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	int create;
+	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 		fs_count = fs_endblk - fs_startblk + 1;
 
 		map_bh->b_state = 0;
-		map_bh->b_size = fs_count << dio->inode->i_blkbits;
+		map_bh->b_size = fs_count << i_blkbits;
 
 		/*
 		 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
@@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int seg;
 	size_t size;
 	unsigned long addr;
-	unsigned blkbits = inode->i_blkbits;
+	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
+	unsigned blkbits = i_blkbits;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
@@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->inode = inode;
 	dio->rw = rw;
 	sdio.blkbits = blkbits;
-	sdio.blkfactor = inode->i_blkbits - blkbits;
+	sdio.blkfactor = i_blkbits - blkbits;
 	sdio.block_in_file = offset >> blkbits;
 
 	sdio.get_block = get_block;
-- 
cgit v1.2.3


From bbec0270bdd887f96377065ee38b8848b5afa395 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 29 Nov 2012 12:31:52 -0800
Subject: blkdev_max_block: make private to fs/buffer.c

We really don't want to look at the block size for the raw block device
accesses in fs/block-dev.c, because it may be changing from under us.
So get rid of the max_block logic entirely, since the caller should
already have done it anyway.

That leaves the only user of this function in fs/buffer.c, so move the
whole function there and make it static.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 55 +------------------------------------------------------
 fs/buffer.c    | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47a949d8a07e..a1e09b4fe1ba 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 	spin_unlock(&dst->wb.list_lock);
 }
 
-sector_t blkdev_max_block(struct block_device *bdev)
-{
-	sector_t retval = ~((sector_t)0);
-	loff_t sz = i_size_read(bdev->bd_inode);
-
-	if (sz) {
-		unsigned int size = block_size(bdev);
-		unsigned int sizebits = blksize_bits(size);
-		retval = (sz >> sizebits);
-	}
-	return retval;
-}
-
 /* Kill _all_ buffers and pagecache , dirty or not.. */
 void kill_bdev(struct block_device *bdev)
 {
@@ -163,52 +150,12 @@ static int
 blkdev_get_block(struct inode *inode, sector_t iblock,
 		struct buffer_head *bh, int create)
 {
-	if (iblock >= blkdev_max_block(I_BDEV(inode))) {
-		if (create)
-			return -EIO;
-
-		/*
-		 * for reads, we're just trying to fill a partial page.
-		 * return a hole, they will have to call get_block again
-		 * before they can fill it, and they will get -EIO at that
-		 * time
-		 */
-		return 0;
-	}
 	bh->b_bdev = I_BDEV(inode);
 	bh->b_blocknr = iblock;
 	set_buffer_mapped(bh);
 	return 0;
 }
 
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh, int create)
-{
-	sector_t end_block = blkdev_max_block(I_BDEV(inode));
-	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
-	if ((iblock + max_blocks) > end_block) {
-		max_blocks = end_block - iblock;
-		if ((long)max_blocks <= 0) {
-			if (create)
-				return -EIO;	/* write fully beyond EOF */
-			/*
-			 * It is a read which is fully beyond EOF.  We return
-			 * a !buffer_mapped buffer
-			 */
-			max_blocks = 0;
-		}
-	}
-
-	bh->b_bdev = I_BDEV(inode);
-	bh->b_blocknr = iblock;
-	bh->b_size = max_blocks << inode->i_blkbits;
-	if (max_blocks)
-		set_buffer_mapped(bh);
-	return 0;
-}
-
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
@@ -217,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = file->f_mapping->host;
 
 	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
-				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+				    nr_segs, blkdev_get_block, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
diff --git a/fs/buffer.c b/fs/buffer.c
index 28a74ff5324b..3586fb05c8ce 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -911,6 +911,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 	attach_page_buffers(page, head);
 }
 
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+	sector_t retval = ~((sector_t)0);
+	loff_t sz = i_size_read(bdev->bd_inode);
+
+	if (sz) {
+		unsigned int sizebits = blksize_bits(size);
+		retval = (sz >> sizebits);
+	}
+	return retval;
+}
+
 /*
  * Initialise the state of a blockdev page's buffers.
  */ 
@@ -921,7 +933,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 	struct buffer_head *head = page_buffers(page);
 	struct buffer_head *bh = head;
 	int uptodate = PageUptodate(page);
-	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode));
+	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 
 	do {
 		if (!buffer_mapped(bh)) {
-- 
cgit v1.2.3


From 696199f8ccf7fc6d17ef89c296ad3b6c78c52d9c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 Nov 2012 22:00:51 -0500
Subject: don't do blind d_drop() in nfs_prime_dcache()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/dir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ce8cb926526b..99489cfca24d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 			nfs_refresh_inode(dentry->d_inode, entry->fattr);
 			goto out;
 		} else {
-			d_drop(dentry);
+			if (d_invalidate(dentry) != 0)
+				goto out;
 			dput(dentry);
 		}
 	}
-- 
cgit v1.2.3


From c44600c9d1de64314c2bd58103f15acb53e10073 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 Nov 2012 22:04:36 -0500
Subject: nfs_lookup_revalidate(): fix a leak

We are leaking fattr and fhandle if we decide that dentry is not to
be invalidated, after all (e.g. happens to be a mountpoint).  Just
free both before that...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 99489cfca24d..b9e66b7e0c14 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1101,6 +1101,8 @@ out_set_verifier:
 out_zap_parent:
 	nfs_zap_caches(dir);
  out_bad:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
 	nfs_mark_for_revalidate(dir);
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Purge readdir caches. */
@@ -1113,8 +1115,6 @@ out_zap_parent:
 		shrink_dcache_parent(dentry);
 	}
 	d_drop(dentry);
-	nfs_free_fattr(fattr);
-	nfs_free_fhandle(fhandle);
 	dput(parent);
 	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
 			__func__, dentry->d_parent->d_name.name,
-- 
cgit v1.2.3


From 0903a0c8491c1e987dfc6eb294199a36760398bc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 Nov 2012 22:11:06 -0500
Subject: cifs: get rid of blind d_drop() in readdir

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cifs/readdir.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f9b5d3d6cf33..1c576e871366 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -86,14 +86,17 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 
 	dentry = d_lookup(parent, name);
 	if (dentry) {
+		int err;
 		inode = dentry->d_inode;
 		/* update inode in place if i_ino didn't change */
 		if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
 			cifs_fattr_to_inode(inode, fattr);
 			return dentry;
 		}
-		d_drop(dentry);
+		err = d_invalidate(dentry);
 		dput(dentry);
+		if (err)
+			return NULL;
 	}
 
 	dentry = d_alloc(parent, name);
-- 
cgit v1.2.3


From 21d8a15ac333b05f1fecdf9fdc30996be2e11d60 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 Nov 2012 22:17:21 -0500
Subject: lookup_one_len: don't accept . and ..

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 937f9d50c84b..5f4cdf3ad913 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	if (!len)
 		return ERR_PTR(-EACCES);
 
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
 	while (len--) {
 		c = *(const unsigned char *)name++;
 		if (c == '/' || c == '\0')
-- 
cgit v1.2.3


From a77cfcb429ed98845a4e4df72473b8f37acd890b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 Nov 2012 22:57:33 -0500
Subject: fix off-by-one in argument passed by iterate_fd() to callbacks

Noticed by Pavel Roskin; the thing in his patch I disagree with
was compensating for that shite in callbacks instead of fixing
it once in the iterator itself.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 7cb71b992603..eff23162485f 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -994,16 +994,18 @@ int iterate_fd(struct files_struct *files, unsigned n,
 		const void *p)
 {
 	struct fdtable *fdt;
-	struct file *file;
 	int res = 0;
 	if (!files)
 		return 0;
 	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	while (!res && n < fdt->max_fds) {
-		file = rcu_dereference_check_fdtable(files, fdt->fd[n++]);
-		if (file)
-			res = f(p, file, n);
+	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
+		struct file *file;
+		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
+		if (!file)
+			continue;
+		res = f(p, file, n);
+		if (res)
+			break;
 	}
 	spin_unlock(&files->file_lock);
 	return res;
-- 
cgit v1.2.3


From 57302e0ddf8a210a66fd8a1a2fa50844863b5ded Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 4 Dec 2012 08:25:11 -0800
Subject: vfs: avoid "attempt to access beyond end of device" warnings

The block device access simplification that avoided accessing the (racy)
block size information (commit bbec0270bdd8: "blkdev_max_block: make
private to fs/buffer.c") no longer checks the maximum block size in the
block mapping path.

That was _almost_ as simple as just removing the code entirely, because
the readers and writers all check the size of the device anyway, so
under normal circumstances it "just worked".

However, the block size may be such that the end of the device may
straddle one single buffer_head.  At which point we may still want to
access the end of the device, but the buffer we use to access it
partially extends past the end.

The 'bd_set_size()' function intentionally sets the block size to avoid
this, but mounting the device - or setting the block size by hand to
some other value - can modify that block size.

So instead, teach 'submit_bh()' about the special case of the buffer
head straddling the end of the device, and turning such an access into a
smaller IO access, avoiding the problem.

This, btw, also means that unlike before, we can now access the whole
device regardless of device block size setting.  So now, even if the
device size is only 512-byte aligned, we can read and write even the
last sector even when having a much bigger block size for accessing the
rest of the device.

So with this, we could now get rid of the 'bd_set_size()' block size
code entirely - resulting in faster IO for the common case - but that
would be a separate patch.

Reported-and-tested-by: Romain Francoise <romain@orebokech.com>
Reporeted-and-tested-by: Meelis Roos <mroos@linux.ee>
Reported-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 3586fb05c8ce..c4e11390a44c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2893,6 +2893,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 	bio_put(bio);
 }
 
+/*
+ * This allows us to do IO even on the odd last sectors
+ * of a device, even if the bh block size is some multiple
+ * of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device,
+ * and clear the end of the buffer head manually.
+ *
+ * Truly out-of-range accesses will turn into actual IO
+ * errors, this only handles the "we need to be able to
+ * do IO at the final sector" case.
+ */
+static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+{
+	sector_t maxsector;
+	unsigned bytes;
+
+	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+	if (!maxsector)
+		return;
+
+	/*
+	 * If the *whole* IO is past the end of the device,
+	 * let it through, and the IO layer will turn it into
+	 * an EIO.
+	 */
+	if (unlikely(bio->bi_sector >= maxsector))
+		return;
+
+	maxsector -= bio->bi_sector;
+	bytes = bio->bi_size;
+	if (likely((bytes >> 9) <= maxsector))
+		return;
+
+	/* Uhhuh. We've got a bh that straddles the device size! */
+	bytes = maxsector << 9;
+
+	/* Truncate the bio.. */
+	bio->bi_size = bytes;
+	bio->bi_io_vec[0].bv_len = bytes;
+
+	/* ..and clear the end of the buffer for reads */
+	if (rw & READ) {
+		void *kaddr = kmap_atomic(bh->b_page);
+		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
+		kunmap_atomic(kaddr);
+	}
+}
+
 int submit_bh(int rw, struct buffer_head * bh)
 {
 	struct bio *bio;
@@ -2929,6 +2978,9 @@ int submit_bh(int rw, struct buffer_head * bh)
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
 
+	/* Take care of bh's that straddle the end of the device */
+	guard_bh_eod(rw, bio, bh);
+
 	bio_get(bio);
 	submit_bio(rw, bio);
 
-- 
cgit v1.2.3


From 27d7c2a006a81c04fab00b8cd81b99af3b32738d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 5 Dec 2012 20:01:24 +0300
Subject: vfs: clear to the end of the buffer on partial buffer reads

READ is zero so the "rw & READ" test is always false.  The intended test
was "((rw & RW_MASK) == READ)".

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index c4e11390a44c..ec0aca8ba6bf 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2935,7 +2935,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
 	bio->bi_io_vec[0].bv_len = bytes;
 
 	/* ..and clear the end of the buffer for reads */
-	if (rw & READ) {
+	if ((rw & RW_MASK) == READ) {
 		void *kaddr = kmap_atomic(bh->b_page);
 		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
 		kunmap_atomic(kaddr);
-- 
cgit v1.2.3