From 386bc35a2d548c28a5083b2e162a20251b37cab5 Mon Sep 17 00:00:00 2001 From: Anna Leuschner Date: Mon, 22 Oct 2012 21:53:36 +0200 Subject: vfs: fix: don't increase bio_slab_max if krealloc() fails Without the patch, bio_slab_max, representing bio_slabs capacity, is increased before krealloc() of bio_slabs. If krealloc() fails, bio_slab_max is too high. Fix that by only updating bio_slab_max if krealloc() is successful. Signed-off-by: Anna Leuschner Signed-off-by: Jens Axboe --- fs/bio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 9298c65ad9c7..b96fc6ce4855 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -75,6 +75,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) unsigned int sz = sizeof(struct bio) + extra_size; struct kmem_cache *slab = NULL; struct bio_slab *bslab, *new_bio_slabs; + unsigned int new_bio_slab_max; unsigned int i, entry = -1; mutex_lock(&bio_slab_lock); @@ -97,12 +98,13 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) goto out_unlock; if (bio_slab_nr == bio_slab_max && entry == -1) { - bio_slab_max <<= 1; + new_bio_slab_max = bio_slab_max << 1; new_bio_slabs = krealloc(bio_slabs, - bio_slab_max * sizeof(struct bio_slab), + new_bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); if (!new_bio_slabs) goto out_unlock; + bio_slab_max = new_bio_slab_max; bio_slabs = new_bio_slabs; } if (entry == -1) -- cgit v1.2.3 From 57911b8ba814fae01306376a0d02bc7cdc88dc94 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 19 Oct 2012 09:22:03 +0200 Subject: Btrfs: don't put removals from push_node_left into tree mod log twice Independant of the check (push_items < src_items) tree_mod_log_eb_copy did log the removal of the old data entries from the source buffer. Therefore, we must not call tree_mod_log_eb_move if the check evaluates to true, as that would log the removal twice, finally resulting in (rewinded) buffers with wrong values for header_nritems. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b33436211000..44a7e25353a6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1225,6 +1225,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, free_extent_buffer(eb); __tree_mod_log_rewind(eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root)); return eb_rewin; } @@ -1280,6 +1282,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) else WARN_ON(btrfs_header_level(eb) != 0); extent_buffer_get(eb); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); return eb; } @@ -2970,8 +2973,10 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { - tree_mod_log_eb_move(root->fs_info, src, 0, push_items, - src_nritems - push_items); + /* + * don't call tree_mod_log_eb_move here, key removal was already + * fully logged by tree_mod_log_eb_copy above. + */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), (src_nritems - push_items) * -- cgit v1.2.3 From ba1bfbd592c1adddd5d0005f587a6e308e25c949 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 22 Oct 2012 20:02:56 +0200 Subject: Btrfs: fix a tree mod logging issue for root replacement operations Avoid the implicit free by tree_mod_log_set_root_pointer, which is wrong in two places. Where needed, we call tree_mod_log_free_eb explicitly now. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 44a7e25353a6..e6b75ccd2850 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -647,8 +647,6 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; - __tree_mod_log_free_eb(fs_info, old_root); - ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -926,12 +924,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - /* - * don't log freeing in case we're freeing the root node, this - * is done by tree_mod_log_set_root_pointer later - */ - if (buf != root->node && btrfs_header_level(buf) != 0) - tree_mod_log_free_eb(root->fs_info, buf); + tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1728,6 +1721,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } + tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); -- cgit v1.2.3 From 834328a8493079d15f30866ace42489463f52571 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 23 Oct 2012 11:27:33 +0200 Subject: Btrfs: tree mod log's old roots could still be part of the tree Tree mod log treated old root buffers as always empty buffers when starting the rewind operations. However, the old root may still be part of the current tree at a lower level, with still some valid entries. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e6b75ccd2850..d9308c38a8f2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1239,6 +1239,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; + u32 blocksize; eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); @@ -1254,12 +1255,28 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } tm = tree_mod_log_search(root->fs_info, logical, time_seq); - if (old_root) + if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + blocksize = btrfs_level_size(root, old_root->level); + eb = read_tree_block(root, logical, blocksize, 0); + if (!eb) { + pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", + logical); + WARN_ON(1); + } else { + eb = btrfs_clone_extent_buffer(eb); + } + } else if (old_root) { + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); eb = alloc_dummy_extent_buffer(logical, root->nodesize); - else + } else { eb = btrfs_clone_extent_buffer(root->node); - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + } + if (!eb) return NULL; btrfs_tree_read_lock(eb); -- cgit v1.2.3 From 5b6602e762cae17c8891d19698afea451e9c1d95 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 23 Oct 2012 11:28:27 +0200 Subject: Btrfs: determine level of old roots In btrfs_find_all_roots' termination condition, we compare the level of the old buffer we got from btrfs_search_old_slot to the level of the current root node. We'd better compare it to the level of the rewinded root node. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 4 +--- fs/btrfs/ctree.c | 17 +++++++++++++++++ fs/btrfs/ctree.h | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f3187938e081..65608fbf2232 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -283,9 +283,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - rcu_read_lock(); - root_level = btrfs_header_level(root->node); - rcu_read_unlock(); + root_level = btrfs_old_root_level(root, time_seq); if (root_level + 1 == level) goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d9308c38a8f2..ef68c33eefd9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1297,6 +1297,23 @@ get_old_root(struct btrfs_root *root, u64 time_seq) return eb; } +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + + tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { + level = tm->old_root.level; + } else { + rcu_read_lock(); + level = btrfs_header_level(root->node); + rcu_read_unlock(); + } + + return level; +} + static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1630be831210..34c5a442dd33 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3120,6 +3120,7 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic_inc_return(&fs_info->tree_mod_seq); } +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, -- cgit v1.2.3 From d638108484d186bf97a838d037f165d9b404e6ee Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 23 Oct 2012 14:21:05 +0200 Subject: Btrfs: fix extent buffer reference for tree mod log roots In get_old_root we grab a lock on the extent buffer before we obtain a reference on that buffer. That order is changed now. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ef68c33eefd9..f6739903d072 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1279,6 +1279,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; + extent_buffer_get(eb); btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); @@ -1291,7 +1292,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) __tree_mod_log_rewind(eb, time_seq, tm); else WARN_ON(btrfs_header_level(eb) != 0); - extent_buffer_get(eb); WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); return eb; -- cgit v1.2.3 From 01763a2e37425ae3f37a3dc051e0703fdade5956 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 23 Oct 2012 15:02:12 +0200 Subject: Btrfs: comment for loop in tree_mod_log_insert_move Emphasis the way tree_mod_log_insert_move avoids adding MOD_LOG_KEY_REMOVE_WHILE_MOVING operations, depending on the direction of the move operation. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f6739903d072..eba44b076829 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -596,6 +596,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, eb)) return 0; + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. + */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, MOD_LOG_KEY_REMOVE_WHILE_MOVING); -- cgit v1.2.3 From a4ee8d978e47e79d536226dccb48991f70091168 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Oct 2012 13:51:58 -0400 Subject: LOCKD: fix races in nsm_client_get Commit e9406db20fecbfcab646bad157b4cfdc7cadddfb (lockd: per-net NSM client creation and destruction helpers introduced) contains a nasty race on initialisation of the per-net NSM client because it doesn't check whether or not the client is set after grabbing the nsm_create_mutex. Reported-by: Nix Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/lockd/mon.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e4fb3ba5a58a..fe695603e395 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -85,29 +85,38 @@ static struct rpc_clnt *nsm_create(struct net *net) return rpc_create(&args); } +static struct rpc_clnt *nsm_client_set(struct lockd_net *ln, + struct rpc_clnt *clnt) +{ + spin_lock(&ln->nsm_clnt_lock); + if (ln->nsm_users == 0) { + if (clnt == NULL) + goto out; + ln->nsm_clnt = clnt; + } + clnt = ln->nsm_clnt; + ln->nsm_users++; +out: + spin_unlock(&ln->nsm_clnt_lock); + return clnt; +} + static struct rpc_clnt *nsm_client_get(struct net *net) { - static DEFINE_MUTEX(nsm_create_mutex); - struct rpc_clnt *clnt; + struct rpc_clnt *clnt, *new; struct lockd_net *ln = net_generic(net, lockd_net_id); - spin_lock(&ln->nsm_clnt_lock); - if (ln->nsm_users) { - ln->nsm_users++; - clnt = ln->nsm_clnt; - spin_unlock(&ln->nsm_clnt_lock); + clnt = nsm_client_set(ln, NULL); + if (clnt != NULL) goto out; - } - spin_unlock(&ln->nsm_clnt_lock); - mutex_lock(&nsm_create_mutex); - clnt = nsm_create(net); - if (!IS_ERR(clnt)) { - ln->nsm_clnt = clnt; - smp_wmb(); - ln->nsm_users = 1; - } - mutex_unlock(&nsm_create_mutex); + clnt = new = nsm_create(net); + if (IS_ERR(clnt)) + goto out; + + clnt = nsm_client_set(ln, new); + if (clnt != new) + rpc_shutdown_client(new); out: return clnt; } -- cgit v1.2.3 From e498daa81295d02f7359af313c2b7f87e1062207 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Oct 2012 08:53:35 -0400 Subject: LOCKD: Clear ln->nsm_clnt only when ln->nsm_users is zero The current code is clearing it in all cases _except_ when zero. Reported-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/lockd/mon.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index fe695603e395..3d7e09bcc0e9 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -124,18 +124,16 @@ out: static void nsm_client_put(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); - struct rpc_clnt *clnt = ln->nsm_clnt; - int shutdown = 0; + struct rpc_clnt *clnt = NULL; spin_lock(&ln->nsm_clnt_lock); - if (ln->nsm_users) { - if (--ln->nsm_users) - ln->nsm_clnt = NULL; - shutdown = !ln->nsm_users; + ln->nsm_users--; + if (ln->nsm_users == 0) { + clnt = ln->nsm_clnt; + ln->nsm_clnt = NULL; } spin_unlock(&ln->nsm_clnt_lock); - - if (shutdown) + if (clnt != NULL) rpc_shutdown_client(clnt); } -- cgit v1.2.3 From 66081a72517a131430dcf986775f3268aafcb546 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sat, 29 Sep 2012 22:23:19 +0200 Subject: sysfs: sysfs_pathname/sysfs_add_one: Use strlcat() instead of strcat() The warning check for duplicate sysfs entries can cause a buffer overflow when printing the warning, as strcat() doesn't check buffer sizes. Use strlcat() instead. Since strlcat() doesn't return a pointer to the passed buffer, unlike strcat(), I had to convert the nested concatenation in sysfs_add_one() to an admittedly more obscure comma operator construct, to avoid emitting code for the concatenation if CONFIG_BUG is disabled. Signed-off-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 6b0bb00d4d2b..2fbdff6be25c 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -485,20 +485,18 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) /** * sysfs_pathname - return full path to sysfs dirent * @sd: sysfs_dirent whose path we want - * @path: caller allocated buffer + * @path: caller allocated buffer of size PATH_MAX * * Gives the name "/" to the sysfs_root entry; any path returned * is relative to wherever sysfs is mounted. - * - * XXX: does no error checking on @path size */ static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) { if (sd->s_parent) { sysfs_pathname(sd->s_parent, path); - strcat(path, "/"); + strlcat(path, "/", PATH_MAX); } - strcat(path, sd->s_name); + strlcat(path, sd->s_name, PATH_MAX); return path; } @@ -531,9 +529,11 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) char *path = kzalloc(PATH_MAX, GFP_KERNEL); WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n", - (path == NULL) ? sd->s_name : - strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"), - sd->s_name)); + (path == NULL) ? sd->s_name + : (sysfs_pathname(acxt->parent_sd, path), + strlcat(path, "/", PATH_MAX), + strlcat(path, sd->s_name, PATH_MAX), + path)); kfree(path); } -- cgit v1.2.3 From 661bec6ba884b86517ef5ea529aabb281a7198d9 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Wed, 10 Oct 2012 08:50:47 -0600 Subject: Fix a sign bug causing invalid memory access in the ino_paths ioctl. To see the problem, create many hardlinks to the same file (120 should do it), then look up paths by inode with: ls -i btrfs inspect inode-resolve -v $ino /mnt/btrfs I noticed the memory layout of the fspath->val data had some irregularities (some unnecessary gaps that stop appearing about halfway), so I'm not sure there aren't any bugs left in it. --- fs/btrfs/backref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f3187938e081..2bcbea3f6308 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1186,7 +1186,7 @@ static char *ref_to_path(struct btrfs_root *fs_root, int slot; u64 next_inum; int ret; - s64 bytes_left = size - 1; + s64 bytes_left = ((s64)size) - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; int leave_spinning = path->leave_spinning; -- cgit v1.2.3 From 84167d190569eedcdb24bf2499bdda437e442962 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 11 Oct 2012 07:25:16 -0600 Subject: Btrfs: Fix wrong error handling code gcc says "warning: comparison of unsigned expression >= 0 is always true" because i is an unsigned long. And gcc is right this time. Signed-off-by: Stefan Behrens --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 64dc93f64bc0..a32ebfeb91cf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4104,8 +4104,8 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) return eb; err: - for (i--; i >= 0; i--) - __free_page(eb->pages[i]); + for (; i > 0; i--) + __free_page(eb->pages[i - 1]); __free_extent_buffer(eb); return NULL; } -- cgit v1.2.3 From 96b5bd777118bb673b458b41bbefc7f0f31d65c9 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 15 Oct 2012 08:30:45 +0000 Subject: Btrfs: extended inode refs support for send mechanism This adds support for the new extended inode refs to btrfs send. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 22 +++++----- fs/btrfs/backref.h | 4 ++ fs/btrfs/send.c | 126 ++++++++++++++++++++++++++++++++++------------------- 3 files changed, 94 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2bcbea3f6308..b8b69266393a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1177,11 +1177,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, return ret; } -static char *ref_to_path(struct btrfs_root *fs_root, - struct btrfs_path *path, - u32 name_len, unsigned long name_off, - struct extent_buffer *eb_in, u64 parent, - char *dest, u32 size) +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) { int slot; u64 next_inum; @@ -1266,10 +1265,10 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct extent_buffer *eb_in, u64 parent, char *dest, u32 size) { - return ref_to_path(fs_root, path, - btrfs_inode_ref_name_len(eb_in, iref), - (unsigned long)(iref + 1), - eb_in, parent, dest, size); + return btrfs_ref_to_path(fs_root, path, + btrfs_inode_ref_name_len(eb_in, iref), + (unsigned long)(iref + 1), + eb_in, parent, dest, size); } /* @@ -1715,9 +1714,8 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, ipath->fspath->bytes_left - s_ptr : 0; fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; - fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, - name_off, eb, inum, fspath_min, - bytes_left); + fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, + name_off, eb, inum, fspath_min, bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e75533043a5f..d61feca79455 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -62,6 +62,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_inode_ref *iref, struct extent_buffer *eb, u64 parent, char *dest, u32 size); +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c7beb543a4a8..4f20c49edd58 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -745,31 +745,36 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, void *ctx); /* - * Helper function to iterate the entries in ONE btrfs_inode_ref. + * Helper function to iterate the entries in ONE btrfs_inode_ref or + * btrfs_inode_extref. * The iterate callback may return a non zero value to stop iteration. This can * be a negative value for error codes or 1 to simply stop it. * - * path must point to the INODE_REF when called. + * path must point to the INODE_REF or INODE_EXTREF when called. */ static int iterate_inode_ref(struct send_ctx *sctx, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, int resolve, iterate_inode_ref_t iterate, void *ctx) { - struct extent_buffer *eb; + struct extent_buffer *eb = path->nodes[0]; struct btrfs_item *item; struct btrfs_inode_ref *iref; + struct btrfs_inode_extref *extref; struct btrfs_path *tmp_path; struct fs_path *p; - u32 cur; - u32 len; + u32 cur = 0; u32 total; - int slot; + int slot = path->slots[0]; u32 name_len; char *start; int ret = 0; - int num; + int num = 0; int index; + u64 dir; + unsigned long name_off; + unsigned long elem_size; + unsigned long ptr; p = fs_path_alloc_reversed(sctx); if (!p) @@ -781,24 +786,40 @@ static int iterate_inode_ref(struct send_ctx *sctx, return -ENOMEM; } - eb = path->nodes[0]; - slot = path->slots[0]; - item = btrfs_item_nr(eb, slot); - iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - cur = 0; - len = 0; - total = btrfs_item_size(eb, item); - num = 0; + if (found_key->type == BTRFS_INODE_REF_KEY) { + ptr = (unsigned long)btrfs_item_ptr(eb, slot, + struct btrfs_inode_ref); + item = btrfs_item_nr(eb, slot); + total = btrfs_item_size(eb, item); + elem_size = sizeof(*iref); + } else { + ptr = btrfs_item_ptr_offset(eb, slot); + total = btrfs_item_size_nr(eb, slot); + elem_size = sizeof(*extref); + } + while (cur < total) { fs_path_reset(p); - name_len = btrfs_inode_ref_name_len(eb, iref); - index = btrfs_inode_ref_index(eb, iref); + if (found_key->type == BTRFS_INODE_REF_KEY) { + iref = (struct btrfs_inode_ref *)(ptr + cur); + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + index = btrfs_inode_ref_index(eb, iref); + dir = found_key->offset; + } else { + extref = (struct btrfs_inode_extref *)(ptr + cur); + name_len = btrfs_inode_extref_name_len(eb, extref); + name_off = (unsigned long)&extref->name; + index = btrfs_inode_extref_index(eb, extref); + dir = btrfs_inode_extref_parent(eb, extref); + } + if (resolve) { - start = btrfs_iref_to_path(root, tmp_path, iref, eb, - found_key->offset, p->buf, - p->buf_len); + start = btrfs_ref_to_path(root, tmp_path, name_len, + name_off, eb, dir, + p->buf, p->buf_len); if (IS_ERR(start)) { ret = PTR_ERR(start); goto out; @@ -809,9 +830,10 @@ static int iterate_inode_ref(struct send_ctx *sctx, p->buf_len + p->buf - start); if (ret < 0) goto out; - start = btrfs_iref_to_path(root, tmp_path, iref, - eb, found_key->offset, p->buf, - p->buf_len); + start = btrfs_ref_to_path(root, tmp_path, + name_len, name_off, + eb, dir, + p->buf, p->buf_len); if (IS_ERR(start)) { ret = PTR_ERR(start); goto out; @@ -820,21 +842,16 @@ static int iterate_inode_ref(struct send_ctx *sctx, } p->start = start; } else { - ret = fs_path_add_from_extent_buffer(p, eb, - (unsigned long)(iref + 1), name_len); + ret = fs_path_add_from_extent_buffer(p, eb, name_off, + name_len); if (ret < 0) goto out; } - - len = sizeof(*iref) + name_len; - iref = (struct btrfs_inode_ref *)((char *)iref + len); - cur += len; - - ret = iterate(num, found_key->offset, index, p, ctx); + cur += elem_size + name_len; + ret = iterate(num, dir, index, p, ctx); if (ret) goto out; - num++; } @@ -998,7 +1015,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, } btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); if (found_key.objectid != ino || - found_key.type != BTRFS_INODE_REF_KEY) { + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { ret = -ENOENT; goto out; } @@ -1551,8 +1569,8 @@ static int get_first_ref(struct send_ctx *sctx, struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; - struct btrfs_inode_ref *iref; int len; + u64 parent_dir; path = alloc_path_for_send(); if (!path) @@ -1568,27 +1586,41 @@ static int get_first_ref(struct send_ctx *sctx, if (!ret) btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - if (ret || found_key.objectid != key.objectid || - found_key.type != key.type) { + if (ret || found_key.objectid != ino || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { ret = -ENOENT; goto out; } - iref = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_ref); - len = btrfs_inode_ref_name_len(path->nodes[0], iref); - ret = fs_path_add_from_extent_buffer(name, path->nodes[0], - (unsigned long)(iref + 1), len); + if (key.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + iref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], iref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)(iref + 1), + len); + parent_dir = found_key.offset; + } else { + struct btrfs_inode_extref *extref; + extref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_extref); + len = btrfs_inode_extref_name_len(path->nodes[0], extref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)&extref->name, len); + parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); + } if (ret < 0) goto out; btrfs_release_path(path); - ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; - *dir = found_key.offset; + *dir = parent_dir; out: btrfs_free_path(path); @@ -3226,7 +3258,8 @@ static int process_all_refs(struct send_ctx *sctx, btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || - found_key.type != key.type) + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) break; ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, @@ -3987,7 +4020,7 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) if (sctx->cur_ino == 0) goto out; if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && - sctx->cmp_key->type <= BTRFS_INODE_REF_KEY) + sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) goto out; if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) goto out; @@ -4335,7 +4368,8 @@ static int changed_cb(struct btrfs_root *left_root, if (key->type == BTRFS_INODE_ITEM_KEY) ret = changed_inode(sctx, result); - else if (key->type == BTRFS_INODE_REF_KEY) + else if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) ret = changed_ref(sctx, result); else if (key->type == BTRFS_XATTR_ITEM_KEY) ret = changed_xattr(sctx, result); -- cgit v1.2.3 From d79e50433b2bea09eb680ed5fae15e8a12356353 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 15 Oct 2012 18:28:46 +0000 Subject: Btrfs: send correct rdev and mode in btrfs-send When sending a device file, the stream was missing the mode. Also the rdev was encoded wrongly. Signed-off-by: Arne Jansen --- fs/btrfs/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4f20c49edd58..97cc5399306b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2462,7 +2462,8 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { - TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev); + TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); } ret = send_cmd(sctx); -- cgit v1.2.3 From 5b7ff5b3c4468780b15c6b20cd0567cd9f2aa626 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 16 Oct 2012 05:44:21 +0000 Subject: Btrfs: fix memory leak in btrfs_quota_enable() We should free quota_root before returning from the error handling code. Signed-off-by: Tsutomu Itoh --- fs/btrfs/qgroup.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5039686df6ae..fe9d02c45f8e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -790,8 +790,10 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out_free_root; + } key.objectid = 0; key.type = BTRFS_QGROUP_STATUS_KEY; @@ -800,7 +802,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); if (ret) - goto out; + goto out_free_path; leaf = path->nodes[0]; ptr = btrfs_item_ptr(leaf, path->slots[0], @@ -818,8 +820,15 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, fs_info->quota_root = quota_root; fs_info->pending_quota_state = 1; spin_unlock(&fs_info->qgroup_lock); -out: +out_free_path: btrfs_free_path(path); +out_free_root: + if (ret) { + free_extent_buffer(quota_root->node); + free_extent_buffer(quota_root->commit_root); + kfree(quota_root); + } +out: return ret; } -- cgit v1.2.3 From e515c18bfef718a7900924d50198d968565dd60e Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Oct 2012 09:34:36 +0000 Subject: btrfs: Return EINVAL when length to trim is less than FSB Currently if len argument in btrfs_ioctl_fitrim() is smaller than one FSB we will continue and finally return 0 bytes discarded. However if the length to discard is smaller then file system block we should really return EINVAL. Signed-off-by: Lukas Czerner --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f5a2e6c4320a..da518ded34bd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -343,7 +343,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return -EOPNOTSUPP; if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; - if (range.start > total_bytes) + if (range.start > total_bytes || + range.len < fs_info->sb->s_blocksize) return -EINVAL; range.len = min(range.len, total_bytes - range.start); -- cgit v1.2.3 From 671415b7db49f62896f0b6d50fc4f312a0512983 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Oct 2012 11:26:46 +0000 Subject: Btrfs: fix deadlock caused by the nested chunk allocation Steps to reproduce: # mkfs.btrfs -m raid1 # btrfstune -S 1 # mount # btrfs device add # mount -o remount,rw # dd if=/dev/zero of=/tmpfile bs=1M count=1 Deadlock happened. It is because of the nested chunk allocation. When we wrote the data into the filesystem, we would allocate the data chunk because there was no data chunk in the filesystem. At the end of the data chunk allocation, we should insert the metadata of the data chunk into the extent tree, but there was no raid1 chunk, so we tried to lock the chunk allocation mutex to allocate the new chunk, but we had held the mutex, the deadlock happened. By rights, we would allocate the raid1 chunk when we added the second device because the profile of the seed filesystem is raid1 and we had two devices. But we didn't do that in fact. It is because the last step of the first device insertion didn't commit the transaction. So when we added the second device, we didn't cow the tree, and just inserted the relative metadata into the leaves which were generated by the first device insertion, and its profile was dup. So, I fix this problem by commiting the transaction at the end of the first device insertion. Signed-off-by: Miao Xie --- fs/btrfs/volumes.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 029b903a4ae3..0f5ebb72a5ea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1819,6 +1819,13 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); } return ret; -- cgit v1.2.3 From e2d044fe77f8e845333bb1bd29587dc08a4346a0 Mon Sep 17 00:00:00 2001 From: Alex Lyakas Date: Wed, 17 Oct 2012 13:52:47 +0000 Subject: Btrfs: Send: preserve ownership (uid and gid) also for symlinks. This patch also requires a change in the user-space part of "receive". We need to use "lchown" instead of "chown". We will do this in the following patch. Signed-off-by: Alex Lyakas if (S_ISREG(sctx->cur_inode_mode)) { --- fs/btrfs/send.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 97cc5399306b..e78b297b0b00 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4067,22 +4067,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; - if (!S_ISLNK(sctx->cur_inode_mode)) { - if (!sctx->parent_root || sctx->cur_inode_new) { + if (!sctx->parent_root || sctx->cur_inode_new) { + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode)) need_chmod = 1; - need_chown = 1; - } else { - ret = get_inode_info(sctx->parent_root, sctx->cur_ino, - NULL, NULL, &right_mode, &right_uid, - &right_gid, NULL); - if (ret < 0) - goto out; + } else { + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, + NULL, NULL, &right_mode, &right_uid, + &right_gid, NULL); + if (ret < 0) + goto out; - if (left_uid != right_uid || left_gid != right_gid) - need_chown = 1; - if (left_mode != right_mode) - need_chmod = 1; - } + if (left_uid != right_uid || left_gid != right_gid) + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) + need_chmod = 1; } if (S_ISREG(sctx->cur_inode_mode)) { -- cgit v1.2.3 From be6aef604920406b348acf3be6e6e8db55696386 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Oct 2012 15:43:12 -0400 Subject: Btrfs: Use btrfs_update_inode_fallback when creating a snapshot On a really full file system I was getting ENOSPC back from btrfs_update_inode when trying to update the parent inode when creating a snapshot. Just use the fallback method so we can update the inode and not have to worry about having a delayed ref. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/inode.c | 7 +++---- fs/btrfs/transaction.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1630be831210..8a92ab1632a2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3338,6 +3338,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f92def2467a6..5fc09908f20f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -94,8 +94,6 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -2746,8 +2744,9 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, return btrfs_update_inode_item(trans, root, inode); } -static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) +noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) { int ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 77db875b5116..04bbfb1052eb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1200,7 +1200,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, parent_root, parent_inode); + ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) btrfs_abort_transaction(trans, root, ret); fail: -- cgit v1.2.3 From 7bfdcf7fbad56c0f1fc3e2d26431bed72bdcce2d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Oct 2012 07:30:19 -0600 Subject: Btrfs: fix memory leak when cloning root's node After cloning root's node, we forgot to dec the src's ref which can lead to a memory leak. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index eba44b076829..cdfb4c49a806 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1241,6 +1241,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; struct extent_buffer *eb; + struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; @@ -1264,13 +1265,14 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_tree_read_unlock(root->node); free_extent_buffer(root->node); blocksize = btrfs_level_size(root, old_root->level); - eb = read_tree_block(root, logical, blocksize, 0); - if (!eb) { + old = read_tree_block(root, logical, blocksize, 0); + if (!old) { pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", logical); WARN_ON(1); } else { - eb = btrfs_clone_extent_buffer(eb); + eb = btrfs_clone_extent_buffer(old); + free_extent_buffer(old); } } else if (old_root) { btrfs_tree_read_unlock(root->node); -- cgit v1.2.3 From c37b2b6269ee4637fb7cdb5da0d1e47215d57ce2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Oct 2012 15:51:44 -0400 Subject: Btrfs: do not bug when we fail to commit the transaction We BUG if we fail to commit the transaction when creating a snapshot, which is just obnoxious. Remove the BUG_ON(). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index da518ded34bd..84bb4de1bb80 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -571,7 +571,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); } - BUG_ON(ret); + if (ret) + goto fail; ret = pending_snapshot->error; if (ret) -- cgit v1.2.3 From b40a79591ca918e7b91b0d9b6abd5d00f2e88c19 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 25 Oct 2012 22:28:12 +0200 Subject: freezer: exec should clear PF_NOFREEZE along with PF_KTHREAD flush_old_exec() clears PF_KTHREAD but forgets about PF_NOFREEZE. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Cc: stable@vger.kernel.org Signed-off-by: Rafael J. Wysocki --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 8b9011b67041..0039055b1fc6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1083,7 +1083,8 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ set_fs(USER_DS); - current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD); + current->flags &= + ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE); flush_thread(); current->personality &= ~bprm->per_clear; -- cgit v1.2.3 From 12176503366885edd542389eed3aaf94be163fdb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 25 Oct 2012 13:38:16 -0700 Subject: fs/compat_ioctl.c: VIDEO_SET_SPU_PALETTE missing error check The compat ioctl for VIDEO_SET_SPU_PALETTE was missing an error check while converting ioctl arguments. This could lead to leaking kernel stack contents into userspace. Patch extracted from existing fix in grsecurity. Signed-off-by: Kees Cook Cc: David Miller Cc: Brad Spengler Cc: PaX Team Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f5054025f9da..4c6285fff598 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -210,6 +210,8 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, err = get_user(palp, &up->palette); err |= get_user(length, &up->length); + if (err) + return -EFAULT; up_native = compat_alloc_user_space(sizeof(struct video_spu_palette)); err = put_user(compat_ptr(palp), &up_native->palette); -- cgit v1.2.3 From 98a1eebda3cb2a84ecf1f219bb3a95769033d1bf Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 10 Oct 2012 10:55:28 +0300 Subject: UBIFS: introduce categorized lprops counter This commit is a preparation for a subsequent bugfix. We introduce a counter for categorized lprops. Signed-off-by: Artem Bityutskiy Cc: stable@vger.kernel.org --- fs/ubifs/lprops.c | 6 ++++++ fs/ubifs/ubifs.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index e5a2a35a46dc..46190a7c42a6 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, default: ubifs_assert(0); } + lprops->flags &= ~LPROPS_CAT_MASK; lprops->flags |= cat; + c->in_a_category_cnt += 1; + ubifs_assert(c->in_a_category_cnt <= c->main_lebs); } /** @@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c, default: ubifs_assert(0); } + + c->in_a_category_cnt -= 1; + ubifs_assert(c->in_a_category_cnt >= 0); } /** diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5486346d0a3f..d133c276fe05 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1183,6 +1183,8 @@ struct ubifs_debug_info; * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) * @freeable_cnt: number of freeable LEBs in @freeable_list + * @in_a_category_cnt: count of lprops which are in a certain category, which + * basically meants that they were loaded from the flash * * @ltab_lnum: LEB number of LPT's own lprops table * @ltab_offs: offset of LPT's own lprops table @@ -1412,6 +1414,7 @@ struct ubifs_info { struct list_head freeable_list; struct list_head frdi_idx_list; int freeable_cnt; + int in_a_category_cnt; int ltab_lnum; int ltab_offs; -- cgit v1.2.3 From a28ad42a4a0c6f302f488f26488b8b37c9b30024 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 9 Oct 2012 16:20:15 +0300 Subject: UBIFS: fix mounting problems after power cuts This is a bugfix for a problem with the following symptoms: 1. A power cut happens 2. After reboot, we try to mount UBIFS 3. Mount fails with "No space left on device" error message UBIFS complains like this: UBIFS error (pid 28225): grab_empty_leb: could not find an empty LEB The root cause of this problem is that when we mount, not all LEBs are categorized. Only those which were read are. However, the 'ubifs_find_free_leb_for_idx()' function assumes that all LEBs were categorized and 'c->freeable_cnt' is valid, which is a false assumption. This patch fixes the problem by teaching 'ubifs_find_free_leb_for_idx()' to always fall back to LPT scanning if no freeable LEBs were found. This problem was reported by few people in the past, but Brent Taylor was able to reproduce it and send me a flash image which cannot be mounted, which made it easy to hunt the bug. Kudos to Brent. Reported-by: Brent Taylor Signed-off-by: Artem Bityutskiy Cc: stable@vger.kernel.org --- fs/ubifs/find.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 28ec13af28d9..2dcf3d473fec 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c) if (!lprops) { lprops = ubifs_fast_find_freeable(c); if (!lprops) { - ubifs_assert(c->freeable_cnt == 0); - if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + /* + * The first condition means the following: go scan the + * LPT if there are uncategorized lprops, which means + * there may be freeable LEBs there (UBIFS does not + * store the information about freeable LEBs in the + * master node). + */ + if (c->in_a_category_cnt != c->main_lebs || + c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + ubifs_assert(c->freeable_cnt == 0); lprops = scan_for_leb_for_idx(c); if (IS_ERR(lprops)) { err = PTR_ERR(lprops); -- cgit v1.2.3 From 561ec64ae67ef25cac8d72bb9c4bfc955edfd415 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 26 Oct 2012 10:05:07 -0700 Subject: VFS: don't do protected {sym,hard}links by default In commit 800179c9b8a1 ("This adds symlink and hardlink restrictions to the Linux VFS"), the new link protections were enabled by default, in the hope that no actual application would care, despite it being technically against legacy UNIX (and documented POSIX) behavior. However, it does turn out to break some applications. It's rare, and it's unfortunate, but it's unacceptable to break existing systems, so we'll have to default to legacy behavior. In particular, it has broken the way AFD distributes files, see http://www.dwd.de/AFD/ along with some legacy scripts. Distributions can end up setting this at initrd time or in system scripts: if you have security problems due to link attacks during your early boot sequence, you have bigger problems than some kernel sysctl setting. Do: echo 1 > /proc/sys/fs/protected_symlinks echo 1 > /proc/sys/fs/protected_hardlinks to re-enable the link protections. Alternatively, we may at some point introduce a kernel config option that sets these kinds of "more secure but not traditional" behavioural options automatically. Reported-by: Nick Bowler Reported-by: Holger Kiehl Cc: Kees Cook Cc: Ingo Molnar Cc: Andrew Morton Cc: Al Viro Cc: Alan Cox Cc: Theodore Ts'o Cc: stable@kernel.org # v3.6 Signed-off-by: Linus Torvalds --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index d1895f308156..937f9d50c84b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -705,8 +705,8 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki path_put(link); } -int sysctl_protected_symlinks __read_mostly = 1; -int sysctl_protected_hardlinks __read_mostly = 1; +int sysctl_protected_symlinks __read_mostly = 0; +int sysctl_protected_hardlinks __read_mostly = 0; /** * may_follow_link - Check symlink following for unsafe situations -- cgit v1.2.3 From 1a25b1c4ce189e3926f2981f3302352a930086db Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 15 Oct 2012 17:20:17 -0400 Subject: Lock splice_read and splice_write functions Functions generic_file_splice_read and generic_file_splice_write access the pagecache directly. For block devices these functions must be locked so that block size is not changed while they are in progress. This patch is an additional fix for commit b87570f5d349 ("Fix a crash when block device is read and block size is changed at the same time") that locked aio_read, aio_write and mmap against block size change. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/block_dev.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index b3c1d3dae77d..1a1e5e3b1eaf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1661,6 +1661,39 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) return ret; } +static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t ret; + struct block_device *bdev = I_BDEV(file->f_mapping->host); + + percpu_down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_splice_read(file, ppos, pipe, len, flags); + + percpu_up_read(&bdev->bd_block_size_semaphore); + + return ret; +} + +static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe, + struct file *file, loff_t *ppos, size_t len, + unsigned int flags) +{ + ssize_t ret; + struct block_device *bdev = I_BDEV(file->f_mapping->host); + + percpu_down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_splice_write(pipe, file, ppos, len, flags); + + percpu_up_read(&bdev->bd_block_size_semaphore); + + return ret; +} + + /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1699,8 +1732,8 @@ const struct file_operations def_blk_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_read = blkdev_splice_read, + .splice_write = blkdev_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -- cgit v1.2.3 From ffb5387e85d528fb6d0d924abfa3fbf0fc484071 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 28 Oct 2012 22:24:57 -0400 Subject: ext4: fix unjournaled inode bitmap modification commit 119c0d4460b001e44b41dcf73dc6ee794b98bd31 changed ext4_new_inode() such that the inode bitmap was being modified outside a transaction, which could lead to corruption, and was discovered when journal_checksum found a bad checksum in the journal during log replay. Nix ran into this when using the journal_async_commit mount option, which enables journal checksumming. The ensuing journal replay failures due to the bad checksums led to filesystem corruption reported as the now infamous "Apparent serious progressive ext4 data corruption bug" [ Changed by tytso to only call ext4_journal_get_write_access() only when we're fairly certain that we're going to allocate the inode. ] I've tested this by mounting with journal_checksum and running fsstress then dropping power; I've also tested by hacking DM to create snapshots w/o first quiescing, which allows me to test journal replay repeatedly w/o actually power-cycling the box. Without the patch I hit a journal checksum error every time. With this fix it survives many iterations. Reported-by: Nix Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4facdd29a350..3a100e7a62a8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -725,6 +725,10 @@ repeat_in_this_group: "inode=%lu", ino + 1); continue; } + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) + goto fail; ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); ext4_unlock_group(sb, group); @@ -738,6 +742,11 @@ repeat_in_this_group: goto out; got: + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) + goto fail; + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -771,11 +780,6 @@ got: goto fail; } - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, inode_bitmap_bh); - if (err) - goto fail; - BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, group_desc_bh); if (err) @@ -823,11 +827,6 @@ got: } ext4_unlock_group(sb, group); - BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); - if (err) - goto fail; - BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) -- cgit v1.2.3 From 52eb5a900a9863a8b77a895f770e5d825c8e02c6 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 18 Oct 2012 14:01:43 -0700 Subject: ceph: fix dentry reference leak in encode_fh() Call to d_find_alias() needs a corresponding dput() This fixes http://tracker.newdream.net/issues/3271 Signed-off-by: David Zafman Reviewed-by: Sage Weil --- fs/ceph/export.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 8e1b60e557b6..862887004d20 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -90,6 +90,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } + if (dentry) + dput(dentry); return type; } -- cgit v1.2.3 From 08f05c49749ee655bef921d12160960a273aad47 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 Oct 2012 03:37:48 +0000 Subject: Return the right error value when dup[23]() newfd argument is too large Jack Lin reports that the error return from dup3() for the RLIMIT_NOFILE case changed incorrectly after 3.6. The culprit is commit f33ff9927f42 ("take rlimit check to callers of expand_files()") which when it moved the "return -EMFILE" out to the caller, didn't notice that the dup3() had special code to turn the EMFILE return into EBADF. The replace_fd() helper that got added later then inherited the bug too. Reported-by: Jack Lin Signed-off-by: Al Viro [ Noted more bugs, wrote proper changelog, fixed up typos - Linus ] Signed-off-by: Linus Torvalds --- fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index d3b5fa80b71b..708d997a7748 100644 --- a/fs/file.c +++ b/fs/file.c @@ -900,7 +900,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) return __close_fd(files, fd); if (fd >= rlimit(RLIMIT_NOFILE)) - return -EMFILE; + return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); @@ -926,7 +926,7 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) - return -EMFILE; + return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); -- cgit v1.2.3 From 399f11c3d872bd748e1575574de265a6304c7c43 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 30 Oct 2012 16:06:35 -0400 Subject: NFS: Wait for session recovery to finish before returning Currently, we will schedule session recovery and then return to the caller of nfs4_handle_exception. This works for most cases, but causes a hang on the following test case: Client Server ------ ------ Open file over NFS v4.1 Write to file Expire client Try to lock file The server will return NFS4ERR_BADSESSION, prompting the client to schedule recovery. However, the client will continue placing lock attempts and the open recovery never seems to be scheduled. The simplest solution is to wait for session recovery to run before retrying the lock. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 68b21d81b7ac..d5fbf1f49d5f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -339,8 +339,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); nfs4_schedule_session_recovery(clp->cl_session, errorcode); - exception->retry = 1; - break; + goto wait_on_recovery; #endif /* defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: if (exception->timeout > HZ) { -- cgit v1.2.3 From 2240a9e2d013d8269ea425b73e1d7a54c7bc141f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 29 Oct 2012 18:37:40 -0400 Subject: NFSv4.1: We must release the sequence id when we fail to get a session slot If we do not release the sequence id in cases where we fail to get a session slot, then we can deadlock if we hit a recovery scenario. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d5fbf1f49d5f..e0423bb5a880 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1571,9 +1571,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->timestamp = jiffies; if (nfs4_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, - &data->o_res.seq_res, task)) - return; - rpc_call_start(task); + &data->o_res.seq_res, + task) != 0) + nfs_release_seqid(data->o_arg.seqid); + else + rpc_call_start(task); return; unlock_no_action: rcu_read_unlock(); @@ -2295,9 +2297,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (nfs4_setup_sequence(NFS_SERVER(inode), &calldata->arg.seq_args, &calldata->res.seq_res, - task)) - goto out; - rpc_call_start(task); + task) != 0) + nfs_release_seqid(calldata->arg.seqid); + else + rpc_call_start(task); out: dprintk("%s: done!\n", __func__); } @@ -4544,9 +4547,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) calldata->timestamp = jiffies; if (nfs4_setup_sequence(calldata->server, &calldata->arg.seq_args, - &calldata->res.seq_res, task)) - return; - rpc_call_start(task); + &calldata->res.seq_res, + task) != 0) + nfs_release_seqid(calldata->arg.seqid); + else + rpc_call_start(task); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4691,7 +4696,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) - return; + goto out_release_lock_seqid; data->arg.open_stateid = &state->stateid; data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; @@ -4700,10 +4705,15 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) data->timestamp = jiffies; if (nfs4_setup_sequence(data->server, &data->arg.seq_args, - &data->res.seq_res, task)) + &data->res.seq_res, + task) == 0) { + rpc_call_start(task); return; - rpc_call_start(task); - dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); + } + nfs_release_seqid(data->arg.open_seqid); +out_release_lock_seqid: + nfs_release_seqid(data->arg.lock_seqid); + dprintk("%s: done!, ret = %d\n", __func__, task->tk_status); } static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) -- cgit v1.2.3 From 2b1bc308f492589f7d49012ed24561534ea2be8c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 29 Oct 2012 18:53:23 -0400 Subject: NFSv4: nfs4_locku_done must release the sequence id If the state recovery machinery is triggered by the call to nfs4_async_handle_error() then we can deadlock. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e0423bb5a880..1465364501ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4531,6 +4531,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } + nfs_release_seqid(calldata->arg.seqid); } static void nfs4_locku_prepare(struct rpc_task *task, void *data) -- cgit v1.2.3 From 8d96b10639fb402357b75b055b1e82a65ff95050 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 31 Oct 2012 12:16:01 +1100 Subject: NFS: fix bug in legacy DNS resolver. The DNS resolver's use of the sunrpc cache involves a 'ttl' number (relative) rather that a timeout (absolute). This confused me when I wrote commit c5b29f885afe890f953f7f23424045cdad31d3e4 "sunrpc: use seconds since boot in expiry cache" and I managed to break it. The effect is that any TTL is interpreted as 0, and nothing useful gets into the cache. This patch removes the use of get_expiry() - which really expects an expiry time - and uses get_uint() instead, treating the int correctly as a ttl. This fixes a regression that has been present since 2.6.37, causing certain NFS accesses in certain environments to incorrectly fail. Reported-by: Chuck Lever Tested-by: Chuck Lever Cc: stable@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/dns_resolve.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 31c26c4dcc23..ca4b11ec87a2 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -217,7 +217,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) { char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; struct nfs_dns_ent key, *item; - unsigned long ttl; + unsigned int ttl; ssize_t len; int ret = -EINVAL; @@ -240,7 +240,8 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) key.namelen = len; memset(&key.h, 0, sizeof(key.h)); - ttl = get_expiry(&buf); + if (get_uint(&buf, &ttl) < 0) + goto out; if (ttl == 0) goto out; key.h.expiry_time = ttl + seconds_since_boot(); -- cgit v1.2.3 From 7175fe90153e6375082d65884fbb41ab3bbb4901 Mon Sep 17 00:00:00 2001 From: Yanchuan Nian Date: Wed, 31 Oct 2012 16:05:48 +0800 Subject: nfs: Check whether a layout pointer is NULL before free it The new layout pointer in pnfs_find_alloc_layout() may be NULL because of out of memory. we must do some check work, otherwise pnfs_free_layout_hdr() will go wrong because it can not deal with a NULL pointer. Signed-off-by: Yanchuan Nian Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index fe624c91bd00..2878f97bd78d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -925,8 +925,8 @@ pnfs_find_alloc_layout(struct inode *ino, if (likely(nfsi->layout == NULL)) { /* Won the race? */ nfsi->layout = new; return new; - } - pnfs_free_layout_hdr(new); + } else if (new != NULL) + pnfs_free_layout_hdr(new); out_existing: pnfs_get_layout_hdr(nfsi->layout); return nfsi->layout; -- cgit v1.2.3 From acce94e68a0f346115fd41cdc298197d2d5a59ad Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 16 Oct 2012 13:22:19 -0400 Subject: nfsv3: Make v3 mounts fail with ETIMEDOUTs instead EIO on mountd timeouts In very busy v3 environment, rpc.mountd can respond to the NULL procedure but not the MNT procedure in a timely manner causing the MNT procedure to time out. The problem is the mount system call returns EIO which causes the mount to fail, instead of ETIMEDOUT, which would cause the mount to be retried. This patch sets the RPC_TASK_SOFT|RPC_TASK_TIMEOUT flags to the rpc_call_sync() call in nfs_mount() which causes ETIMEDOUT to be returned on timed out connections. Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/mount_clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 8e65c7f1f87c..015f71f8f62c 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -181,7 +181,7 @@ int nfs_mount(struct nfs_mount_request *info) else msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; - status = rpc_call_sync(mnt_clnt, &msg, 0); + status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT); rpc_shutdown_client(mnt_clnt); if (status < 0) -- cgit v1.2.3 From 97a54868262da1629a3e65121e65b8e8c4419d9f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 21 Oct 2012 19:23:52 +0100 Subject: nfs: Show original device name verbatim in /proc/*/mount{s,info} Since commit c7f404b ('vfs: new superblock methods to override /proc/*/mount{s,info}'), nfs_path() is used to generate the mounted device name reported back to userland. nfs_path() always generates a trailing slash when the given dentry is the root of an NFS mount, but userland may expect the original device name to be returned verbatim (as it used to be). Make this canonicalisation optional and change the callers accordingly. [jrnieder@gmail.com: use flag instead of bool argument] Reported-and-tested-by: Chris Hiestand Reference: http://bugs.debian.org/669314 Signed-off-by: Ben Hutchings Cc: # v2.6.39+ Signed-off-by: Jonathan Nieder Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 5 +++-- fs/nfs/namespace.c | 19 ++++++++++++++----- fs/nfs/nfs4namespace.c | 3 ++- fs/nfs/super.c | 2 +- 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 59b133c5d652..a54fe51c1dfb 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -353,8 +353,9 @@ extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ +#define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, - char *buffer, ssize_t buflen); + char *buffer, ssize_t buflen, unsigned flags); extern struct vfsmount *nfs_d_automount(struct path *path); struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); @@ -498,7 +499,7 @@ static inline char *nfs_devname(struct dentry *dentry, char *buffer, ssize_t buflen) { char *dummy; - return nfs_path(&dummy, dentry, buffer, buflen); + return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); } /* diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 655925373b91..dd057bc6b65b 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -33,6 +33,7 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; * @dentry - pointer to dentry * @buffer - result buffer * @buflen - length of buffer + * @flags - options (see below) * * Helper function for constructing the server pathname * by arbitrary hashed dentry. @@ -40,8 +41,14 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; * This is mainly for use in figuring out the path on the * server side when automounting on top of an existing partition * and in generating /proc/mounts and friends. + * + * Supported flags: + * NFS_PATH_CANONICAL: ensure there is exactly one slash after + * the original device (export) name + * (if unset, the original name is returned verbatim) */ -char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen) +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, + unsigned flags) { char *end; int namelen; @@ -74,7 +81,7 @@ rename_retry: rcu_read_unlock(); goto rename_retry; } - if (*end != '/') { + if ((flags & NFS_PATH_CANONICAL) && *end != '/') { if (--buflen < 0) { spin_unlock(&dentry->d_lock); rcu_read_unlock(); @@ -91,9 +98,11 @@ rename_retry: return end; } namelen = strlen(base); - /* Strip off excess slashes in base string */ - while (namelen > 0 && base[namelen - 1] == '/') - namelen--; + if (flags & NFS_PATH_CANONICAL) { + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + } buflen -= namelen; if (buflen < 0) { spin_unlock(&dentry->d_lock); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 79fbb61ce202..1e09eb78543b 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -81,7 +81,8 @@ static char *nfs_path_component(const char *nfspath, const char *end) static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) { char *limit; - char *path = nfs_path(&limit, dentry, buffer, buflen); + char *path = nfs_path(&limit, dentry, buffer, buflen, + NFS_PATH_CANONICAL); if (!IS_ERR(path)) { char *path_component = nfs_path_component(path, limit); if (path_component) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e831bce49766..13c2a5be4765 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -771,7 +771,7 @@ int nfs_show_devname(struct seq_file *m, struct dentry *root) int err = 0; if (!page) return -ENOMEM; - devname = nfs_path(&dummy, root, page, PAGE_SIZE); + devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0); if (IS_ERR(devname)) err = PTR_ERR(devname); else -- cgit v1.2.3 From 324d003b0cd82151adbaecefef57b73f7959a469 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 30 Oct 2012 17:01:39 -0400 Subject: NFS: add nfs_sb_deactive_async to avoid deadlock Use nfs_sb_deactive_async instead of nfs_sb_deactive when in a workqueue context. This avoids a deadlock where rpc_shutdown_client loops forever in a workqueue kworker context, trying to kill all RPC tasks associated with the client, while one or more of these tasks have already been assigned to the same kworker (and will never run rpc_exit_task). This approach is needed because RPC tasks that have already been assigned to a kworker by queue_work cannot be canceled, as explained in the comment for workqueue.c:insert_wq_barrier. Signed-off-by: Weston Andros Adamson [Trond: add module_get/put.] Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 ++++- fs/nfs/internal.h | 1 + fs/nfs/nfs4proc.c | 2 +- fs/nfs/super.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/unlink.c | 2 +- 5 files changed, 56 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5c7325c5c5e6..6fa01aea2488 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -685,7 +685,10 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) if (ctx->cred != NULL) put_rpccred(ctx->cred); dput(ctx->dentry); - nfs_sb_deactive(sb); + if (is_sync) + nfs_sb_deactive(sb); + else + nfs_sb_deactive_async(sb); kfree(ctx->mdsthreshold); kfree(ctx); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a54fe51c1dfb..05521cadac2e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -351,6 +351,7 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +extern void nfs_sb_deactive_async(struct super_block *sb); /* namespace.c */ #define NFS_PATH_CANONICAL 1 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1465364501ba..8cfbac1a8d5e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2197,7 +2197,7 @@ static void nfs4_free_closedata(void *data) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - nfs_sb_deactive(sb); + nfs_sb_deactive_async(sb); kfree(calldata); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 13c2a5be4765..652d3f7176a9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -54,6 +54,7 @@ #include #include #include +#include #include @@ -415,6 +416,54 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); +static int nfs_deactivate_super_async_work(void *ptr) +{ + struct super_block *sb = ptr; + + deactivate_super(sb); + module_put_and_exit(0); + return 0; +} + +/* + * same effect as deactivate_super, but will do final unmount in kthread + * context + */ +static void nfs_deactivate_super_async(struct super_block *sb) +{ + struct task_struct *task; + char buf[INET6_ADDRSTRLEN + 1]; + struct nfs_server *server = NFS_SB(sb); + struct nfs_client *clp = server->nfs_client; + + if (!atomic_add_unless(&sb->s_active, -1, 1)) { + rcu_read_lock(); + snprintf(buf, sizeof(buf), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + + __module_get(THIS_MODULE); + task = kthread_run(nfs_deactivate_super_async_work, sb, + "%s-deactivate-super", buf); + if (IS_ERR(task)) { + pr_err("%s: kthread_run: %ld\n", + __func__, PTR_ERR(task)); + /* make synchronous call and hope for the best */ + deactivate_super(sb); + module_put(THIS_MODULE); + } + } +} + +void nfs_sb_deactive_async(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + nfs_deactivate_super_async(sb); +} +EXPORT_SYMBOL_GPL(nfs_sb_deactive_async); + /* * Deliver file system statistics to userspace */ diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 13cea637eff8..3f79c77153b8 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive(sb); + nfs_sb_deactive_async(sb); } static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) -- cgit v1.2.3 From f9b1ef5f06d65a01952169b67d474f7f0dcb0206 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 29 Oct 2012 16:48:40 -0400 Subject: NFSv4: Initialise the NFSv4.1 slot table highest_used_slotid correctly Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8cfbac1a8d5e..091baab3eccf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5677,7 +5677,7 @@ static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, tbl->slots = new; tbl->max_slots = max_slots; } - tbl->highest_used_slotid = -1; /* no slot is currently used */ + tbl->highest_used_slotid = NFS4_NO_SLOT; for (i = 0; i < tbl->max_slots; i++) tbl->slots[i].seq_nr = ivalue; spin_unlock(&tbl->slot_tbl_lock); -- cgit v1.2.3 From 998f40b550f257e436485291802fa938e4cf580f Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 2 Nov 2012 18:00:56 -0400 Subject: NFS4: nfs4_opendata_access should return errno Return errno - not an NFS4ERR_. This worked because NFS4ERR_ACCESS == EACCES. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 091baab3eccf..5eec4429970c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1749,7 +1749,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, /* even though OPEN succeeded, access is denied. Close the file */ nfs4_close_state(state, fmode); - return -NFS4ERR_ACCESS; + return -EACCES; } /* -- cgit v1.2.3 From 36960e440ccf94349c09fb944930d3bfe4bc473f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 3 Nov 2012 09:37:28 -0400 Subject: cifs: fix potential buffer overrun in cifs.idmap handling code The userspace cifs.idmap program generally works with the wbclient libs to generate binary SIDs in userspace. That program defines the struct that holds these values as having a max of 15 subauthorities. The kernel idmapping code however limits that value to 5. When the kernel copies those values around though, it doesn't sanity check the num_subauths value handed back from userspace or from the server. It's possible therefore for userspace to hand us back a bogus num_subauths value (or one that's valid, but greater than 5) that could cause the kernel to walk off the end of the cifs_sid->sub_auths array. Fix this by defining a new routine for copying sids and using that in all of the places that copy it. If we end up with a sid that's longer than expected then this approach will just lop off the "extra" subauths, but that's basically what the code does today already. Better approaches might be to fix this code to reject SIDs with >5 subauths, or fix it to handle the subauths array dynamically. At the same time, change the kernel to check the length of the data returned by userspace. If it's shorter than struct cifs_sid, reject it and return -EIO. If that happens we'll end up with fields that are basically uninitialized. Long term, it might make sense to redefine cifs_sid using a flexarray at the end, to allow for variable-length subauth lists, and teach the code to handle the case where the subauths array being passed in from userspace is shorter than 5 elements. Note too, that I don't consider this a security issue since you'd need a compromised cifs.idmap program. If you have that, you can do all sorts of nefarious stuff. Still, this is probably reasonable for stable. Cc: stable@kernel.org Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton --- fs/cifs/cifsacl.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fc783e264420..0fb15bbbe43c 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -224,6 +224,13 @@ sid_to_str(struct cifs_sid *sidptr, char *sidstr) } } +static void +cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) +{ + memcpy(dst, src, sizeof(*dst)); + dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS); +} + static void id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, struct cifs_sid_id **psidid, char *typestr) @@ -248,7 +255,7 @@ id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, } } - memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid)); + cifs_copy_sid(&(*psidid)->sid, sidptr); (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); (*psidid)->refcount = 0; @@ -354,7 +361,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) * any fields of the node after a reference is put . */ if (test_bit(SID_ID_MAPPED, &psidid->state)) { - memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + cifs_copy_sid(ssid, &psidid->sid); psidid->time = jiffies; /* update ts for accessing */ goto id_sid_out; } @@ -370,14 +377,14 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) if (IS_ERR(sidkey)) { rc = -EINVAL; cFYI(1, "%s: Can't map and id to a SID", __func__); + } else if (sidkey->datalen < sizeof(struct cifs_sid)) { + rc = -EIO; + cFYI(1, "%s: Downcall contained malformed key " + "(datalen=%hu)", __func__, sidkey->datalen); } else { lsid = (struct cifs_sid *)sidkey->payload.data; - memcpy(&psidid->sid, lsid, - sidkey->datalen < sizeof(struct cifs_sid) ? - sidkey->datalen : sizeof(struct cifs_sid)); - memcpy(ssid, &psidid->sid, - sidkey->datalen < sizeof(struct cifs_sid) ? - sidkey->datalen : sizeof(struct cifs_sid)); + cifs_copy_sid(&psidid->sid, lsid); + cifs_copy_sid(ssid, &psidid->sid); set_bit(SID_ID_MAPPED, &psidid->state); key_put(sidkey); kfree(psidid->sidstr); @@ -396,7 +403,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) return rc; } if (test_bit(SID_ID_MAPPED, &psidid->state)) - memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + cifs_copy_sid(ssid, &psidid->sid); else rc = -EINVAL; } @@ -675,8 +682,6 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) static void copy_sec_desc(const struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 sidsoffset) { - int i; - struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; @@ -692,26 +697,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); - - nowner_sid_ptr->revision = owner_sid_ptr->revision; - nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth; - for (i = 0; i < 6; i++) - nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i]; - for (i = 0; i < 5; i++) - nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i]; + cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); /* copy group sid */ group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + sizeof(struct cifs_sid)); - - ngroup_sid_ptr->revision = group_sid_ptr->revision; - ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth; - for (i = 0; i < 6; i++) - ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; - for (i = 0; i < 5; i++) - ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; + cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); return; } @@ -1120,8 +1113,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, kfree(nowner_sid_ptr); return rc; } - memcpy(owner_sid_ptr, nowner_sid_ptr, - sizeof(struct cifs_sid)); + cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr); kfree(nowner_sid_ptr); *aclflag = CIFS_ACL_OWNER; } @@ -1139,8 +1131,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, kfree(ngroup_sid_ptr); return rc; } - memcpy(group_sid_ptr, ngroup_sid_ptr, - sizeof(struct cifs_sid)); + cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr); kfree(ngroup_sid_ptr); *aclflag = CIFS_ACL_GROUP; } -- cgit v1.2.3 From 3798f47aa276b332c30da499cb4df4577e2f8872 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 5 Nov 2012 11:39:32 +0000 Subject: cifs: Do not lookup hashed negative dentry in cifs_atomic_open We do not need to lookup a hashed negative directory since we have already revalidated it before and have found it to be fine. This also prevents a crash in cifs_lookup() when it attempts to rehash the already hashed negative lookup dentry. The patch has been tested using the reproducer at https://bugzilla.redhat.com/show_bug.cgi?id=867344#c28 Cc: # 3.6.x Reported-by: Vit Zahradka Signed-off-by: Sachin Prabhu --- fs/cifs/dir.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 7c0a81283645..d3671f2acb29 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -398,7 +398,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, * in network traffic in the other paths. */ if (!(oflags & O_CREAT)) { - struct dentry *res = cifs_lookup(inode, direntry, 0); + struct dentry *res; + + /* + * Check for hashed negative dentry. We have already revalidated + * the dentry and it is fine. No need to perform another lookup. + */ + if (!d_unhashed(direntry)) + return -ENOENT; + + res = cifs_lookup(inode, direntry, 0); if (IS_ERR(res)) return PTR_ERR(res); -- cgit v1.2.3 From aaaf68c5629108f6078ab458d34a661143ea6857 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Fri, 12 Oct 2012 16:45:08 +0100 Subject: GFS2: Fix an unchecked error from gfs2_rs_alloc Check the return value of gfs2_rs_alloc(ip) and avoid a possible null pointer dereference. Signed-off-by: Andrew Price Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 40c4b0d42fa8..c5af8e18f27a 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -497,8 +497,11 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) struct gfs2_quota_data **qd; int error; - if (ip->i_res == NULL) - gfs2_rs_alloc(ip); + if (ip->i_res == NULL) { + error = gfs2_rs_alloc(ip); + if (error) + return error; + } qd = ip->i_res->rs_qa_qd; -- cgit v1.2.3 From cd0ed19fb614cb1315c0a510ec6c163d8324fd82 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Fri, 12 Oct 2012 16:45:09 +0100 Subject: GFS2: Fix possible null pointer deref in gfs2_rs_alloc Despite the return value from kmem_cache_zalloc() being checked, the error wasn't being returned until after a possible null pointer dereference. This patch returns the error immediately, allowing the removal of the error variable. Signed-off-by: Andrew Price Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3cc402ce6fea..43d1a20bdbe4 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -553,7 +553,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) */ int gfs2_rs_alloc(struct gfs2_inode *ip) { - int error = 0; struct gfs2_blkreserv *res; if (ip->i_res) @@ -561,7 +560,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); if (!res) - error = -ENOMEM; + return -ENOMEM; RB_CLEAR_NODE(&res->rs_node); @@ -571,7 +570,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) else ip->i_res = res; up_write(&ip->i_rw_mutex); - return error; + return 0; } static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) -- cgit v1.2.3 From 73738a77f42c2d7f53fd61f73272c9dd6f520897 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Fri, 12 Oct 2012 16:45:10 +0100 Subject: GFS2: Clean up some unused assignments Cleans up two cases where variables were assigned values but then never used again. Signed-off-by: Andrew Price Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 -- fs/gfs2/lops.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 0def0504afc1..377a68dbd066 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -677,10 +677,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, size_t writesize = iov_length(iov, nr_segs); struct dentry *dentry = file->f_dentry; struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_sbd *sdp; int ret; - sdp = GFS2_SB(file->f_mapping->host); ret = gfs2_rs_alloc(ip); if (ret) return ret; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 8ff95a2d54ee..01e444b5b2bd 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -621,7 +621,6 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) static void revoke_lo_before_commit(struct gfs2_sbd *sdp) { - struct gfs2_log_descriptor *ld; struct gfs2_meta_header *mh; unsigned int offset; struct list_head *head = &sdp->sd_log_le_revoke; @@ -634,7 +633,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); - ld = page_address(page); offset = sizeof(struct gfs2_log_descriptor); list_for_each_entry(bd, head, bd_list) { -- cgit v1.2.3 From 3a238adefb8c5b8cb8cde0ce689d513306176ff4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Oct 2012 11:39:07 +0200 Subject: GFS2: Require user to provide argument for FITRIM When the fstrim_range argument is not provided by user in FITRIM ioctl we should just return EFAULT and not promoting bad behaviour by filling the structure in kernel. Let the user deal with it. Signed-off-by: Lukas Czerner Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 43d1a20bdbe4..b6bbf718d6c3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1270,11 +1270,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if (argp == NULL) { - r.start = 0; - r.len = ULLONG_MAX; - r.minlen = 0; - } else if (copy_from_user(&r, argp, sizeof(r))) + if (copy_from_user(&r, argp, sizeof(r))) return -EFAULT; ret = gfs2_rindex_update(sdp); @@ -1323,7 +1319,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) out: r.len = trimmed << 9; - if (argp && copy_to_user(argp, &r, sizeof(r))) + if (copy_to_user(argp, &r, sizeof(r))) return -EFAULT; return ret; -- cgit v1.2.3 From 076f0faa764ab3a5a32fc726ae05e2de0e66151d Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Oct 2012 11:39:08 +0200 Subject: GFS2: Fix FITRIM argument handling Currently implementation in gfs2 uses FITRIM arguments as it were in file system blocks units which is wrong. The FITRIM arguments (fstrim_range.start, fstrim_range.len and fstrim_range.minlen) are actually in bytes. Moreover, check for start argument beyond the end of file system, len argument being smaller than file system block and minlen argument being bigger than biggest resource group were missing. This commit converts the code to convert FITRIM argument to file system blocks and also adds appropriate checks mentioned above. All the problems were recognised by xfstests 251 and 260. Signed-off-by: Lukas Czerner Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index b6bbf718d6c3..38fe18f2f055 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1262,7 +1262,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp) int ret = 0; u64 amt; u64 trimmed = 0; + u64 start, end, minlen; unsigned int x; + unsigned bs_shift = sdp->sd_sb.sb_bsize_shift; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1277,8 +1279,18 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (ret) return ret; - rgd = gfs2_blk2rgrpd(sdp, r.start, 0); - rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); + start = r.start >> bs_shift; + end = start + (r.len >> bs_shift); + minlen = max_t(u64, r.minlen, + q->limits.discard_granularity) >> bs_shift; + + rgd = gfs2_blk2rgrpd(sdp, start, 0); + rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0); + + if (end <= start || + minlen > sdp->sd_max_rg_data || + start > rgd_end->rd_data0 + rgd_end->rd_data) + return -EINVAL; while (1) { @@ -1290,7 +1302,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp) /* Trim each bitmap in the rgrp */ for (x = 0; x < rgd->rd_length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt); + ret = gfs2_rgrp_send_discards(sdp, + rgd->rd_data0, NULL, bi, minlen, + &amt); if (ret) { gfs2_glock_dq_uninit(&gh); goto out; -- cgit v1.2.3 From 3d1626889a64bd5a661544d582036a0a02104a60 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 6 Nov 2012 00:49:28 -0600 Subject: GFS2: Don't call file_accessed() with a shared glock file_accessed() was being called by gfs2_mmap() with a shared glock. If it needed to update the atime, it was crashing because it dirtied the inode in gfs2_dirty_inode() without holding an exclusive lock. gfs2_dirty_inode() checked if the caller was already holding a glock, but it didn't make sure that the glock was in the exclusive state. Now, instead of calling file_accessed() while holding the shared lock in gfs2_mmap(), file_accessed() is called after grabbing and releasing the glock to update the inode. If file_accessed() needs to update the atime, it will grab an exclusive lock in gfs2_dirty_inode(). gfs2_dirty_inode() now also checks to make sure that if the calling process has already locked the glock, it has an exclusive lock. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 12 +++++------- fs/gfs2/super.c | 3 ++- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 377a68dbd066..e056b4ce4877 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -516,15 +516,13 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) struct gfs2_holder i_gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error == 0) { - file_accessed(file); - gfs2_glock_dq(&i_gh); - } - gfs2_holder_uninit(&i_gh); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); if (error) return error; + /* grab lock to update inode */ + gfs2_glock_dq_uninit(&i_gh); + file_accessed(file); } vma->vm_ops = &gfs2_vm_ops; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index bc737261f234..d6488674d916 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -810,7 +810,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) return; } need_unlock = 1; - } + } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) + return; if (current->journal_info == NULL) { ret = gfs2_trans_begin(sdp, RES_DINODE, 0); -- cgit v1.2.3 From 96e5d1d3adf56f1c7eeb07258f6a1a0a7ae9c489 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Wed, 7 Nov 2012 00:38:06 -0600 Subject: GFS2: Test bufdata with buffer locked and gfs2_log_lock held In gfs2_trans_add_bh(), gfs2 was testing if a there was a bd attached to the buffer without having the gfs2_log_lock held. It was then assuming it would stay attached for the rest of the function. However, without either the log lock being held of the buffer locked, __gfs2_ail_flush() could detach bd at any time. This patch moves the locking before the test. If there isn't a bd already attached, gfs2 can safely allocate one and attach it before locking. There is no way that the newly allocated bd could be on the ail list, and thus no way for __gfs2_ail_flush() to detach it. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 14 ++------------ fs/gfs2/trans.c | 8 ++++++++ 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 01e444b5b2bd..9ceccb1595a3 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -393,12 +393,10 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct gfs2_meta_header *mh; struct gfs2_trans *tr; - lock_buffer(bd->bd_bh); - gfs2_log_lock(sdp); tr = current->journal_info; tr->tr_touched = 1; if (!list_empty(&bd->bd_list)) - goto out; + return; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; @@ -414,9 +412,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) sdp->sd_log_num_buf++; list_add(&bd->bd_list, &sdp->sd_log_le_buf); tr->tr_num_buf_new++; -out: - gfs2_log_unlock(sdp); - unlock_buffer(bd->bd_bh); } static void gfs2_check_magic(struct buffer_head *bh) @@ -775,12 +770,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct address_space *mapping = bd->bd_bh->b_page->mapping; struct gfs2_inode *ip = GFS2_I(mapping->host); - lock_buffer(bd->bd_bh); - gfs2_log_lock(sdp); if (tr) tr->tr_touched = 1; if (!list_empty(&bd->bd_list)) - goto out; + return; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); if (gfs2_is_jdata(ip)) { @@ -791,9 +784,6 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) } else { list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); } -out: - gfs2_log_unlock(sdp); - unlock_buffer(bd->bd_bh); } /** diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index adbd27875ef9..413627072f36 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -155,14 +155,22 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_bufdata *bd; + lock_buffer(bh); + gfs2_log_lock(sdp); bd = bh->b_private; if (bd) gfs2_assert(sdp, bd->bd_gl == gl); else { + gfs2_log_unlock(sdp); + unlock_buffer(bh); gfs2_attach_bufdata(gl, bh, meta); bd = bh->b_private; + lock_buffer(bh); + gfs2_log_lock(sdp); } lops_add(sdp, bd); + gfs2_log_unlock(sdp); + unlock_buffer(bh); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -- cgit v1.2.3 From 7e9620f21d8c9e389fd6845487e07d5df898a2e4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:12 +1100 Subject: xfs: only update the last_sync_lsn when a transaction completes The log write code stamps each iclog with the current tail LSN in the iclog header so that recovery knows where to find the tail of thelog once it has found the head. Normally this is taken from the first item on the AIL - the log item that corresponds to the oldest active item in the log. The problem is that when the AIL is empty, the tail lsn is dervied from the the l_last_sync_lsn, which is the LSN of the last iclog to be written to the log. In most cases this doesn't happen, because the AIL is rarely empty on an active filesystem. However, when it does, it opens up an interesting case when the transaction being committed to the iclog spans multiple iclogs. That is, the first iclog is stamped with the l_last_sync_lsn, and IO is issued. Then the next iclog is setup, the changes copied into the iclog (takes some time), and then the l_last_sync_lsn is stamped into the header and IO is issued. This is still the same transaction, so the tail lsn of both iclogs must be the same for log recovery to find the entire transaction to be able to replay it. The problem arises in that the iclog buffer IO completion updates the l_last_sync_lsn with it's own LSN. Therefore, If the first iclog completes it's IO before the second iclog is filled and has the tail lsn stamped in it, it will stamp the LSN of the first iclog into it's tail lsn field. If the system fails at this point, log recovery will not see a complete transaction, so the transaction will no be replayed. The fix is simple - the l_last_sync_lsn is updated when a iclog buffer IO completes, and this is incorrect. The l_last_sync_lsn shoul dbe updated when a transaction is completed by a iclog buffer IO. That is, only iclog buffers that have transaction commit callbacks attached to them should update the l_last_sync_lsn. This means that the last_sync_lsn will only move forward when a commit record it written, not in the middle of a large transaction that is rolling through multiple iclog buffers. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7f4f9370d0e7..4dad756962d0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2387,14 +2387,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving th etail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the * icloglock to ensure we are the only one that * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; -- cgit v1.2.3 From 408cc4e97a3ccd172d2d676e4b585badf439271b Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 20 Sep 2012 13:16:45 -0500 Subject: xfs: zero allocation_args on the kernel stack Zero the kernel stack space that makes up the xfs_alloc_arg structures. Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 1 + fs/xfs/xfs_bmap.c | 3 +++ fs/xfs/xfs_ialloc.c | 1 + 3 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4f33c32affe3..0287f3b1b503 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1866,6 +1866,7 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 848ffa77707b..e1545ec2f7d2 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2437,6 +2437,7 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; @@ -3082,6 +3083,7 @@ xfs_bmap_extents_to_btree( * Convert to a btree with two levels, one record in root. */ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; args.firstblock = *firstblock; @@ -3237,6 +3239,7 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 445bf1aef31c..c5c4ef4f2bdb 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -250,6 +250,7 @@ xfs_ialloc_ag_alloc( /* boundary */ struct xfs_perag *pag; + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; -- cgit v1.2.3 From 326c03555b914ff153ba5b40df87fd6e28e7e367 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:58 +1000 Subject: xfs: introduce XFS_BMAPI_STACK_SWITCH Certain allocation paths through xfs_bmapi_write() are in situations where we have limited stack available. These are almost always in the buffered IO writeback path when convertion delayed allocation extents to real extents. The current stack switch occurs for userdata allocations, which means we also do stack switches for preallocation, direct IO and unwritten extent conversion, even those these call chains have never been implicated in a stack overrun. Hence, let's target just the single stack overun offended for stack switches. To do that, introduce a XFS_BMAPI_STACK_SWITCH flag that the caller can pass xfs_bmapi_write() to indicate it should switch stacks if it needs to do allocation. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_alloc.h | 1 + fs/xfs/xfs_bmap.c | 4 ++++ fs/xfs/xfs_bmap.h | 5 ++++- fs/xfs/xfs_iomap.c | 4 +++- 5 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0287f3b1b503..43f791bcd8b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2447,7 +2447,7 @@ xfs_alloc_vextent( { DECLARE_COMPLETION_ONSTACK(done); - if (!args->userdata) + if (!args->stack_switch) return __xfs_alloc_vextent(args); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 93be4a667ca1..ef7d4885dc2d 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -123,6 +123,7 @@ typedef struct xfs_alloc_arg { struct completion *done; struct work_struct work; int result; + char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e1545ec2f7d2..91259554df8b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,6 +2441,7 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; + args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4675,6 +4676,9 @@ xfs_bmapi_allocate( return error; } + if (flags & XFS_BMAPI_STACK_SWITCH) + bma->stack_switch = 1; + error = xfs_bmap_alloc(bma); if (error) return error; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 803b56d7ce16..b68c598034c1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,6 +77,7 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -85,7 +86,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } static inline int xfs_bmapi_aflag(int w) @@ -133,6 +135,7 @@ typedef struct xfs_bmalloca { char userdata;/* set if is user data */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ + char stack_switch; } xfs_bmalloca_t; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 973dff6ad935..7f537663365b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -584,7 +584,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, 1, + count_fsb, + XFS_BMAPI_STACK_SWITCH, + &first_block, 1, imap, &nimaps, &free_list); if (error) goto trans_cancel; -- cgit v1.2.3 From 1f3c785c3adb7d2b109ec7c8f10081d1294b03d3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:59 +1000 Subject: xfs: move allocation stack switch up to xfs_bmapi_allocate Switching stacks are xfs_alloc_vextent can cause deadlocks when we run out of worker threads on the allocation workqueue. This can occur because xfs_bmap_btalloc can make multiple calls to xfs_alloc_vextent() and even if xfs_alloc_vextent() fails it can return with the AGF locked in the current allocation transaction. If we then need to make another allocation, and all the allocation worker contexts are exhausted because the are blocked waiting for the AGF lock, holder of the AGF cannot get it's xfs-alloc_vextent work completed to release the AGF. Hence allocation effectively deadlocks. To avoid this, move the stack switch one layer up to xfs_bmapi_allocate() so that all of the allocation attempts in a single switched stack transaction occur in a single worker context. This avoids the problem of an allocation being blocked waiting for a worker thread whilst holding the AGF. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 42 +------------------------------------- fs/xfs/xfs_alloc.h | 4 ---- fs/xfs/xfs_bmap.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_bmap.h | 4 ++++ 4 files changed, 54 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 43f791bcd8b1..335206a9c698 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2208,7 +2208,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2418,46 +2418,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index ef7d4885dc2d..feacb061bab7 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,10 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; - char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 91259554df8b..83d0cf3df930 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,7 +2441,6 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; - args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4620,12 +4619,11 @@ xfs_bmapi_delay( STATIC int -xfs_bmapi_allocate( - struct xfs_bmalloca *bma, - int flags) +__xfs_bmapi_allocate( + struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; @@ -4658,25 +4656,25 @@ xfs_bmapi_allocate( * Indicate if this is the first user data in the file, or just any * user data. */ - if (!(flags & XFS_BMAPI_METADATA)) { + if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. */ if (mp->m_dalign && bma->length >= mp->m_dalign && - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } - if (flags & XFS_BMAPI_STACK_SWITCH) + if (bma->flags & XFS_BMAPI_STACK_SWITCH) bma->stack_switch = 1; error = xfs_bmap_alloc(bma); @@ -4713,7 +4711,7 @@ xfs_bmapi_allocate( * A wasdelay extent has been initialized, so shouldn't be flagged * as unwritten. */ - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4741,6 +4739,45 @@ xfs_bmapi_allocate( return 0; } +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. + */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4926,6 +4963,7 @@ xfs_bmapi_write( bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; + bma.flags = flags; /* * There's a 32/64 bit type mismatch between the @@ -4941,7 +4979,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); - error = xfs_bmapi_allocate(&bma, flags); + error = xfs_bmapi_allocate(&bma); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index b68c598034c1..5f469c3516eb 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -136,6 +136,10 @@ typedef struct xfs_bmalloca { char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; } xfs_bmalloca_t; /* -- cgit v1.2.3 From eaef854335ce09956e930fe4a193327417edc6c9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Oct 2012 14:50:52 +1100 Subject: xfs: growfs: don't read garbage for new secondary superblocks When updating new secondary superblocks in a growfs operation, the superblock buffer is read from the newly grown region of the underlying device. This is not guaranteed to be zero, so violates the underlying assumption that the unused parts of superblocks are zero filled. Get a new buffer for these secondary superblocks to ensure that the unused regions are zero filled correctly. Signed-off-by: Dave Chinner Reviewed-by: Carlos Maiolino Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c25b094efbf7..4beaede43277 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -399,9 +399,26 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + else + error = ENOMEM; + } + if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", @@ -423,7 +440,7 @@ xfs_growfs_data_private( break; /* no point in continuing */ } } - return 0; + return error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); -- cgit v1.2.3 From 1e7acbb7bc1ae7c1c62fd1310b3176a820225056 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Oct 2012 17:22:30 +1100 Subject: xfs: silence uninitialised f.file warning. Uninitialised variable build warning introduced by 2903ff0 ("switch simple cases of fget_light to fdget"), gcc is not smart enough to work out that the variable is not used uninitialised, and the commit removed the initialisation at declaration that the old variable had. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8305f2ac6773..c1df3c623de2 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -70,7 +70,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f; + struct fd f = {0}; struct path path; int error; struct xfs_inode *ip; -- cgit v1.2.3 From ca250b1b3d711936d7dae9e97871f2261347f82d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:41 +1100 Subject: xfs: invalidate allocbt blocks moved to the free list When we free a block from the alloc btree tree, we move it to the freelist held in the AGFL and mark it busy in the busy extent tree. This typically happens when we merge btree blocks. Once the transaction is committed and checkpointed, the block can remain on the free list for an indefinite amount of time. Now, this isn't the end of the world at this point - if the free list is shortened, the buffer is invalidated in the transaction that moves it back to free space. If the buffer is allocated as metadata from the free list, then all the modifications getted logged, and we have no issues, either. And if it gets allocated as userdata direct from the freelist, it gets invalidated and so will never get written. However, during the time it sits on the free list, pressure on the log can cause the AIL to be pushed and the buffer that covers the block gets pushed for write. IOWs, we end up writing a freed metadata block to disk. Again, this isn't the end of the world because we know from the above we are only writing to free space. The problem, however, is for validation callbacks. If the block was on old btree root block, then the level of the block is going to be higher than the current tree root, and so will fail validation. There may be other inconsistencies in the block as well, and currently we don't care because the block is in free space. Shutting down the filesystem because a freed block doesn't pass write validation, OTOH, is rather unfriendly. So, make sure we always invalidate buffers as they move from the free space trees to the free list so that we guarantee they never get written to disk while on the free list. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f1647caace8f..f7876c6d6165 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -121,6 +121,8 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); return 0; } -- cgit v1.2.3 From 4b62acfe99e158fb7812982d1cf90a075710a92c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:42 +1100 Subject: xfs: don't vmap inode cluster buffers during free Inode buffers do not need to be mapped as inodes are read or written directly from/to the pages underlying the buffer. This fixes a regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the default behaviour"). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2778258fcfa2..1938b41ee9f5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1509,7 +1509,8 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!bp) return ENOMEM; -- cgit v1.2.3 From 03b1293edad462ad1ad62bcc5160c76758e450d5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 14:23:12 +1100 Subject: xfs: fix buffer shudown reference count mismatch When we shut down the filesystem, we have to unpin and free all the buffers currently active in the CIL. To do this we unpin and remove them in one operation as a result of a failed iclogbuf write. For buffers, we do this removal via a simultated IO completion of after marking the buffer stale. At the time we do this, we have two references to the buffer - the active LRU reference and the buf log item. The LRU reference is removed by marking the buffer stale, and the active CIL reference is by the xfs_buf_iodone() callback that is run by xfs_buf_do_callbacks() during ioend processing (via the bp->b_iodone callback). However, ioend processing requires one more reference - that of the IO that it is completing. We don't have this reference, so we free the buffer prematurely and use it after it is freed. For buffers marked with XBF_ASYNC, this leads to assert failures in xfs_buf_rele() on debug kernels because the b_hold count is zero. Fix this by making sure we take the necessary IO reference before starting IO completion processing on the stale buffer, and set the XBF_ASYNC flag to ensure that IO completion processing removes all the active references from the buffer to ensure it is fully torn down. Cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a8d0ed911196..becf4a97efc6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -526,7 +526,25 @@ xfs_buf_item_unpin( } xfs_buf_relse(bp); } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); -- cgit v1.2.3 From 6ce377afd1755eae5c93410ca9a1121dfead7b87 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:44 +1100 Subject: xfs: fix reading of wrapped log data Commit 4439647 ("xfs: reset buffer pointers before freeing them") in 3.0-rc1 introduced a regression when recovering log buffers that wrapped around the end of log. The second part of the log buffer at the start of the physical log was being read into the header buffer rather than the data buffer, and hence recovery was seeing garbage in the data buffer when it got to the region of the log buffer that was incorrectly read. Cc: # 3.0.x, 3.2.x, 3.4.x 3.6.x Reported-by: Torsten Kaiser Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log_recover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5da3ace352bf..d308749fabf1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3541,7 +3541,7 @@ xlog_do_recovery_pass( * - order is important. */ error = xlog_bread_offset(log, 0, - bblks - split_bblks, hbp, + bblks - split_bblks, dbp, offset + BBTOB(split_bblks)); if (error) goto bread_err2; -- cgit v1.2.3 From a80a6b85b428e6ce12a8363bb1f08d44c50f3252 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 8 Nov 2012 15:53:35 -0800 Subject: revert "epoll: support for disabling items, and a self-test app" Revert commit 03a7beb55b9f ("epoll: support for disabling items, and a self-test app") pending resolution of the issues identified by Michael Kerrisk, copied below. We'll revisit this for 3.8. : I've taken a look at this patch as it currently stands in 3.7-rc1, and : done a bit of testing. (By the way, the test program : tools/testing/selftests/epoll/test_epoll.c does not compile...) : : There are one or two places where the behavior seems a little strange, : so I have a question or two at the end of this mail. But other than : that, I want to check my understanding so that the interface can be : correctly documented. : : Just to go though my understanding, the problem is the following : scenario in a multithreaded application: : : 1. Multiple threads are performing epoll_wait() operations, : and maintaining a user-space cache that contains information : corresponding to each file descriptor being monitored by : epoll_wait(). : : 2. At some point, a thread wants to delete (EPOLL_CTL_DEL) : a file descriptor from the epoll interest list, and : delete the corresponding record from the user-space cache. : : 3. The problem with (2) is that some other thread may have : previously done an epoll_wait() that retrieved information : about the fd in question, and may be in the middle of using : information in the cache that relates to that fd. Thus, : there is a potential race. : : 4. The race can't solved purely in user space, because doing : so would require applying a mutex across the epoll_wait() : call, which would of course blow thread concurrency. : : Right? : : Your solution is the EPOLL_CTL_DISABLE operation. I want to : confirm my understanding about how to use this flag, since : the description that has accompanied the patches so far : has been a bit sparse : : 0. In the scenario you're concerned about, deleting a file : descriptor means (safely) doing the following: : (a) Deleting the file descriptor from the epoll interest list : using EPOLL_CTL_DEL : (b) Deleting the corresponding record in the user-space cache : : 1. It's only meaningful to use this EPOLL_CTL_DISABLE in : conjunction with EPOLLONESHOT. : : 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in : conjunction is a logical error. : : 3. The correct way to code multithreaded applications using : EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows: : : a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should : should EPOLLONESHOT. : : b. When a thread wants to delete a file descriptor, it : should do the following: : : [1] Call epoll_ctl(EPOLL_CTL_DISABLE) : [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE) : was zero, then the file descriptor can be safely : deleted by the thread that made this call. : [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY, : then the descriptor is in use. In this case, the calling : thread should set a flag in the user-space cache to : indicate that the thread that is using the descriptor : should perform the deletion operation. : : Is all of the above correct? : : The implementation depends on checking on whether : (events & ~EP_PRIVATE_BITS) == 0 : This replies on the fact that EPOLL_CTL_AD and EPOLL_CTL_MOD always : set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT : causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be : cleared. : : A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE : is only useful in conjunction with EPOLLONESHOT. However, as things : stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does : not have EPOLLONESHOT set in 'events' This results in the following : (slightly surprising) behavior: : : (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0 : (the indicator that the file descriptor can be safely deleted). : (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY. : : This doesn't seem particularly useful, and in fact is probably an : indication that the user made a logic error: they should only be using : epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which : EPOLLONESHOT was set in 'events'. If that is correct, then would it : not make sense to return an error to user space for this case? Cc: Michael Kerrisk Cc: "Paton J. Lewis" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index da72250ddc1c..cd96649bfe62 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; + return op != EPOLL_CTL_DEL; } /* Initialize the poll safe wake up structure */ @@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } -/* - * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item - * had no event flags set, indicating that another thread may be currently - * handling that item's events (in the case that EPOLLONESHOT was being - * used). Otherwise a zero result indicates that the item has been disabled - * from receiving events. A disabled item may be re-enabled via - * EPOLL_CTL_MOD. Must be called with "mtx" held. - */ -static int ep_disable(struct eventpoll *ep, struct epitem *epi) -{ - int result = 0; - unsigned long flags; - - spin_lock_irqsave(&ep->lock, flags); - if (epi->event.events & ~EP_PRIVATE_BITS) { - if (ep_is_linked(&epi->rdllink)) - list_del_init(&epi->rdllink); - /* Ensure ep_poll_callback will not add epi back onto ready - list: */ - epi->event.events &= EP_PRIVATE_BITS; - } - else - result = -EBUSY; - spin_unlock_irqrestore(&ep->lock, flags); - - return result; -} - static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; - case EPOLL_CTL_DISABLE: - if (epi) - error = ep_disable(ep, epi); - else - error = -ENOENT; - break; } mutex_unlock(&ep->mtx); -- cgit v1.2.3 From 848561d368751a1c0f679b9f045a02944506a801 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 8 Nov 2012 15:53:37 -0800 Subject: fanotify: fix missing break Anders Blomdell noted in 2010 that Fanotify lost events and provided a test case. Eric Paris confirmed it was a bug and posted a fix to the list https://groups.google.com/forum/?fromgroups=#!topic/linux.kernel/RrJfTfyW2BE but never applied it. Repeated attempts over time to actually get him to apply it have never had a reply from anyone who has raised it So apply it anyway Signed-off-by: Alan Cox Reported-by: Anders Blomdell Cc: Eric Paris Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f35794b97e8e..a50636025364 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; + break; case (FSNOTIFY_EVENT_NONE): return true; default: -- cgit v1.2.3 From 5ffd3412ae5536a4c57469cb8ea31887121dcb2e Mon Sep 17 00:00:00 2001 From: Thomas Betker Date: Wed, 17 Oct 2012 22:59:30 +0200 Subject: jffs2: Fix lock acquisition order bug in jffs2_write_begin jffs2_write_begin() first acquires the page lock, then f->sem. This causes an AB-BA deadlock with jffs2_garbage_collect_live(), which first acquires f->sem, then the page lock: jffs2_garbage_collect_live mutex_lock(&f->sem) (A) jffs2_garbage_collect_dnode jffs2_gc_fetch_page read_cache_page_async do_read_cache_page lock_page(page) (B) jffs2_write_begin grab_cache_page_write_begin find_lock_page lock_page(page) (B) mutex_lock(&f->sem) (A) We fix this by restructuring jffs2_write_begin() to take f->sem before the page lock. However, we make sure that f->sem is not held when calling jffs2_reserve_space(), as this is not permitted by the locking rules. The deadlock above was observed multiple times on an SoC with a dual ARMv7 (Cortex-A9), running the long-term 3.4.11 kernel; it occurred when using scp to copy files from a host system to the ARM target system. The fix was heavily tested on the same target system. Cc: stable@vger.kernel.org Signed-off-by: Thomas Betker Acked-by: Joakim Tjernlund Signed-off-by: Artem Bityutskiy --- fs/jffs2/file.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 60ef3fb707ff..1506673c087e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -138,33 +138,39 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; + uint32_t alloc_len = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; uint32_t pageofs = index << PAGE_CACHE_SHIFT; int ret = 0; + jffs2_dbg(1, "%s()\n", __func__); + + if (pageofs > inode->i_size) { + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + return ret; + } + + mutex_lock(&f->sem); pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) + if (!pg) { + if (alloc_len) + jffs2_complete_reservation(c); + mutex_unlock(&f->sem); return -ENOMEM; + } *pagep = pg; - jffs2_dbg(1, "%s()\n", __func__); - - if (pageofs > inode->i_size) { + if (alloc_len) { /* Make new hole frag from old EOF to new page */ - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; - uint32_t alloc_len; jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", (unsigned int)inode->i_size, pageofs); - ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, - ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); - if (ret) - goto out_page; - - mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -191,7 +197,6 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (IS_ERR(fn)) { ret = PTR_ERR(fn); jffs2_complete_reservation(c); - mutex_unlock(&f->sem); goto out_page; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); @@ -206,12 +211,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); - mutex_unlock(&f->sem); goto out_page; } jffs2_complete_reservation(c); inode->i_size = pageofs; - mutex_unlock(&f->sem); } /* @@ -220,18 +223,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * case of a short-copy. */ if (!PageUptodate(pg)) { - mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); - mutex_unlock(&f->sem); if (ret) goto out_page; } + mutex_unlock(&f->sem); jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); return ret; out_page: unlock_page(pg); page_cache_release(pg); + mutex_unlock(&f->sem); return ret; } -- cgit v1.2.3 From 5a8477660d9ddc090203736d7271137265cb25bb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 12 Nov 2012 01:19:02 -0500 Subject: kill bogus BUG_ON() in do_close_on_exec() It can be legitimately triggered via procfs access. Now, at least 2 of 3 of get_files_struct() callers in procfs are useless, but when and if we get rid of those we can always add WARN_ON() here. BUG_ON() at that spot is simply wrong. Signed-off-by: Al Viro --- fs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index d3b5fa80b71b..331e7d24d9d3 100644 --- a/fs/file.c +++ b/fs/file.c @@ -685,7 +685,6 @@ void do_close_on_exec(struct files_struct *files) struct fdtable *fdt; /* exec unshares first */ - BUG_ON(atomic_read(&files->count) != 1); spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; -- cgit v1.2.3 From 70a6f46d7b0ec03653b9ab3f8063a9717a4a53ef Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 14 Nov 2012 11:49:53 +0000 Subject: pstore: Fix NULL pointer dereference in console writes Passing a NULL id causes a NULL pointer deference in writers such as erst_writer and efi_pstore_write because they expect to update this id. Pass a dummy id instead. This avoids a cascade of oopses caused when the initial pstore_console_write passes a null which in turn causes writes to the console causing further oopses in subsequent pstore_console_write calls. Signed-off-by: Colin Ian King Acked-by: Kees Cook Cc: stable@vger.kernel.org Signed-off-by: Anton Vorontsov --- fs/pstore/platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index a40da07e93d6..947fbe06c3b1 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -161,6 +161,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) while (s < e) { unsigned long flags; + u64 id; if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -172,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) spin_lock_irqsave(&psinfo->buf_lock, flags); } memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; -- cgit v1.2.3 From fa0cbbf145aabbf29c6f28f8a11935c0b0fd86fc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Nov 2012 17:53:04 -0800 Subject: mm, oom: reintroduce /proc/pid/oom_adj This is mostly a revert of 01dc52ebdf47 ("oom: remove deprecated oom_adj") from Davidlohr Bueso. It reintroduces /proc/pid/oom_adj for backwards compatibility with earlier kernels. It simply scales the value linearly when /proc/pid/oom_score_adj is written. The major difference is that its scheduled removal is no longer included in Documentation/feature-removal-schedule.txt. We do warn users with a single printk, though, to suggest the more powerful and supported /proc/pid/oom_score_adj interface. Reported-by: Artem S. Tashkinov Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- fs/proc/base.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd7..3c231adf8450 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,6 +873,113 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; + size_t len; + unsigned long flags; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adj; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + + task->signal->oom_score_adj = oom_adj; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; + static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -2598,6 +2705,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -2964,6 +3072,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), -- cgit v1.2.3 From 42e2976f131d65555d5c1d6c3d47facc63577814 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:44 +1100 Subject: xfs: fix attr tree double split corruption In certain circumstances, a double split of an attribute tree is needed to insert or replace an attribute. In rare situations, this can go wrong, leaving the attribute tree corrupted. In this case, the attr being replaced is the last attr in a leaf node, and the replacement is larger so doesn't fit in the same leaf node. When we have the initial condition of a node format attribute btree with two leaves at index 1 and 2. Call them L1 and L2. The leaf L1 is completely full, there is not a single byte of free space in it. L2 is mostly empty. The attribute being replaced - call it X - is the last attribute in L1. The way an attribute replace is executed is that the replacement attribute - call it Y - is first inserted into the tree, but has an INCOMPLETE flag set on it so that list traversals ignore it. Once this transaction is committed, a second transaction it run to atomically mark Y as COMPLETE and X as INCOMPLETE, so that a traversal will now find Y and skip X. Once that transaction is committed, attribute X is then removed. So, the initial condition is: +--------+ +--------+ | L1 | | L2 | | fwd: 2 |---->| fwd: 0 | | bwd: 0 |<----| bwd: 1 | | fsp: 0 | | fsp: N | |--------| |--------| | attr A | | attr 1 | |--------| |--------| | attr B | | attr 2 | |--------| |--------| .......... .......... |--------| |--------| | attr X | | attr n | +--------+ +--------+ So now we go to replace X, and see that L1:fsp = 0 - it is full so we can't insert Y in the same leaf. So we record the the location of attribute X so we can track it for later use, then we split L1 into L1 and L3 and reblance across the two leafs. We end with: +--------+ +--------+ +--------+ | L1 | | L3 | | L2 | | fwd: 3 |---->| fwd: 2 |---->| fwd: 0 | | bwd: 0 |<----| bwd: 1 |<----| bwd: 3 | | fsp: M | | fsp: J | | fsp: N | |--------| |--------| |--------| | attr A | | attr X | | attr 1 | |--------| +--------+ |--------| | attr B | | attr 2 | |--------| |--------| .......... .......... |--------| |--------| | attr W | | attr n | +--------+ +--------+ And we track that the original attribute is now at L3:0. We then try to insert Y into L1 again, and find that there isn't enough room because the new attribute is larger than the old one. Hence we have to split again to make room for Y. We end up with this: +--------+ +--------+ +--------+ +--------+ | L1 | | L4 | | L3 | | L2 | | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 | | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 | | fsp: M | | fsp: J | | fsp: J | | fsp: N | |--------| |--------| |--------| |--------| | attr A | | attr Y | | attr X | | attr 1 | |--------| + INCOMP + +--------+ |--------| | attr B | +--------+ | attr 2 | |--------| |--------| .......... .......... |--------| |--------| | attr W | | attr n | +--------+ +--------+ And now we have the new (incomplete) attribute @ L4:0, and the original attribute at L3:0. At this point, the first transaction is committed, and we move to the flipping of the flags. This is where we are supposed to end up with this: +--------+ +--------+ +--------+ +--------+ | L1 | | L4 | | L3 | | L2 | | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 | | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 | | fsp: M | | fsp: J | | fsp: J | | fsp: N | |--------| |--------| |--------| |--------| | attr A | | attr Y | | attr X | | attr 1 | |--------| +--------+ + INCOMP + |--------| | attr B | +--------+ | attr 2 | |--------| |--------| .......... .......... |--------| |--------| | attr W | | attr n | +--------+ +--------+ But that doesn't happen properly - the attribute tracking indexes are not pointing to the right locations. What we end up with is both the old attribute to be removed pointing at L4:0 and the new attribute at L4:1. On a debug kernel, this assert fails like so: XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725 because the new attribute location does not exist. On a production kernel, this goes unnoticed and the code proceeds ahead merrily and removes L4 because it thinks that is the block that is no longer needed. This leaves the hash index node pointing to entries L1, L4 and L2, but only blocks L1, L3 and L2 to exist. Further, the leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free space, and so everything is busted. This corruption is caused by the removal of the old attribute triggering a join - it joins everything correctly but then frees the wrong block. xfs_repair will report something like: bad sibling back pointer for block 4 in attribute fork for inode 131 problem with attribute contents in inode 131 would clear attr fork bad nblocks 8 for inode 131, would reset to 3 bad anextents 4 for inode 131, would reset to 0 The problem lies in the assignment of the old/new blocks for tracking purposes when the double leaf split occurs. The first split tries to place the new attribute inside the current leaf (i.e. "inleaf == true") and moves the old attribute (X) to the new block. This sets up the old block/index to L1:X, and newly allocated block to L3:0. It then moves attr X to the new block and tries to insert attr Y at the old index. That fails, so it splits again. With the second split, the rebalance ends up placing the new attr in the second new block - L4:0 - and this is where the code goes wrong. What is does is it sets both the new and old block index to the second new block. Hence it inserts attr Y at the right place (L4:0) but overwrites the current location of the attr to replace that is held in the new block index (currently L3:0). It over writes it with L4:1 - the index we later assert fail on. Hopefully this table will show this in a foramt that is a bit easier to understand: Split old attr index new attr index vanilla patched vanilla patched before 1st L1:26 L1:26 N/A N/A after 1st L3:0 L3:0 L1:26 L1:26 after 2nd L4:0 L3:0 L4:1 L4:0 ^^^^ ^^^^ wrong wrong The fix is surprisingly simple, for all this analysis - just stop the rebalance on the out-of leaf case from overwriting the new attr index - it's already correct for the double split case. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr_leaf.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d330111ca738..70eec1829776 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1291,6 +1291,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf2->hdr.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1361,6 +1362,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ + ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. @@ -1422,10 +1424,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, args->index2 = 0; args->blkno2 = blk2->blkno; } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it overwise we corrupt the tree. + */ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - args->index = args->index2 = blk2->index; - args->blkno = args->blkno2 = blk2->blkno; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. + */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } } } else { ASSERT(state->inleaf == 1); -- cgit v1.2.3 From 3daed8bc3e49b9695ae931b9f472b5b90d1965b3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:45 +1100 Subject: xfs: fix broken error handling in xfs_vm_writepage When we shut down the filesystem, it might first be detected in writeback when we are allocating a inode size transaction. This happens after we have moved all the pages into the writeback state and unlocked them. Unfortunately, if we fail to set up the transaction we then abort writeback and try to invalidate the current page. This then triggers are BUG() in block_invalidatepage() because we are trying to invalidate an unlocked page. Fixing this is a bit of a chicken and egg problem - we can't allocate the transaction until we've clustered all the pages into the IO and we know the size of it (i.e. whether the last block of the IO is beyond the current EOF or not). However, we don't want to hold pages locked for long periods of time, especially while we lock other pages to cluster them into the write. To fix this, we need to make a clear delineation in writeback where errors can only be handled by IO completion processing. That is, once we have marked a page for writeback and unlocked it, we have to report errors via IO completion because we've already started the IO. We may not have submitted any IO, but we've changed the page state to indicate that it is under IO so we must now use the IO completion path to report errors. To do this, add an error field to xfs_submit_ioend() to pass it the error that occurred during the building on the ioend chain. When this is non-zero, mark each ioend with the error and call xfs_finish_ioend() directly rather than building bios. This will immediately push the ioends through completion processing with the error that has occurred. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e562dd43f41f..e57e2daa357c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -481,11 +481,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * * The fix is two passes across the ioend list - one to start writeback on the * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. */ STATIC void xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int fail) { xfs_ioend_t *head = ioend; xfs_ioend_t *next; @@ -506,6 +512,18 @@ xfs_submit_ioend( next = ioend->io_list; bio = NULL; + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { if (!bio) { @@ -1060,7 +1078,18 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); - if (ioend && imap_valid) { + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need tobe reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { xfs_off_t end_index; end_index = imap.br_startoff + imap.br_blockcount; @@ -1079,20 +1108,15 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) { - /* - * Reserve log space if we might write beyond the on-disk - * inode size. - */ - if (ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) { - err = xfs_setfilesize_trans_alloc(ioend); - if (err) - goto error; - } - xfs_submit_ioend(wbc, iohead); - } + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); return 0; -- cgit v1.2.3 From d69043c42d8c6414fa28ad18d99973aa6c1c2e24 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:46 +1100 Subject: xfs: drop buffer io reference when a bad bio is built Error handling in xfs_buf_ioapply_map() does not handle IO reference counts correctly. We increment the b_io_remaining count before building the bio, but then fail to decrement it in the failure case. This leads to the buffer never running IO completion and releasing the reference that the IO holds, so at unmount we can leak the buffer. This leak is captured by this assert failure during unmount: XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/xfs_mount.c, line: 273 This is not a new bug - the b_io_remaining accounting has had this problem for a long, long time - it's just very hard to get a zero length bio being built by this code... Further, the buffer IO error can be overwritten on a multi-segment buffer by subsequent bio completions for partial sections of the buffer. Hence we should only set the buffer error status if the buffer is not already carrying an error status. This ensures that a partial IO error on a multi-segment buffer will not be lost. This part of the problem is a regression, however. cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 933b7930b863..4b0b8dd1b7b0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1197,9 +1197,14 @@ xfs_buf_bio_end_io( { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - xfs_buf_ioerror(bp, -error); + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. + */ + if (!bp->b_error) + xfs_buf_ioerror(bp, -error); - if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); _xfs_buf_ioend(bp, 1); @@ -1279,6 +1284,11 @@ next_chunk: if (size) goto next_chunk; } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, EIO); bio_put(bio); } -- cgit v1.2.3 From 3587b1b097d70c2eb9fee95ea7995d13c05f66e5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 18 Nov 2012 19:19:00 +0000 Subject: fanotify: fix FAN_Q_OVERFLOW case of fanotify_read() If the FAN_Q_OVERFLOW bit set in event->mask, the fanotify event metadata will not contain a valid file descriptor, but copy_event_to_user() didn't check for that, and unconditionally does a fd_install() on the file descriptor. Which in turn will cause a BUG_ON() in __fd_install(). Introduced by commit 352e3b249284 ("fanotify: sanitize failure exits in copy_event_to_user()") Mea culpa - missed that path ;-/ Reported-by: Alex Shi Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 721d692fa8d4..6fcaeb8c902e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -258,7 +258,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (ret) goto out_close_fd; - fd_install(fd, f); + if (fd != FAN_NOFD) + fd_install(fd, f); return fanotify_event_metadata.event_len; out_close_fd: -- cgit v1.2.3 From 3bb3e1fc47aca554e7e2cc4deeddc24750987ac2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 Nov 2012 14:55:52 +0100 Subject: reiserfs: Fix lock ordering during remount When remounting reiserfs dquot_suspend() or dquot_resume() can be called. These functions take dqonoff_mutex which ranks above write lock so we have to drop it before calling into quota code. CC: stable@vger.kernel.org # >= 3.0 Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1078ae179993..5372980ec458 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1335,7 +1335,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) kfree(qf_names[i]); #endif err = -EINVAL; - goto out_err; + goto out_unlock; } #ifdef CONFIG_QUOTA handle_quota_files(s, qf_names, &qfmt); @@ -1379,7 +1379,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (blocks) { err = reiserfs_resize(s, blocks); if (err != 0) - goto out_err; + goto out_unlock; } if (*mount_flags & MS_RDONLY) { @@ -1389,9 +1389,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* it is read-only already */ goto out_ok; + /* + * Drop write lock. Quota will retake it when needed and lock + * ordering requires calling dquot_suspend() without it. + */ + reiserfs_write_unlock(s); err = dquot_suspend(s, -1); if (err < 0) goto out_err; + reiserfs_write_lock(s); /* try to remount file system with read-only permissions */ if (sb_umount_state(rs) == REISERFS_VALID_FS @@ -1401,7 +1407,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) err = journal_begin(&th, s, 10); if (err) - goto out_err; + goto out_unlock; /* Mounting a rw partition read-only. */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1416,7 +1422,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (reiserfs_is_journal_aborted(journal)) { err = journal->j_errno; - goto out_err; + goto out_unlock; } handle_data_mode(s, mount_options); @@ -1425,7 +1431,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ err = journal_begin(&th, s, 10); if (err) - goto out_err; + goto out_unlock; /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1442,10 +1448,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) SB_JOURNAL(s)->j_must_wait = 1; err = journal_end(&th, s, 10); if (err) - goto out_err; + goto out_unlock; if (!(*mount_flags & MS_RDONLY)) { + /* + * Drop write lock. Quota will retake it when needed and lock + * ordering requires calling dquot_resume() without it. + */ + reiserfs_write_unlock(s); dquot_resume(s, -1); + reiserfs_write_lock(s); finish_unfinished(s); reiserfs_xattr_init(s, *mount_flags); } @@ -1455,9 +1467,10 @@ out_ok: reiserfs_write_unlock(s); return 0; +out_unlock: + reiserfs_write_unlock(s); out_err: kfree(new_opts); - reiserfs_write_unlock(s); return err; } -- cgit v1.2.3 From b9e06ef2e8706fe669b51f4364e3aeed58639eb2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 Nov 2012 16:34:17 +0100 Subject: reiserfs: Protect reiserfs_quota_on() with write lock In reiserfs_quota_on() we do quite some work - for example unpacking tail of a quota file. Thus we have to hold write lock until a moment we call back into the quota code. CC: stable@vger.kernel.org # >= 3.0 Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5372980ec458..e59d6ddcc69f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2216,8 +2216,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, struct reiserfs_transaction_handle th; int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; - if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) - return -EINVAL; + reiserfs_write_lock(sb); + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) { + err = -EINVAL; + goto out; + } /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) { @@ -2259,8 +2262,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = dquot_quota_on(sb, type, format_id, path); + reiserfs_write_unlock(sb); + return dquot_quota_on(sb, type, format_id, path); out: + reiserfs_write_unlock(sb); return err; } -- cgit v1.2.3 From 361d94a338a3fd0cee6a4ea32bbc427ba228e628 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 Nov 2012 18:25:38 +0100 Subject: reiserfs: Protect reiserfs_quota_write() with write lock Calls into reiserfs journalling code and reiserfs_get_block() need to be protected with write lock. We remove write lock around calls to high level quota code in the next patch so these paths would suddently become unprotected. CC: stable@vger.kernel.org # >= 3.0 Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e59d6ddcc69f..c101704ece48 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2338,7 +2338,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + reiserfs_write_lock(sb); err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + reiserfs_write_unlock(sb); if (err) goto out; if (offset || tocopy != sb->s_blocksize) @@ -2354,10 +2356,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, flush_dcache_page(bh->b_page); set_buffer_uptodate(bh); unlock_buffer(bh); + reiserfs_write_lock(sb); reiserfs_prepare_for_journal(sb, bh, 1); journal_mark_dirty(current->journal_info, sb, bh); if (!journal_quota) reiserfs_add_ordered_list(inode, bh); + reiserfs_write_unlock(sb); brelse(bh); offset = 0; towrite -= tocopy; -- cgit v1.2.3 From 7af11686933726e99af22901d622f9e161404e6b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 Nov 2012 17:05:14 +0100 Subject: reiserfs: Move quota calls out of write lock Calls into highlevel quota code cannot happen under the write lock. These calls take dqio_mutex which ranks above write lock. So drop write lock before calling back into quota code. CC: stable@vger.kernel.org # >= 3.0 Signed-off-by: Jan Kara --- fs/reiserfs/inode.c | 10 +++++++--- fs/reiserfs/stree.c | 4 ++++ fs/reiserfs/super.c | 18 ++++++++++++++---- 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f27f01a98aa2..d83736fbc26c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - dquot_initialize(inode); + reiserfs_write_unlock(inode->i_sb); err = dquot_alloc_inode(inode); + reiserfs_write_lock(inode->i_sb); if (err) goto out_end_trans; if (!dir->i_nlink) { @@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); /* Drop can be outside and it needs more credits so it's better to have it outside */ dquot_drop(inode); + reiserfs_write_lock(inode->i_sb); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); @@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - depth = reiserfs_write_lock_once(inode->i_sb); if (is_quota_modification(inode, attr)) dquot_initialize(inode); - + depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate @@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) error = journal_begin(&th, inode->i_sb, jbegin_count); if (error) goto out; + reiserfs_write_unlock_once(inode->i_sb, depth); error = dquot_transfer(inode, attr); + depth = reiserfs_write_lock_once(inode->i_sb); if (error) { journal_end(&th, inode->i_sb, jbegin_count); goto out; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index f8afa4b162b8..2f40a4c70a4d 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif + reiserfs_write_unlock(inode->i_sb); retval = dquot_alloc_space_nodirty(inode, pasted_size); + reiserfs_write_lock(inode->i_sb); if (retval) { pathrelse(search_path); return retval; @@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif + reiserfs_write_unlock(inode->i_sb); /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ retval = dquot_alloc_space_nodirty(inode, quota_bytes); + reiserfs_write_lock(inode->i_sb); if (retval) { pathrelse(path); return retval; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c101704ece48..418bdc3a57da 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } + reiserfs_write_unlock(s); dquot_initialize(inode); + reiserfs_write_lock(s); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. @@ -2108,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot) REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (ret) goto out; + reiserfs_write_unlock(dquot->dq_sb); ret = dquot_commit(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(dquot->dq_sb); return ret; } @@ -2130,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (ret) goto out; + reiserfs_write_unlock(dquot->dq_sb); ret = dquot_acquire(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(dquot->dq_sb); return ret; } @@ -2150,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot) ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + reiserfs_write_unlock(dquot->dq_sb); if (ret) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); goto out; } ret = dquot_release(dquot); + reiserfs_write_lock(dquot->dq_sb); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (!ret && err) ret = err; - out: reiserfs_write_unlock(dquot->dq_sb); +out: return ret; } @@ -2187,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type) ret = journal_begin(&th, sb, 2); if (ret) goto out; + reiserfs_write_unlock(sb); ret = dquot_commit_info(sb, type); + reiserfs_write_lock(sb); err = journal_end(&th, sb, 2); if (!ret && err) ret = err; - out: +out: reiserfs_write_unlock(sb); return ret; } -- cgit v1.2.3 From ae49eeec785025373e28dc24c8351c6bba688d99 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 11 Oct 2012 12:28:38 +0200 Subject: ext3: Avoid underflow of in ext3_trim_fs() Currently if len argument in ext3_trim_fs() is smaller than one block, the 'end' variable underflow. Avoid that by returning EINVAL if len is smaller than file system block. Also remove useless unlikely(). Signed-off-by: Lukas Czerner Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 7320a66e958f..22548f56197b 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) || - unlikely(start >= max_blks)) + if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) return -EINVAL; if (end >= max_blks) end = max_blks - 1; -- cgit v1.2.3 From 25389bb207987b5774182f763b9fb65ff08761c8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 23 Nov 2012 14:03:04 +0100 Subject: jbd: Fix lock ordering bug in journal_unmap_buffer() Commit 09e05d48 introduced a wait for transaction commit into journal_unmap_buffer() in the case we are truncating a buffer undergoing commit in the page stradding i_size on a filesystem with blocksize < pagesize. Sadly we forgot to drop buffer lock before waiting for transaction commit and thus deadlock is possible when kjournald wants to lock the buffer. Fix the problem by dropping the buffer lock before waiting for transaction commit. Since we are still holding page lock (and that is OK), buffer cannot disappear under us. CC: stable@vger.kernel.org # Wherever commit 09e05d48 was taken Signed-off-by: Jan Kara --- fs/jbd/transaction.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 78b7f84241d4..7f5120bf0ec2 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1961,7 +1961,9 @@ retry: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); spin_unlock(&journal->j_state_lock); + unlock_buffer(bh); log_wait_commit(journal, tid); + lock_buffer(bh); goto retry; } /* -- cgit v1.2.3 From 05f564849d49499ced97913a0914b5950577d07d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 16:29:42 -0800 Subject: proc: check vma->vm_file before dereferencing Commit 7b540d0646ce ("proc_map_files_readdir(): don't bother with grabbing files") switched proc_map_files_readdir() to use @f_mode directly instead of grabbing @file reference, but same time the test for @vm_file presence was lost leading to nil dereference. The patch brings the test back. The all proc_map_files feature is CONFIG_CHECKPOINT_RESTORE wrapped (which is set to 'n' by default) so the bug doesn't affect regular kernels. The regression is 3.7-rc1 only as far as I can tell. [gorcunov@openvz.org: provided changelog] Signed-off-by: Stanislav Kinsbursky Acked-by: Cyrill Gorcunov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3c231adf8450..9e28356a959a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1877,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!vma) goto out_no_vma; - result = proc_map_files_instantiate(dir, dentry, task, - (void *)(unsigned long)vma->vm_file->f_mode); + if (vma->vm_file) + result = proc_map_files_instantiate(dir, dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: up_read(&mm->mmap_sem); -- cgit v1.2.3 From 4eff96dd5283a102e0c1cac95247090be74a38ed Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Nov 2012 16:29:51 -0800 Subject: writeback: put unused inodes to LRU after writeback completion Commit 169ebd90131b ("writeback: Avoid iput() from flusher thread") removed iget-iput pair from inode writeback. As a side effect, inodes that are dirty during iput_final() call won't be ever added to inode LRU (iput_final() doesn't add dirty inodes to LRU and later when the inode is cleaned there's noone to add the inode there). Thus inodes are effectively unreclaimable until someone looks them up again. The practical effect of this bug is limited by the fact that inodes are pinned by a dentry for long enough that the inode gets cleaned. But still the bug can have nasty consequences leading up to OOM conditions under certain circumstances. Following can easily reproduce the problem: for (( i = 0; i < 1000; i++ )); do mkdir $i for (( j = 0; j < 1000; j++ )); do touch $i/$j echo 2 > /proc/sys/vm/drop_caches done done then one needs to run 'sync; ls -lR' to make inodes reclaimable again. We fix the issue by inserting unused clean inodes into the LRU after writeback finishes in inode_sync_complete(). Signed-off-by: Jan Kara Reported-by: OGAWA Hirofumi Cc: Al Viro Cc: OGAWA Hirofumi Cc: Wu Fengguang Cc: Dave Chinner Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 ++ fs/inode.c | 16 ++++++++++++++-- fs/internal.h | 1 + 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 51ea267d444c..3e3422f7f0a4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) static void inode_sync_complete(struct inode *inode) { inode->i_state &= ~I_SYNC; + /* If inode is clean an unused, put it into LRU now... */ + inode_add_lru(inode); /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); diff --git a/fs/inode.c b/fs/inode.c index b03c71957246..64999f144153 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode) spin_unlock(&inode->i_sb->s_inode_lru_lock); } +/* + * Add inode to LRU if needed (inode is unused and clean). + * + * Needs inode->i_lock held. + */ +void inode_add_lru(struct inode *inode) +{ + if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + inode_lru_list_add(inode); +} + + static void inode_lru_list_del(struct inode *inode) { spin_lock(&inode->i_sb->s_inode_lru_lock); @@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode) if (!drop && (sb->s_flags & MS_ACTIVE)) { inode->i_state |= I_REFERENCED; - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - inode_lru_list_add(inode); + inode_add_lru(inode); spin_unlock(&inode->i_lock); return; } diff --git a/fs/internal.h b/fs/internal.h index 916b7cbf3e3e..2f6af7f645eb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; +extern void inode_add_lru(struct inode *inode); /* * fs-writeback.c -- cgit v1.2.3 From 3a98b8614312026d489e56c1d0e294a68e2aad77 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Nov 2012 09:48:41 -0500 Subject: cifs: fix writeback race with file that is growing Commit eddb079deb4 created a regression in the writepages codepath. Previously, whenever it needed to check the size of the file, it did so by consulting the inode->i_size field directly. With that patch, the i_size was fetched once on entry into the writepages code and that value was used henceforth. If the file is changing size though (for instance, if someone is writing to it or has truncated it), then that value is likely to be wrong. This can lead to data corruption. Pages past the EOF at the time that the writepages call was issued may be silently dropped and ignored because cifs_writepages wrongly assumes that the file must have been truncated in the interim. Fix cifs_writepages to properly fetch the size from the inode->i_size field instead to properly account for this possibility. Original bug report is here: https://bugzilla.kernel.org/show_bug.cgi?id=50991 Reported-and-Tested-by: Maxim Britov Reviewed-by: Suresh Jayaraman Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index edb25b4bbb95..70b6f4c3a0c1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1794,7 +1794,6 @@ static int cifs_writepages(struct address_space *mapping, struct TCP_Server_Info *server; struct page *page; int rc = 0; - loff_t isize = i_size_read(mapping->host); /* * If wsize is smaller than the page cache size, default to writing @@ -1899,7 +1898,7 @@ retry: */ set_page_writeback(page); - if (page_offset(page) >= isize) { + if (page_offset(page) >= i_size_read(mapping->host)) { done = true; unlock_page(page); end_page_writeback(page); @@ -1932,7 +1931,8 @@ retry: wdata->offset = page_offset(wdata->pages[0]); wdata->pagesz = PAGE_CACHE_SIZE; wdata->tailsz = - min(isize - page_offset(wdata->pages[nr_pages - 1]), + min(i_size_read(mapping->host) - + page_offset(wdata->pages[nr_pages - 1]), (loff_t)PAGE_CACHE_SIZE); wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; -- cgit v1.2.3 From c772aa92b6deb2857d4b39a5cc3bd3679cc5f4a6 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 28 Nov 2012 15:27:54 +0400 Subject: CIFS: Fix wrong buffer pointer usage in smb_set_file_info Commit 6bdf6dbd662176c0da5c3ac8ed10ac94e7776c85 caused a regression in setattr codepath that leads to files with wrong attributes. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/smb1ops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 56cc4be87807..34cea2798333 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -766,7 +766,6 @@ smb_set_file_info(struct inode *inode, const char *full_path, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; - FILE_BASIC_INFO info_buf; /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, true); @@ -817,7 +816,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, netpid = current->tgid; set_via_filehandle: - rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid); + rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); if (!rc) cinode->cifsAttrs = le32_to_cpu(buf->Attributes); -- cgit v1.2.3 From 45bce8f3e3436bbe2e03dd2b076abdce79ffabb7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 10:21:43 -0800 Subject: fs/buffer.c: make block-size be per-page and protected by the page lock This makes the buffer size handling be a per-page thing, which allows us to not have to worry about locking too much when changing the buffer size. If a page doesn't have buffers, we still need to read the block size from the inode, but we can do that with ACCESS_ONCE(), so that even if the size is changing, we get a consistent value. This doesn't convert all functions - many of the buffer functions are used purely by filesystems, which in turn results in the buffer size being fixed at mount-time. So they don't have the same consistency issues that the raw device access can have. Signed-off-by: Linus Torvalds --- fs/buffer.c | 79 +++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index b5f044283edb..28a74ff5324b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1552,6 +1552,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block) } EXPORT_SYMBOL(unmap_underlying_metadata); +/* + * Size is a power-of-two in the range 512..PAGE_SIZE, + * and the case we care about most is PAGE_SIZE. + * + * So this *could* possibly be written with those + * constraints in mind (relevant mostly if some + * architecture has a slow bit-scan instruction) + */ +static inline int block_size_bits(unsigned int blocksize) +{ + return ilog2(blocksize); +} + +static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state) +{ + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + return page_buffers(page); +} + /* * NOTE! All mapped/uptodate combinations are valid: * @@ -1589,19 +1611,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, sector_t block; sector_t last_block; struct buffer_head *bh, *head; - const unsigned blocksize = 1 << inode->i_blkbits; + unsigned int blocksize, bbits; int nr_underway = 0; int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); - BUG_ON(!PageLocked(page)); - - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, + head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } /* * Be very careful. We have no exclusion from __set_page_dirty_buffers @@ -1613,9 +1629,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * handle that here by just cleaning them. */ - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - head = page_buffers(page); bh = head; + blocksize = bh->b_size; + bbits = block_size_bits(blocksize); + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + last_block = (i_size_read(inode) - 1) >> bbits; /* * Get all the dirty buffers mapped to disk addresses and @@ -1806,12 +1825,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, BUG_ON(to > PAGE_CACHE_SIZE); BUG_ON(from > to); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); - bbits = inode->i_blkbits; block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; @@ -1881,11 +1898,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, unsigned blocksize; struct buffer_head *bh, *head; - blocksize = 1 << inode->i_blkbits; + bh = head = page_buffers(page); + blocksize = bh->b_size; - for(bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { + block_start = 0; + do { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) @@ -1895,7 +1912,10 @@ static int __block_commit_write(struct inode *inode, struct page *page, mark_buffer_dirty(bh); } clear_buffer_new(bh); - } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); /* * If this is a partial write which happened to make all buffers @@ -2020,7 +2040,6 @@ EXPORT_SYMBOL(generic_write_end); int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, unsigned long from) { - struct inode *inode = page->mapping->host; unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; @@ -2029,13 +2048,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, if (!page_has_buffers(page)) return 0; - blocksize = 1 << inode->i_blkbits; + head = page_buffers(page); + blocksize = head->b_size; to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; - head = page_buffers(page); bh = head; block_start = 0; do { @@ -2068,18 +2087,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block) struct inode *inode = page->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - unsigned int blocksize; + unsigned int blocksize, bbits; int nr, i; int fully_mapped = 1; - BUG_ON(!PageLocked(page)); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = create_page_buffers(page, inode, 0); + blocksize = head->b_size; + bbits = block_size_bits(blocksize); - iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + lblock = (i_size_read(inode)+blocksize-1) >> bbits; bh = head; nr = 0; i = 0; -- cgit v1.2.3 From 1e8b33328a5407b447ff80953655a47014a6dcb9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 10:49:50 -0800 Subject: blockdev: remove bd_block_size_semaphore again This reverts the block-device direct access code to the previous unlocked code, now that fs/buffer.c no longer needs external locking. With this, fs/block_dev.c is back to the original version, apart from a whitespace cleanup that I didn't want to revert. Signed-off-by: Linus Torvalds --- fs/block_dev.c | 105 +++------------------------------------------------------ 1 file changed, 4 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a1e5e3b1eaf..47a949d8a07e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -116,8 +116,6 @@ EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { - struct address_space *mapping; - /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; @@ -126,19 +124,6 @@ int set_blocksize(struct block_device *bdev, int size) if (size < bdev_logical_block_size(bdev)) return -EINVAL; - /* Prevent starting I/O or mapping the device */ - percpu_down_write(&bdev->bd_block_size_semaphore); - - /* Check that the block device is not memory mapped */ - mapping = bdev->bd_inode->i_mapping; - mutex_lock(&mapping->i_mmap_mutex); - if (mapping_mapped(mapping)) { - mutex_unlock(&mapping->i_mmap_mutex); - percpu_up_write(&bdev->bd_block_size_semaphore); - return -EBUSY; - } - mutex_unlock(&mapping->i_mmap_mutex); - /* Don't change the size if it is same as current */ if (bdev->bd_block_size != size) { sync_blockdev(bdev); @@ -146,9 +131,6 @@ int set_blocksize(struct block_device *bdev, int size) bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } - - percpu_up_write(&bdev->bd_block_size_semaphore); - return 0; } @@ -459,12 +441,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; - - if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) { - kmem_cache_free(bdev_cachep, ei); - return NULL; - } - return &ei->vfs_inode; } @@ -473,8 +449,6 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); - percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore); - kmem_cache_free(bdev_cachep, bdi); } @@ -1593,22 +1567,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } -ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} -EXPORT_SYMBOL_GPL(blkdev_aio_read); - /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -1620,16 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(file->f_mapping->host); struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); - - percpu_down_read(&bdev->bd_block_size_semaphore); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; @@ -1638,62 +1592,11 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } - - percpu_up_read(&bdev->bd_block_size_semaphore); - blk_finish_plug(&plug); - return ret; } EXPORT_SYMBOL_GPL(blkdev_aio_write); -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) -{ - int ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_mmap(file, vma); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - -static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_splice_read(file, ppos, pipe, len, flags); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - -static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe, - struct file *file, loff_t *ppos, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - - percpu_down_read(&bdev->bd_block_size_semaphore); - - ret = generic_file_splice_write(pipe, file, ppos, len, flags); - - percpu_up_read(&bdev->bd_block_size_semaphore); - - return ret; -} - - /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1724,16 +1627,16 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read = do_sync_read, .write = do_sync_write, - .aio_read = blkdev_aio_read, + .aio_read = generic_file_aio_read, .aio_write = blkdev_aio_write, - .mmap = blkdev_mmap, + .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif - .splice_read = blkdev_splice_read, - .splice_write = blkdev_splice_write, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -- cgit v1.2.3 From ab73857e354ab9e317613cba7db714e2c12c6547 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 12:27:00 -0800 Subject: direct-io: don't read inode->i_blkbits multiple times Since directio can work on a raw block device, and the block size of the device can change under it, we need to do the same thing that fs/buffer.c now does: read the block size a single time, using ACCESS_ONCE(). Reading it multiple times can get different results, which will then confuse the code because it actually encodes the i_blksize in relationship to the underlying logical blocksize. Signed-off-by: Linus Torvalds --- fs/direct-io.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index f86c720dba0e..cf5b44b10c67 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; + unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; /* * If there was a memory error and we've overwritten all the @@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; - map_bh->b_size = fs_count << dio->inode->i_blkbits; + map_bh->b_size = fs_count << i_blkbits; /* * For writes inside i_size on a DIO_SKIP_HOLES filesystem we @@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int seg; size_t size; unsigned long addr; - unsigned blkbits = inode->i_blkbits; + unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->inode = inode; dio->rw = rw; sdio.blkbits = blkbits; - sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; sdio.get_block = get_block; -- cgit v1.2.3 From bbec0270bdd887f96377065ee38b8848b5afa395 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Nov 2012 12:31:52 -0800 Subject: blkdev_max_block: make private to fs/buffer.c We really don't want to look at the block size for the raw block device accesses in fs/block-dev.c, because it may be changing from under us. So get rid of the max_block logic entirely, since the caller should already have done it anyway. That leaves the only user of this function in fs/buffer.c, so move the whole function there and make it static. Signed-off-by: Linus Torvalds --- fs/block_dev.c | 55 +------------------------------------------------------ fs/buffer.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 47a949d8a07e..a1e09b4fe1ba 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -sector_t blkdev_max_block(struct block_device *bdev) -{ - sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); - - if (sz) { - unsigned int size = block_size(bdev); - unsigned int sizebits = blksize_bits(size); - retval = (sz >> sizebits); - } - return retval; -} - /* Kill _all_ buffers and pagecache , dirty or not.. */ void kill_bdev(struct block_device *bdev) { @@ -163,52 +150,12 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= blkdev_max_block(I_BDEV(inode))) { - if (create) - return -EIO; - - /* - * for reads, we're just trying to fill a partial page. - * return a hole, they will have to call get_block again - * before they can fill it, and they will get -EIO at that - * time - */ - return 0; - } bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } -static int -blkdev_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - sector_t end_block = blkdev_max_block(I_BDEV(inode)); - unsigned long max_blocks = bh->b_size >> inode->i_blkbits; - - if ((iblock + max_blocks) > end_block) { - max_blocks = end_block - iblock; - if ((long)max_blocks <= 0) { - if (create) - return -EIO; /* write fully beyond EOF */ - /* - * It is a read which is fully beyond EOF. We return - * a !buffer_mapped buffer - */ - max_blocks = 0; - } - } - - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - bh->b_size = max_blocks << inode->i_blkbits; - if (max_blocks) - set_buffer_mapped(bh); - return 0; -} - static ssize_t blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -217,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct inode *inode = file->f_mapping->host; return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_blocks, NULL, NULL, 0); + nr_segs, blkdev_get_block, NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) diff --git a/fs/buffer.c b/fs/buffer.c index 28a74ff5324b..3586fb05c8ce 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -911,6 +911,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head) attach_page_buffers(page, head); } +static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) +{ + sector_t retval = ~((sector_t)0); + loff_t sz = i_size_read(bdev->bd_inode); + + if (sz) { + unsigned int sizebits = blksize_bits(size); + retval = (sz >> sizebits); + } + return retval; +} + /* * Initialise the state of a blockdev page's buffers. */ @@ -921,7 +933,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); - sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size); do { if (!buffer_mapped(bh)) { -- cgit v1.2.3 From 696199f8ccf7fc6d17ef89c296ad3b6c78c52d9c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:00:51 -0500 Subject: don't do blind d_drop() in nfs_prime_dcache() Signed-off-by: Al Viro --- fs/nfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce8cb926526b..99489cfca24d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) nfs_refresh_inode(dentry->d_inode, entry->fattr); goto out; } else { - d_drop(dentry); + if (d_invalidate(dentry) != 0) + goto out; dput(dentry); } } -- cgit v1.2.3 From c44600c9d1de64314c2bd58103f15acb53e10073 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:04:36 -0500 Subject: nfs_lookup_revalidate(): fix a leak We are leaking fattr and fhandle if we decide that dentry is not to be invalidated, after all (e.g. happens to be a mountpoint). Just free both before that... Signed-off-by: Al Viro --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 99489cfca24d..b9e66b7e0c14 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1101,6 +1101,8 @@ out_set_verifier: out_zap_parent: nfs_zap_caches(dir); out_bad: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); nfs_mark_for_revalidate(dir); if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ @@ -1113,8 +1115,6 @@ out_zap_parent: shrink_dcache_parent(dentry); } d_drop(dentry); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", __func__, dentry->d_parent->d_name.name, -- cgit v1.2.3 From 0903a0c8491c1e987dfc6eb294199a36760398bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:11:06 -0500 Subject: cifs: get rid of blind d_drop() in readdir Signed-off-by: Al Viro --- fs/cifs/readdir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index f9b5d3d6cf33..1c576e871366 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -86,14 +86,17 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, dentry = d_lookup(parent, name); if (dentry) { + int err; inode = dentry->d_inode; /* update inode in place if i_ino didn't change */ if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { cifs_fattr_to_inode(inode, fattr); return dentry; } - d_drop(dentry); + err = d_invalidate(dentry); dput(dentry); + if (err) + return NULL; } dentry = d_alloc(parent, name); -- cgit v1.2.3 From 21d8a15ac333b05f1fecdf9fdc30996be2e11d60 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:17:21 -0500 Subject: lookup_one_len: don't accept . and .. Signed-off-by: Al Viro --- fs/namei.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 937f9d50c84b..5f4cdf3ad913 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (!len) return ERR_PTR(-EACCES); + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return ERR_PTR(-EACCES); + } + while (len--) { c = *(const unsigned char *)name++; if (c == '/' || c == '\0') -- cgit v1.2.3 From a77cfcb429ed98845a4e4df72473b8f37acd890b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Nov 2012 22:57:33 -0500 Subject: fix off-by-one in argument passed by iterate_fd() to callbacks Noticed by Pavel Roskin; the thing in his patch I disagree with was compensating for that shite in callbacks instead of fixing it once in the iterator itself. Signed-off-by: Al Viro --- fs/file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 7cb71b992603..eff23162485f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -994,16 +994,18 @@ int iterate_fd(struct files_struct *files, unsigned n, const void *p) { struct fdtable *fdt; - struct file *file; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - while (!res && n < fdt->max_fds) { - file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); - if (file) - res = f(p, file, n); + for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { + struct file *file; + file = rcu_dereference_check_fdtable(files, fdt->fd[n]); + if (!file) + continue; + res = f(p, file, n); + if (res) + break; } spin_unlock(&files->file_lock); return res; -- cgit v1.2.3 From 57302e0ddf8a210a66fd8a1a2fa50844863b5ded Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 Dec 2012 08:25:11 -0800 Subject: vfs: avoid "attempt to access beyond end of device" warnings The block device access simplification that avoided accessing the (racy) block size information (commit bbec0270bdd8: "blkdev_max_block: make private to fs/buffer.c") no longer checks the maximum block size in the block mapping path. That was _almost_ as simple as just removing the code entirely, because the readers and writers all check the size of the device anyway, so under normal circumstances it "just worked". However, the block size may be such that the end of the device may straddle one single buffer_head. At which point we may still want to access the end of the device, but the buffer we use to access it partially extends past the end. The 'bd_set_size()' function intentionally sets the block size to avoid this, but mounting the device - or setting the block size by hand to some other value - can modify that block size. So instead, teach 'submit_bh()' about the special case of the buffer head straddling the end of the device, and turning such an access into a smaller IO access, avoiding the problem. This, btw, also means that unlike before, we can now access the whole device regardless of device block size setting. So now, even if the device size is only 512-byte aligned, we can read and write even the last sector even when having a much bigger block size for accessing the rest of the device. So with this, we could now get rid of the 'bd_set_size()' block size code entirely - resulting in faster IO for the common case - but that would be a separate patch. Reported-and-tested-by: Romain Francoise Reporeted-and-tested-by: Meelis Roos Reported-by: Tony Luck Signed-off-by: Linus Torvalds --- fs/buffer.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 3586fb05c8ce..c4e11390a44c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2893,6 +2893,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) bio_put(bio); } +/* + * This allows us to do IO even on the odd last sectors + * of a device, even if the bh block size is some multiple + * of the physical sector size. + * + * We'll just truncate the bio to the size of the device, + * and clear the end of the buffer head manually. + * + * Truly out-of-range accesses will turn into actual IO + * errors, this only handles the "we need to be able to + * do IO at the final sector" case. + */ +static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) +{ + sector_t maxsector; + unsigned bytes; + + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_sector; + bytes = bio->bi_size; + if (likely((bytes >> 9) <= maxsector)) + return; + + /* Uhhuh. We've got a bh that straddles the device size! */ + bytes = maxsector << 9; + + /* Truncate the bio.. */ + bio->bi_size = bytes; + bio->bi_io_vec[0].bv_len = bytes; + + /* ..and clear the end of the buffer for reads */ + if (rw & READ) { + void *kaddr = kmap_atomic(bh->b_page); + memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); + kunmap_atomic(kaddr); + } +} + int submit_bh(int rw, struct buffer_head * bh) { struct bio *bio; @@ -2929,6 +2978,9 @@ int submit_bh(int rw, struct buffer_head * bh) bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; + /* Take care of bh's that straddle the end of the device */ + guard_bh_eod(rw, bio, bh); + bio_get(bio); submit_bio(rw, bio); -- cgit v1.2.3 From 27d7c2a006a81c04fab00b8cd81b99af3b32738d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Dec 2012 20:01:24 +0300 Subject: vfs: clear to the end of the buffer on partial buffer reads READ is zero so the "rw & READ" test is always false. The intended test was "((rw & RW_MASK) == READ)". Signed-off-by: Dan Carpenter Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index c4e11390a44c..ec0aca8ba6bf 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2935,7 +2935,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) bio->bi_io_vec[0].bv_len = bytes; /* ..and clear the end of the buffer for reads */ - if (rw & READ) { + if ((rw & RW_MASK) == READ) { void *kaddr = kmap_atomic(bh->b_page); memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); kunmap_atomic(kaddr); -- cgit v1.2.3