From c5c7b8ddfbf8cb3b2291e515a34ab1b8982f5a2d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 15 Jun 2014 23:46:28 -0400 Subject: ext4: Fix buffer double free in ext4_alloc_branch() Error recovery in ext4_alloc_branch() calls ext4_forget() even for buffer corresponding to indirect block it did not allocate. This leads to brelse() being called twice for that buffer (once from ext4_forget() and once from cleanup in ext4_ind_map_blocks()) leading to buffer use count misaccounting. Eventually (but often much later because there are other users of the buffer) we will see messages like: VFS: brelse: Trying to free free buffer Another manifestation of this problem is an error: JBD2 unexpected failure: jbd2_journal_revoke: !buffer_revoked(bh); inconsistent data on disk The fix is easy - don't forget buffer we did not allocate. Also add an explanatory comment because the indexing at ext4_alloc_branch() is somewhat subtle. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/indirect.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 8a57e9fcd1b9..f85bafd474dc 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return 0; failed: for (; i >= 0; i--) { - if (i != indirect_blks && branch[i].bh) + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) ext4_forget(handle, 1, inode, branch[i].bh, branch[i].bh->b_blocknr); ext4_free_blocks(handle, inode, NULL, new_blocks[i], -- cgit v1.2.3 From ccfb30001f37ace4690a74c27b4812cf054e123a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Jun 2014 13:02:11 +0900 Subject: f2fs: fix to report newly allocate region as extent Previous get_block in f2fs didn't report the newly allocated region which has NEW_ADDR. For reader, it should not report, but fiemap needs this. So, this patch introduces two get_block sharing core function. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0924521306b4..f8cf619edb5f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -608,8 +608,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, bool fiemap) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; @@ -637,7 +637,7 @@ static int get_data_block(struct inode *inode, sector_t iblock, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; if (dn.data_blkaddr != NULL_ADDR) { @@ -671,7 +671,7 @@ get_next: err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); @@ -708,10 +708,23 @@ out: return err; } +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, false); +} + +static int get_data_block_fiemap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, true); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, get_data_block); + return generic_block_fiemap(inode, fieinfo, + start, len, get_data_block_fiemap); } static int f2fs_read_data_page(struct file *file, struct page *page) -- cgit v1.2.3 From ead432756ab2c76b1f1de742a1c8a06992cb98eb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Jun 2014 13:05:55 +0900 Subject: f2fs: recover fallocated data and its i_size together This patch arranges the f2fs_locks to cover the fallocated data and its i_size. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c58e33075719..f6c4bdaaae86 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -659,13 +659,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset, off_start = offset & (PAGE_CACHE_SIZE - 1); off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + f2fs_lock_op(sbi); + for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; - f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_reserve_block(&dn, index); - f2fs_unlock_op(sbi); if (ret) break; @@ -683,8 +683,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); - f2fs_write_inode(inode, NULL); + update_inode_page(inode); } + f2fs_unlock_op(sbi); return ret; } -- cgit v1.2.3 From 98397ff3cddcfdedd2ba1701bd30a73c1d733769 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Jun 2014 13:07:31 +0900 Subject: f2fs: fix not to allocate unnecessary blocks during fallocate This patch fixes the fallocate bug like below. (See xfstests/255) In fallocate(fd, 0, 20480), expand_inode_data processes for (index = pg_start; index <= pg_end; index++) { f2fs_reserve_block(); ... } So, even though fallocate requests 20480, 5 blocks, f2fs allocates 6 blocks including pg_end. So, this patch adds one condition to avoid block allocation. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f6c4bdaaae86..7d8b96275092 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -664,11 +664,14 @@ static int expand_inode_data(struct inode *inode, loff_t offset, for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; + if (index == pg_end && !off_end) + goto noalloc; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_reserve_block(&dn, index); if (ret) break; - +noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) -- cgit v1.2.3 From ec7756ae1517af483d995f386936d00a4cb1ab7d Mon Sep 17 00:00:00 2001 From: T Makphaibulchoke Date: Wed, 25 Jun 2014 22:08:29 -0400 Subject: fs/mbcache: replace __builtin_log2() with ilog2() Fix compiler error with some gcc version(s) that do not support __builtin_log2() by replacing __builtin_log2() with ilog2(). Signed-off-by: T. Makphaibulchoke Signed-off-by: Theodore Ts'o Reviewed-by: Maciej W. Rozycki --- fs/mbcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/mbcache.c b/fs/mbcache.c index bf166e388f0d..187477ded6b3 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -73,6 +73,7 @@ #include #include #include +#include #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -93,7 +94,7 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) -#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) #define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) -- cgit v1.2.3 From e43bb4e612b402a631bc549ac496f78bc7a79438 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 26 Jun 2014 10:11:53 -0400 Subject: ext4: decrement free clusters/inodes counters when block group declared bad We should decrement free clusters counter when block bitmap is marked as corrupt and free inodes counter when the allocation bitmap is marked as corrupt to avoid misunderstanding due to incorrect available size in statfs result. User can get immediately ENOSPC error from write begin without reaching for the writepages. Cc: Darrick J. Wong Reported-by: Amit Sahrawat Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan --- fs/ext4/balloc.c | 16 ++++++++++++++++ fs/ext4/ialloc.c | 23 +++++++++++++++++++++++ fs/ext4/mballoc.c | 8 ++++++++ 3 files changed, 47 insertions(+) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0762d143e252..fca382037ddd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -194,7 +194,16 @@ static void ext4_init_block_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -359,6 +368,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return; @@ -369,6 +379,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -376,6 +389,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0ee59a6644e2..a87455df38bc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, struct ext4_group_desc *gdp) { struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent @@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +196,12 @@ verify: ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } @@ -321,6 +338,12 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 59e31622cc6e..7f72f50a8fa7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; @@ -759,6 +760,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -1431,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1441,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); /* Mark the block group as corrupt. */ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &e4b->bd_info->bb_state); -- cgit v1.2.3 From 77ea2a4ba657a1ad4fb7c64bc5cdce84b8a132b6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Jun 2014 12:28:57 -0400 Subject: ext4: Fix block zeroing when punching holes in indirect block files free_holes_block() passed local variable as a block pointer to ext4_clear_blocks(). Thus ext4_clear_blocks() zeroed out this local variable instead of proper place in inode / indirect block. We later zero out proper place in inode / indirect block but don't dirty the inode / buffer again which can lead to subtle issues (some changes e.g. to inode can be lost). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/indirect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index f85bafd474dc..6f3bb55567b6 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1335,8 +1335,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, if (level == 0 || (bh && all_zeroes((__le32 *)bh->b_data, (__le32 *)bh->b_data + addr_per_block))) { - ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); - *i_data = 0; + ext4_free_data(handle, inode, parent_bh, + i_data, i_data + 1); } brelse(bh); bh = NULL; -- cgit v1.2.3 From a93cd4cf86466caa49cfe64607bea7f0bde3f916 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Jun 2014 12:30:54 -0400 Subject: ext4: Fix hole punching for files with indirect blocks Hole punching code for files with indirect blocks wrongly computed number of blocks which need to be cleared when traversing the indirect block tree. That could result in punching more blocks than actually requested and thus effectively cause a data loss. For example: fallocate -n -p 10240000 4096 will punch the range 10240000 - 12632064 instead of the range 1024000 - 10244096. Fix the calculation. CC: stable@vger.kernel.org Fixes: 8bad6fc813a3a5300f51369c39d315679fd88c72 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/indirect.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 6f3bb55567b6..fd69da194826 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1316,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, blk = *i_data; if (level > 0) { ext4_lblk_t first2; + ext4_lblk_t count2; + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } - first2 = (first > offset) ? first - offset : 0; + if (first > offset) { + first2 = first - offset; + count2 = count; + } else { + first2 = 0; + count2 = count - (offset - first); + } ret = free_hole_blocks(handle, inode, bh, (__le32 *)bh->b_data, level - 1, - first2, count - offset, + first2, count2, inode->i_sb->s_blocksize >> 2); if (ret) { brelse(bh); -- cgit v1.2.3 From 76f47128f9b33af1e96819746550d789054c9664 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 19 Jun 2014 16:44:48 -0400 Subject: nfsd: fix rare symlink decoding bug An NFS operation that creates a new symlink includes the symlink data, which is xdr-encoded as a length followed by the data plus 0 to 3 bytes of zero-padding as required to reach a 4-byte boundary. The vfs, on the other hand, wants null-terminated data. The simple way to handle this would be by copying the data into a newly allocated buffer with space for the final null. The current nfsd_symlink code tries to be more clever by skipping that step in the (likely) case where the byte following the string is already 0. But that assumes that the byte following the string is ours to look at. In fact, it might be the first byte of a page that we can't read, or of some object that another task might modify. Worse, the NFSv4 code tries to fix the problem by actually writing to that byte. In the NFSv2/v3 cases this actually appears to be safe: - nfs3svc_decode_symlinkargs explicitly null-terminates the data (after first checking its length and copying it to a new page). - NFSv2 limits symlinks to 1k. The buffer holding the rpc request is always at least a page, and the link data (and previous fields) have maximum lengths that prevent the request from reaching the end of a page. In the NFSv4 case the CREATE op is potentially just one part of a long compound so can end up on the end of a page if you're unlucky. The minimal fix here is to copy and null-terminate in the NFSv4 case. The nfsd_symlink() interface here seems too fragile, though. It should really either do the copy itself every time or just require a null-terminated string. Reported-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 9 --------- fs/nfsd/nfs4xdr.c | 13 ++++++++++++- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6851b003f2a4..8f029db5d271 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -617,15 +617,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (create->cr_type) { case NF4LNK: - /* ugh! we have to null-terminate the linktext, or - * vfs_symlink() will choke. it is always safe to - * null-terminate by brute force, since at worst we - * will overwrite the first byte of the create namelen - * in the XDR buffer, which has already been extracted - * during XDR decode. - */ - create->cr_linkname[create->cr_linklen] = 0; - status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, create->cr_linkname, create->cr_linklen, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 83baf2bfe9e9..5b4fef55676a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -600,7 +600,18 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create READ_BUF(4); create->cr_linklen = be32_to_cpup(p++); READ_BUF(create->cr_linklen); - SAVEMEM(create->cr_linkname, create->cr_linklen); + /* + * The VFS will want a null-terminated string, and + * null-terminating in place isn't safe since this might + * end on a page boundary: + */ + create->cr_linkname = + kmalloc(create->cr_linklen + 1, GFP_KERNEL); + if (!create->cr_linkname) + return nfserr_jukebox; + memcpy(create->cr_linkname, p, create->cr_linklen); + create->cr_linkname[create->cr_linklen] = '\0'; + defer_free(argp, kfree, create->cr_linkname); break; case NF4BLK: case NF4CHR: -- cgit v1.2.3 From 9b4eaf43f4b0207b5d1ca8b8d22df88ea9e142fe Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:35:59 +0800 Subject: btrfs: rename add_device_membership to btrfs_kobj_add_device Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index df39458f1487..1395efbffa2d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -605,7 +605,7 @@ static void init_feature_attrs(void) } } -static int add_device_membership(struct btrfs_fs_info *fs_info) +static int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info) { int error = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -666,7 +666,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; - error = add_device_membership(fs_info); + error = btrfs_kobj_add_device(fs_info); if (error) goto failure; -- cgit v1.2.3 From 99994cde9c59c2b8bb67d46d531b26cc73e39747 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:36:00 +0800 Subject: btrfs: dev delete should remove sysfs entry when we delete the device from the mounted btrfs, we would need its corresponding sysfs enty to be removed as well. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 20 ++++++++++++++++++++ fs/btrfs/sysfs.h | 2 ++ fs/btrfs/volumes.c | 4 ++++ 3 files changed, 26 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1395efbffa2d..401677b11045 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -605,6 +605,26 @@ static void init_feature_attrs(void) } } +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } + + return 0; +} + static int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info) { int error = 0; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9ab576318a84..529554e5f9de 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -66,4 +66,6 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c83b24251e53..be2c8e285afb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,7 @@ #include "rcu-string.h" #include "math.h" #include "dev-replace.h" +#include "sysfs.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1680,6 +1681,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + call_rcu(&device->rcu, free_device); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; -- cgit v1.2.3 From 0d39376aa28eba6d63d0120ccc399735842abc8e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:36:01 +0800 Subject: btrfs: dev add should add its sysfs entry we would need the device links to be created, when device is added. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 12 +++++++++--- fs/btrfs/sysfs.h | 2 ++ fs/btrfs/volumes.c | 5 +++++ 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 401677b11045..78699364f537 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -625,14 +625,17 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, return 0; } -static int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info) +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) { int error = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *dev; - fs_info->device_dir_kobj = kobject_create_and_add("devices", + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) return -ENOMEM; @@ -643,6 +646,9 @@ static int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info) if (!dev->bdev) continue; + if (one_device && one_device != dev) + continue; + disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; @@ -686,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; - error = btrfs_kobj_add_device(fs_info); + error = btrfs_kobj_add_device(fs_info, NULL); if (error) goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 529554e5f9de..ac46df37504c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -66,6 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_device *one_device); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index be2c8e285afb..19b67e969b52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2147,6 +2147,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { @@ -2209,6 +2213,7 @@ error_trans: unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); -- cgit v1.2.3 From 49c6f736f34f901117c20960ebd7d5e60f12fcac Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:36:02 +0800 Subject: btrfs: dev replace should replace the sysfs entry when we replace the device its corresponding sysfs entry has to be replaced as well Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2af6e66fe788..eea26e1b2fda 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "sysfs.h" static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); @@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); -- cgit v1.2.3 From b2373f255cacdc1ea4da25e75a5a78949ffd9d66 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Jun 2014 11:36:03 +0800 Subject: btrfs: create sprout should rename fsid on the sysfs as well Creating sprout will change the fsid of the mounted root. do the same on the sysfs as well. reproducer: mount /dev/sdb /btrfs (seed disk) btrfs dev add /dev/sdc /btrfs mount -o rw,remount /btrfs btrfs dev del /dev/sdb /btrfs mount /dev/sdb /btrfs Error: kobject_add_internal failed for fe350492-dc28-4051-a601-e017b17e6145 with -EEXIST, don't try to register things with the same name in the same directory. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19b67e969b52..6ca0d9cc46f9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2154,6 +2154,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = init_first_rw_device(trans, root, device); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2164,6 +2165,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_abort_transaction(trans, root, ret); goto error_trans; } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); if (ret) { -- cgit v1.2.3 From 5588383ece6127909df5b9d601d562fe5b9fe38a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 24 Jun 2014 15:39:16 +0800 Subject: Btrfs: fix crash when mounting raid5 btrfs with missing disks The reproducer is $ mkfs.btrfs D1 D2 D3 -mraid5 $ mkfs.ext4 D2 && mkfs.ext4 D3 $ mount D1 /btrfs -odegraded ------------------- [ 87.672992] ------------[ cut here ]------------ [ 87.673845] kernel BUG at fs/btrfs/raid56.c:1828! ... [ 87.673845] RIP: 0010:[] [] __raid_recover_end_io+0x4ae/0x4d0 ... [ 87.673845] Call Trace: [ 87.673845] [] ? mempool_free+0x36/0xa0 [ 87.673845] [] raid_recover_end_io+0x75/0xa0 [ 87.673845] [] bio_endio+0x5b/0xa0 [ 87.673845] [] bio_endio_nodec+0x12/0x20 [ 87.673845] [] end_workqueue_fn+0x41/0x50 [ 87.673845] [] normal_work_helper+0xca/0x2c0 [ 87.673845] [] process_one_work+0x1eb/0x530 [ 87.673845] [] ? process_one_work+0x189/0x530 [ 87.673845] [] worker_thread+0x11b/0x4f0 [ 87.673845] [] ? rescuer_thread+0x290/0x290 [ 87.673845] [] kthread+0xe4/0x100 [ 87.673845] [] ? kthread_create_on_node+0x220/0x220 [ 87.673845] [] ret_from_fork+0x7c/0xb0 [ 87.673845] [] ? kthread_create_on_node+0x220/0x220 ------------------- It's because that we miscalculate @rbio->bbio->error so that it doesn't reach maximum of tolerable errors while it should have. Signed-off-by: Liu Bo Tested-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/raid56.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4055291a523e..4a88f073fdd7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * pages are going to be uptodate. */ for (stripe = 0; stripe < bbio->num_stripes; stripe++) { - if (rbio->faila == stripe || - rbio->failb == stripe) + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->bbio->error); continue; + } for (pagenr = 0; pagenr < nr_pages; pagenr++) { struct page *p; -- cgit v1.2.3 From c39aa7056f50fa7b49265ea35c0f3eddb7c7be68 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 25 Jun 2014 00:00:44 +0900 Subject: btrfs compression: reuse recently used workspace Add compression `workspace' in free_workspace() to `idle_workspace' list head, instead of tail. So we have better chances to reuse most recently used `workspace'. Signed-off-by: Sergey Senozhatsky Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 92371c414228..1daea0b47187 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace) spin_lock(workspace_lock); if (*num_workspace < num_online_cpus()) { - list_add_tail(workspace, idle_workspace); + list_add(workspace, idle_workspace); (*num_workspace)++; spin_unlock(workspace_lock); goto wake; -- cgit v1.2.3 From 46c4e71e9b02a649c722f06569f5b6575da02dba Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Jun 2014 17:48:28 +0100 Subject: Btrfs: assert send doesn't attempt to start transactions When starting a transaction just assert that current->journal_info doesn't contain a send transaction stub, since send isn't supposed to start transactions and when it finishes (either successfully or not) it's supposed to set current->journal_info to NULL. This is motivated by the change titled: Btrfs: fix crash when starting transaction Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 511839c04f11..a32ad65b6cdc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -386,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, bool reloc_reserved = false; int ret; + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info && - current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { + if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; -- cgit v1.2.3 From 472b909ff6f4884d235ef7b9d3847fad5efafbff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 25 Jun 2014 13:45:41 -0700 Subject: btrfs: only unlock block in verify_parent_transid if we locked it This is a regression from my patch a26e8c9f75b0bfd8cccc9e8f110737b136eb5994, we need to only unlock the block if we were the one who locked it. Otherwise this will trip BUG_ON()'s in locking.c Thanks, cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8bb4aa19898f..f00165de6fbf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -369,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); - btrfs_tree_read_unlock_blocking(eb); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); return ret; } -- cgit v1.2.3 From 4e26445faad366d67d7723622bf6a60a6f0f5993 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:50:28 +0800 Subject: kernfs: introduce kernfs_pin_sb() kernfs_pin_sb() tries to get a refcnt of the superblock. This will be used by cgroupfs. v2: - make kernfs_pin_sb() return the superblock. - drop kernfs_drop_sb(). tj: Updated the comment a bit. [ This is a prerequisite for a bugfix. ] Cc: # 3.15 Acked-by: Greg Kroah-Hartman Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- fs/kernfs/mount.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'fs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d171b98a6cdd..f973ae9b05f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb) kernfs_put(root_kn); } +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @kernfs_root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so the superblock won't be destroyed in subsequent + * operations. This can be used to block ->kill_sb() which may be useful + * for kernfs users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated to this kernfs_root, or + * -EINVAL if the superblock is being freed. + */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", -- cgit v1.2.3 From ecca47ce8294843045e7465d76fee84dbf07a004 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 1 Jul 2014 16:41:03 -0400 Subject: kernfs: kernfs_notify() must be useable from non-sleepable contexts d911d9874801 ("kernfs: make kernfs_notify() trigger inotify events too") added fsnotify triggering to kernfs_notify() which requires a sleepable context. There are already existing users of kernfs_notify() which invoke it from an atomic context and in general it's silly to require a sleepable context for triggering a notification. The following is an invalid context bug triggerd by md invoking sysfs_notify() from IO completion path. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1 2 locks held by swapper/1/0: #0: (&(&vblk->vq_lock)->rlock){-.-...}, at: [] virtblk_done+0x42/0xe0 [virtio_blk] #1: (&(&bitmap->counts.lock)->rlock){-.....}, at: [] bitmap_endwrite+0x68/0x240 irq event stamp: 33518 hardirqs last enabled at (33515): [] default_idle+0x1f/0x230 hardirqs last disabled at (33516): [] common_interrupt+0x6d/0x72 softirqs last enabled at (33518): [] _local_bh_enable+0x22/0x50 softirqs last disabled at (33517): [] irq_enter+0x60/0x80 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.16.0-0.rc2.git2.1.fc21.x86_64 #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000000 f90db13964f4ee05 ffff88007d403b80 ffffffff81807b4c 0000000000000000 ffff88007d403ba8 ffffffff810d4f14 0000000000000000 0000000000441800 ffff880078fa1780 ffff88007d403c38 ffffffff8180caf2 Call Trace: [] dump_stack+0x4d/0x66 [] __might_sleep+0x184/0x240 [] mutex_lock_nested+0x42/0x440 [] kernfs_notify+0x90/0x150 [] bitmap_endwrite+0xcc/0x240 [] close_write+0x93/0xb0 [raid1] [] r1_bio_write_done+0x29/0x50 [raid1] [] raid1_end_write_request+0xe4/0x260 [raid1] [] bio_endio+0x6b/0xa0 [] blk_update_request+0x94/0x420 [] blk_mq_end_io+0x1a/0x70 [] virtblk_request_done+0x32/0x80 [virtio_blk] [] __blk_mq_complete_request+0x88/0x120 [] blk_mq_complete_request+0x2a/0x30 [] virtblk_done+0x66/0xe0 [virtio_blk] [] vring_interrupt+0x3a/0xa0 [virtio_ring] [] handle_irq_event_percpu+0x77/0x340 [] handle_irq_event+0x3d/0x60 [] handle_edge_irq+0x66/0x130 [] handle_irq+0x84/0x150 [] do_IRQ+0x4d/0xe0 [] common_interrupt+0x72/0x72 [] ? native_safe_halt+0x6/0x10 [] default_idle+0x24/0x230 [] arch_cpu_idle+0xf/0x20 [] cpu_startup_entry+0x37c/0x7b0 [] start_secondary+0x25b/0x300 This patch fixes it by punting the notification delivery through a work item. This ends up adding an extra pointer to kernfs_elem_attr enlarging kernfs_node by a pointer, which is not ideal but not a very big deal either. If this turns out to be an actual issue, we can move kernfs_elem_attr->size to kernfs_node->iattr later. Signed-off-by: Tejun Heo Reported-by: Josh Boyer Cc: Jens Axboe Reviewed-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 69 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e3d37f607f97..d895b4b7b661 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -39,6 +39,19 @@ struct kernfs_open_node { struct list_head files; /* goes through kernfs_open_file.list */ }; +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for NULL. + */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; @@ -783,24 +796,25 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) return DEFAULT_POLLMASK|POLLERR|POLLPRI; } -/** - * kernfs_notify - notify a kernfs file - * @kn: file to notify - * - * Notify @kn such that poll(2) on @kn wakes up. - */ -void kernfs_notify(struct kernfs_node *kn) +static void kernfs_notify_workfn(struct work_struct *work) { - struct kernfs_root *root = kernfs_root(kn); + struct kernfs_node *kn; struct kernfs_open_node *on; struct kernfs_super_info *info; - unsigned long flags; - - if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); /* kick poll */ - spin_lock_irqsave(&kernfs_open_node_lock, flags); + spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) { @@ -808,12 +822,12 @@ void kernfs_notify(struct kernfs_node *kn) wake_up_interruptible(&on->poll); } - spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + spin_unlock_irq(&kernfs_open_node_lock); /* kick fsnotify */ mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct inode *inode; struct dentry *dentry; @@ -833,6 +847,33 @@ void kernfs_notify(struct kernfs_node *kn) } mutex_unlock(&kernfs_mutex); + kernfs_put(kn); + goto repeat; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any + * context. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); + unsigned long flags; + + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); + } + spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); -- cgit v1.2.3 From 69bbd9c7b99974f3a701d4de6ef7010c37182a47 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jun 2014 17:23:12 +0300 Subject: nfs: fix nfs4d readlink truncated packet XDR requires 4-byte alignment; nfs4d READLINK reply writes out the padding, but truncates the packet to the padding-less size. Fix by taking the padding into consideration when truncating the packet. Symptoms: # ll /mnt/ ls: cannot read symbolic link /mnt/test: Input/output error total 4 -rw-r--r--. 1 root root 0 Jun 14 01:21 123456 lrwxrwxrwx. 1 root root 6 Jul 2 03:33 test drwxr-xr-x. 1 root root 0 Jul 2 23:50 tmp drwxr-xr-x. 1 root root 60 Jul 2 23:44 tree Signed-off-by: Avi Kivity Fixes: 476a7b1f4b2c (nfsd4: don't treat readlink like a zero-copy operation) Reviewed-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5b4fef55676a..2fc7abebeb9b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3278,7 +3278,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd wire_count = htonl(maxcount); write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); - xdr_truncate_encode(xdr, length_offset + 4 + maxcount); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); if (maxcount & 3) write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero, 4 - (maxcount&3)); -- cgit v1.2.3 From 3cc79392558f1789e5e1d2fce44b681980f403c3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Jun 2014 22:36:02 +0100 Subject: Btrfs: atomically set inode->i_flags in btrfs_update_iflags This change is based on the corresponding recent change for ext4: ext4: atomically set inode->i_flags in ext4_set_inode_flags() That has the following commit message that applies to btrfs as well: "Use cmpxchg() to atomically set i_flags instead of clearing out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from the EXT4_IMMUTABLE_FL, EXT4_APPEND_FL flags, since this opens up a race where an immutable file has the immutable flag cleared for a brief window of time." Replacing EXT4_IMMUTABLE_FL and EXT4_APPEND_FL with BTRFS_INODE_IMMUTABLE and BTRFS_INODE_APPEND, respectively. Reviewed-by: David Sterba Reviewed-by: Satoru Takeuchi Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6ea15469c63f..02dc64bbf1cb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* -- cgit v1.2.3 From 5f3164813b90f7dbcb5c3ab9006906222ce471b7 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 26 Jun 2014 11:08:16 +0800 Subject: Btrfs: fix race between balance recovery and root deletion Balance recovery is called when RW mounting or remounting from RO to RW, it is called to finish roots merging. When doing balance recovery, relocation root's corresponding fs root(whose root refs is 0) might be destroyed by cleaner thread, this will make btrfs fail to mount. Fix this problem by holding @cleaner_mutex when doing balance recovery. Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/super.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f00165de6fbf..08e65e9cf2aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2905,7 +2905,9 @@ retry_root_backup: if (ret) goto fail_qgroup; + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING "BTRFS: failed to recover relocation\n"); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4662d92a4b73..b6ebde231de7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1467,7 +1467,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret) goto restore; -- cgit v1.2.3 From 2aa06a35d06a34b3109bdbf1d653de1695dc8f12 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jun 2014 16:50:31 -0500 Subject: btrfs: fix nossd and ssd_spread mount option regression The commit 0780253 btrfs: Cleanup the btrfs_parse_options for remount. broke ssd options quite badly; it stopped making ssd_spread imply ssd, and it made "nossd" unsettable. Put things back at least as well as they were before (though ssd mount option handling is still pretty odd: # mount -o "nossd,ssd_spread" works?) Reported-by: Roman Mamedov Signed-off-by: Eric Sandeen Signed-off-by: Chris Mason --- fs/btrfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b6ebde231de7..0927e463afca 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_ssd_spread: btrfs_set_and_info(root, SSD_SPREAD, "use spread ssd allocation scheme"); + btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_clear_and_info(root, NOSSD, + btrfs_set_and_info(root, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; -- cgit v1.2.3 From e755f780865221252ef3321215c9796b78e7b1c5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 30 Jun 2014 17:12:47 +0800 Subject: btrfs: fix null pointer dereference in clone_fs_devices when name is null when one of the device path is missing btrfs_device name is null. So this patch will check for that. stack: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] strlen+0x0/0x30 [] ? clone_fs_devices+0xaa/0x160 [btrfs] [] btrfs_init_new_device+0x317/0xca0 [btrfs] [] ? __kmalloc_track_caller+0x15a/0x1a0 [] btrfs_ioctl+0xaa3/0x2860 [btrfs] [] ? handle_mm_fault+0x48c/0x9c0 [] ? __blkdev_put+0x171/0x180 [] ? __do_page_fault+0x4ac/0x590 [] ? blkdev_put+0x106/0x110 [] ? mntput+0x35/0x40 [] do_vfs_ioctl+0x460/0x4a0 [] ? ____fput+0xe/0x10 [] ? task_work_run+0xb3/0xd0 [] SyS_ioctl+0x57/0x90 [] ? do_page_fault+0xe/0x10 [] system_call_fastpath+0x16/0x1b reproducer: mkfs.btrfs -draid1 -mraid1 /dev/sdg1 /dev/sdg2 btrfstune -S 1 /dev/sdg1 modprobe -r btrfs && modprobe btrfs mount -o degraded /dev/sdg1 /btrfs btrfs dev add /dev/sdg3 /btrfs Signed-off-by: Anand Jain Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6ca0d9cc46f9..6104676857f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -555,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * This is ok to do without rcu read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. */ - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); - if (!name) { - kfree(device); - goto error; + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); } - rcu_assign_pointer(device->name, name); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; -- cgit v1.2.3 From 0aeb8a6e67cddeac1d42cf64795fde0641a1cffb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 30 Jun 2014 17:12:48 +0800 Subject: btrfs: fix null pointer dereference in btrfs_show_devname when name is null dev->name is null but missing flag is not set. Strictly speaking the missing flag should have been set, but there are more places where code just checks if name is null. For now this patch does the same. stack: BUG: unable to handle kernel NULL pointer dereference at 0000000000000064 IP: [] btrfs_show_devname+0x58/0xf0 [btrfs] [] show_vfsmnt+0x39/0x130 [] m_show+0x16/0x20 [] seq_read+0x296/0x390 [] vfs_read+0x9d/0x160 [] SyS_read+0x49/0x90 [] system_call_fastpath+0x16/0x1b reproducer: mkfs.btrfs -draid1 -mraid1 /dev/sdg1 /dev/sdg2 btrfstune -S 1 /dev/sdg1 modprobe -r btrfs && modprobe btrfs mount -o degraded /dev/sdg1 /btrfs btrfs dev add /dev/sdg3 /btrfs Signed-off-by: Anand Jain Signed-off-by: Chris Mason --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0927e463afca..8e16bca69c56 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1811,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; + if (!dev->name) + continue; if (!first_dev || dev->devid < first_dev->devid) first_dev = dev; } -- cgit v1.2.3 From 14f5979633a67de81b9bd4a36a0eb99125728f9b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 29 Jun 2014 21:45:40 +0100 Subject: Btrfs: fix use-after-free when cloning a trailing file hole The transaction handle was being used after being freed. Cc: Chris Mason Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02dc64bbf1cb..2a99f4987bb1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3142,7 +3142,6 @@ out: static void clone_update_extent_map(struct inode *inode, const struct btrfs_trans_handle *trans, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, const u64 hole_offset, const u64 hole_len) { @@ -3157,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode, return; } - if (fi) { + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); btrfs_extent_item_to_extent_map(inode, path, fi, false, em); em->generation = -1; if (btrfs_file_extent_type(path->nodes[0], fi) == @@ -3511,18 +3514,15 @@ process_slot: btrfs_item_ptr_offset(leaf, slot), size); inode_add_bytes(inode, datal); - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); } /* If we have an implicit hole (NO_HOLES feature). */ if (drop_start < new_key.offset) clone_update_extent_map(inode, trans, - path, NULL, drop_start, + NULL, drop_start, new_key.offset - drop_start); - clone_update_extent_map(inode, trans, path, - extent, 0, 0); + clone_update_extent_map(inode, trans, path, 0, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -3565,12 +3565,10 @@ process_slot: btrfs_end_transaction(trans, root); goto out; } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen); - if (ret) - goto out; - clone_update_extent_map(inode, trans, path, NULL, last_dest_end, - destoff + len - last_dest_end); } out: -- cgit v1.2.3 From 0a4eaea892a479aeebccde65986b27cfb6e33a78 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 20 Jun 2014 11:31:44 +0200 Subject: btrfs: remove stale comment from btrfs_flush_all_pending_stuffs Commit fcebe4562dec83b3f8d3088d77584727b09130b2 (Btrfs: rework qgroup accounting) removed the qgroup accounting after delayed refs. Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a32ad65b6cdc..98e76a333596 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1617,11 +1617,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_items(trans, root); - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ if (ret) return ret; -- cgit v1.2.3 From 130d5b415a091e493ac1508b9d27bbb85ba7b8c0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 20 Jun 2014 11:43:20 +0200 Subject: btrfs: use E2BIG instead of EIO if compression does not help Return codes got updated in 60e1975acb48fc3d74a3422b21dde74c977ac3d5 (btrfs: return errno instead of -1 from compression) lzo wrapper returns E2BIG in this case, do the same for zlib. Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 4f196314c0c1..b67d8fc81277 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -EIO; + ret = -E2BIG; goto out; } /* we need another page for writing out. Test this -- cgit v1.2.3 From d288db5dc0110c8e0732d099aaf7a05e2ea0e0c8 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 2 Jul 2014 16:58:01 +0800 Subject: Btrfs: fix race of using total_bytes_pinned This percpu counter @total_bytes_pinned is introduced to skip unnecessary operations of 'commit transaction', it accounts for those space we may free but are stuck in delayed refs. And we zero out @space_info->total_bytes_pinned every transaction period so we have a better idea of how much space we'll actually free up by committing this transaction. However, we do the 'zero out' part a little earlier, before we actually unpin space, so we end up returning ENOSPC when we actually have free space that's just unpinned from committing transaction. xfstests/generic/074 complained then. This fixes it by actually accounting the percpu pinned number when 'unpin', and since it's protected by space_info->lock, the race is gone now. Signed-off-by: Liu Bo Reviewed-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99c253918208..813537f362f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5678,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; down_write(&fs_info->commit_root_sem); @@ -5701,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->commit_root_sem); - list_for_each_entry_rcu(space_info, &fs_info->space_info, list) - percpu_counter_set(&space_info->total_bytes_pinned, 0); - update_global_block_rsv(fs_info); } @@ -5741,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; -- cgit v1.2.3 From be2c765dff9e5584965f78853c2addd2bb926946 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 Jul 2014 10:20:48 -0700 Subject: Btrfs: fix btrfs_print_leaf for skinny metadata We wouldn't actuall print the extent information if we had a skinny metadata item, this fixes that. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/print-tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6efd70d3b64f..9626b4ad3b9a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot) +static void print_extent_item(struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - int type; u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; @@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot) btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), flags); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); @@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); break; case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref\n"); -- cgit v1.2.3 From abdd2e80a57e5f7278f47913315065f0a3d78d20 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Jun 2014 17:46:58 +0100 Subject: Btrfs: fix crash when starting transaction Often when starting a transaction we commit the currently running transaction, which can end up writing block group caches when the current process has its journal_info set to NULL (and not to a transaction). This makes our assertion at btrfs_check_data_free_space() (current_journal != NULL) fail, resulting in a crash/hang. Therefore fix it by setting journal_info. Two different traces of this issue follow below. 1) [51502.241936] BTRFS: assertion failed: current->journal_info, file: fs/btrfs/extent-tree.c, line: 3670 [51502.242213] ------------[ cut here ]------------ [51502.242493] kernel BUG at fs/btrfs/ctree.h:3964! [51502.242669] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC (...) [51502.244010] Call Trace: [51502.244010] [] btrfs_check_data_free_space+0x395/0x3a0 [btrfs] [51502.244010] [] btrfs_write_dirty_block_groups+0x4ac/0x640 [btrfs] [51502.244010] [] commit_cowonly_roots+0x164/0x226 [btrfs] [51502.244010] [] btrfs_commit_transaction+0x4ed/0xab0 [btrfs] [51502.244010] [] ? _raw_spin_unlock+0x2b/0x40 [51502.244010] [] start_transaction+0x459/0x620 [btrfs] [51502.244010] [] btrfs_start_transaction+0x1b/0x20 [btrfs] [51502.244010] [] __unlink_start_trans+0x31/0xe0 [btrfs] [51502.244010] [] btrfs_unlink+0x37/0xc0 [btrfs] [51502.244010] [] ? do_unlinkat+0x114/0x2a0 [51502.244010] [] vfs_unlink+0xcc/0x150 [51502.244010] [] do_unlinkat+0x260/0x2a0 [51502.244010] [] ? filp_close+0x64/0x90 [51502.244010] [] ? trace_hardirqs_on_caller+0x16/0x1e0 [51502.244010] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [51502.244010] [] SyS_unlinkat+0x1b/0x40 [51502.244010] [] system_call_fastpath+0x16/0x1b [51502.244010] Code: 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 89 f1 48 c7 c2 71 13 36 a0 48 89 fe 31 c0 48 c7 c7 b8 43 36 a0 48 89 e5 e8 5d b0 32 e1 <0f> 0b 0f 1f 44 00 00 55 b9 11 00 00 00 48 89 e5 41 55 49 89 f5 [51502.244010] RIP [] assfail.constprop.88+0x1e/0x20 [btrfs] 2) [25405.097230] BTRFS: assertion failed: current->journal_info, file: fs/btrfs/extent-tree.c, line: 3670 [25405.097488] ------------[ cut here ]------------ [25405.097767] kernel BUG at fs/btrfs/ctree.h:3964! [25405.097940] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC (...) [25405.100008] Call Trace: [25405.100008] [] btrfs_check_data_free_space+0x395/0x3a0 [btrfs] [25405.100008] [] btrfs_write_dirty_block_groups+0x4ac/0x640 [btrfs] [25405.100008] [] commit_cowonly_roots+0x164/0x226 [btrfs] [25405.100008] [] btrfs_commit_transaction+0x4ed/0xab0 [btrfs] [25405.100008] [] ? bit_waitqueue+0xc0/0xc0 [25405.100008] [] start_transaction+0x459/0x620 [btrfs] [25405.100008] [] btrfs_start_transaction+0x1b/0x20 [btrfs] [25405.100008] [] btrfs_create+0x47/0x210 [btrfs] [25405.100008] [] ? btrfs_permission+0x3c/0x80 [btrfs] [25405.100008] [] vfs_create+0x9b/0x130 [25405.100008] [] do_last+0x849/0xe20 [25405.100008] [] ? link_path_walk+0x79/0x820 [25405.100008] [] path_openat+0xc5/0x690 [25405.100008] [] ? trace_hardirqs_on+0xd/0x10 [25405.100008] [] ? __alloc_fd+0x32/0x1d0 [25405.100008] [] do_filp_open+0x43/0xa0 [25405.100008] [] ? __alloc_fd+0x151/0x1d0 [25405.100008] [] do_sys_open+0x13c/0x230 [25405.100008] [] ? trace_hardirqs_on_caller+0x16/0x1e0 [25405.100008] [] SyS_open+0x22/0x30 [25405.100008] [] system_call_fastpath+0x16/0x1b [25405.100008] Code: 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 89 f1 48 c7 c2 51 13 36 a0 48 89 fe 31 c0 48 c7 c7 d0 43 36 a0 48 89 e5 e8 6d b5 32 e1 <0f> 0b 0f 1f 44 00 00 55 b9 11 00 00 00 48 89 e5 41 55 49 89 f5 [25405.100008] RIP [] assfail.constprop.88+0x1e/0x20 [btrfs] Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 98e76a333596..5f379affdf23 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -493,6 +493,7 @@ again: smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && may_wait_transaction(root, type)) { + current->journal_info = h; btrfs_commit_transaction(h, root); goto again; } -- cgit v1.2.3 From 571ff4731bc9e5811080f7d16d24c3af7cbca142 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 2 Jul 2014 15:22:35 -0700 Subject: autofs4: fix false positive compile error On strict build environments we can see: fs/autofs4/inode.c: In function 'autofs4_fill_super': fs/autofs4/inode.c:312: error: 'pgrp' may be used uninitialized in this function make[2]: *** [fs/autofs4/inode.o] Error 1 make[1]: *** [fs/autofs4] Error 2 make: *** [fs] Error 2 make: *** Waiting for unfinished jobs.... This is due to the use of pgrp_set being used to indicate pgrp has has been set rather than initializing pgrp itself. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d7bd395ab586..1c55388ae633 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -210,7 +210,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; - int pgrp; + int pgrp = 0; bool pgrp_set = false; int ret = -EINVAL; -- cgit v1.2.3 From f74373a5cc7a0155d232c4e999648c7a95435bb2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Jul 2014 15:22:37 -0700 Subject: /proc/stat: convert to single_open_size() These two patches are supposed to "fix" failed order-4 memory allocations which have been observed when reading /proc/stat. The problem has been observed on s390 as well as on x86. To address the problem change the seq_file memory allocations to fallback to use vmalloc, so that allocations also work if memory is fragmented. This approach seems to be simpler and less intrusive than changing /proc/stat to use an interator. Also it "fixes" other users as well, which use seq_file's single_open() interface. This patch (of 2): Use seq_file's single_open_size() to preallocate a buffer that is large enough to hold the whole output, instead of open coding it. Also calculate the requested size using the number of online cpus instead of possible cpus, since the size of the output only depends on the number of online cpus. Signed-off-by: Heiko Carstens Acked-by: David Rientjes Cc: Ian Kent Cc: Hendrik Brueckner Cc: Thorsten Diehl Cc: Andrea Righi Cc: Christoph Hellwig Cc: Al Viro Cc: Stefan Bader Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9d231e9e5f0e..bf2d03f8fd3e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_possible_cpus(); - char *buf; - struct seq_file *m; - int res; + size_t size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { -- cgit v1.2.3 From 058504edd02667eef8fac9be27ab3ea74332e9b4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Jul 2014 15:22:37 -0700 Subject: fs/seq_file: fallback to vmalloc allocation There are a couple of seq_files which use the single_open() interface. This interface requires that the whole output must fit into a single buffer. E.g. for /proc/stat allocation failures have been observed because an order-4 memory allocation failed due to memory fragmentation. In such situations reading /proc/stat is not possible anymore. Therefore change the seq_file code to fallback to vmalloc allocations which will usually result in a couple of order-0 allocations and hence also work if memory is fragmented. For reference a call trace where reading from /proc/stat failed: sadc: page allocation failure: order:4, mode:0x1040d0 CPU: 1 PID: 192063 Comm: sadc Not tainted 3.10.0-123.el7.s390x #1 [...] Call Trace: show_stack+0x6c/0xe8 warn_alloc_failed+0xd6/0x138 __alloc_pages_nodemask+0x9da/0xb68 __get_free_pages+0x2e/0x58 kmalloc_order_trace+0x44/0xc0 stat_open+0x5a/0xd8 proc_reg_open+0x8a/0x140 do_dentry_open+0x1bc/0x2c8 finish_open+0x46/0x60 do_last+0x382/0x10d0 path_openat+0xc8/0x4f8 do_filp_open+0x46/0xa8 do_sys_open+0x114/0x1f0 sysc_tracego+0x14/0x1a Signed-off-by: Heiko Carstens Tested-by: David Rientjes Cc: Ian Kent Cc: Hendrik Brueckner Cc: Thorsten Diehl Cc: Andrea Righi Cc: Christoph Hellwig Cc: Al Viro Cc: Stefan Bader Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 1d641bb108d2..3857b720cb1b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } @@ -135,9 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -192,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } @@ -232,9 +244,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; m->version = 0; @@ -350,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kfree(m->buf); + kvfree(m->buf); kfree(m); return 0; } @@ -605,13 +617,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = kmalloc(size, GFP_KERNEL); + char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kfree(buf); + kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; -- cgit v1.2.3 From 61c219f5814277ecb71d64cb30297028d6665979 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 5 Jul 2014 16:28:35 -0400 Subject: ext4: fix unjournalled bg descriptor while initializing inode bitmap The first time that we allocate from an uninitialized inode allocation bitmap, if the block allocation bitmap is also uninitalized, we need to get write access to the block group descriptor before we start modifying the block group descriptor flags and updating the free block count, etc. Otherwise, there is the potential of a bad journal checksum (if journal checksums are enabled), and of the file system becoming inconsistent if we crash at exactly the wrong time. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index a87455df38bc..0840bf321cdb 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -874,6 +874,13 @@ got: goto out; } + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -910,13 +917,6 @@ got: } } - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) { - ext4_std_error(sb, err); - goto out; - } - /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; -- cgit v1.2.3 From ae0f78de2c43b6fadd007c231a352b13b5be8ed2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 5 Jul 2014 18:40:52 -0400 Subject: ext4: clarify error count warning messages Make it clear that values printed are times, and that it is error since last fsck. Also add note about fsck version required. Signed-off-by: Pavel Machek Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org --- fs/ext4/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b9b9aabfb4d2..342394772b6c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2809,10 +2809,11 @@ static void print_daily_error_info(unsigned long arg) es = sbi->s_es; if (es->s_error_count) - ext4_msg(sb, KERN_NOTICE, "error count: %u", + /* fsck newer than v1.41.13 is needed to clean this condition. */ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, @@ -2826,7 +2827,7 @@ static void print_daily_error_info(unsigned long arg) printk("\n"); } if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, -- cgit v1.2.3 From 94d4c066a4ff170a2671b1a9b153febbf36796f6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 5 Jul 2014 19:15:50 -0400 Subject: ext4: clarify ext4_error message in ext4_mb_generate_buddy_error() We are spending a lot of time explaining to users what this error means. Let's try to improve the message to avoid this problem. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7f72f50a8fa7..2dcb936be90e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -752,8 +752,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd; " - "block bitmap corrupt.", + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor -- cgit v1.2.3 From 5dd214248f94d430d70e9230bda72f2654ac88a8 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 5 Jul 2014 19:18:22 -0400 Subject: ext4: disable synchronous transaction batching if max_batch_time==0 The mount manpage says of the max_batch_time option, This optimization can be turned off entirely by setting max_batch_time to 0. But the code doesn't do that. So fix the code to do that. Signed-off-by: Eric Sandeen Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 2 -- fs/jbd2/transaction.c | 5 ++++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 342394772b6c..6297c07c937f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1525,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, arg = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * arg; } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; sbi->s_max_batch_time = arg; } else if (token == Opt_min_batch_time) { sbi->s_min_batch_time = arg; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 38cfcf5f6fce..6f0f590cc5a3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1588,9 +1588,12 @@ int jbd2_journal_stop(handle_t *handle) * to perform a synchronous write. We do this to detect the * case where a single process is doing a stream of sync * writes. No point in waiting for joiners in that case. + * + * Setting max_batch_time to 0 disables this completely. */ pid = current->pid; - if (handle->h_sync && journal->j_last_sync_writer != pid) { + if (handle->h_sync && journal->j_last_sync_writer != pid && + journal->j_max_batch_time) { u64 commit_time, trans_time; journal->j_last_sync_writer = pid; -- cgit v1.2.3 From 126b9d4365b110c157bc4cbc32540dfa66c9c85a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Jul 2014 15:28:50 +0200 Subject: fuse: timeout comparison fix As suggested by checkpatch.pl, use time_before64() instead of direct comparison of jiffies64 values. Signed-off-by: Miklos Szeredi Cc: --- fs/fuse/dir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 42198359fa1b..225176203a8c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -198,7 +198,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) goto invalid; - else if (fuse_dentry_time(entry) < get_jiffies_64()) { + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64())) { int err; struct fuse_entry_out outarg; struct fuse_req *req; @@ -985,7 +985,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, int err; bool r; - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { r = true; err = fuse_do_getattr(inode, stat, file); } else { @@ -1171,7 +1171,7 @@ static int fuse_permission(struct inode *inode, int mask) ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); -- cgit v1.2.3 From 154210ccb3a871e631bf39fdeb7a8731d98af87b Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Thu, 26 Jun 2014 20:21:57 -0400 Subject: fuse: ignore entry-timeout on LOOKUP_REVAL The following test case demonstrates the bug: sh# mount -t glusterfs localhost:meta-test /mnt/one sh# mount -t glusterfs localhost:meta-test /mnt/two sh# echo stuff > /mnt/one/file; rm -f /mnt/two/file; echo stuff > /mnt/one/file bash: /mnt/one/file: Stale file handle sh# echo stuff > /mnt/one/file; rm -f /mnt/two/file; sleep 1; echo stuff > /mnt/one/file On the second open() on /mnt/one, FUSE would have used the old nodeid (file handle) trying to re-open it. Gluster is returning -ESTALE. The ESTALE propagates back to namei.c:filename_lookup() where lookup is re-attempted with LOOKUP_REVAL. The right behavior now, would be for FUSE to ignore the entry-timeout and and do the up-call revalidation. Instead FUSE is ignoring LOOKUP_REVAL, succeeding the revalidation (because entry-timeout has not passed), and open() is again retried on the old file handle and finally the ESTALE is going back to the application. Fix: if revalidation is happening with LOOKUP_REVAL, then ignore entry-timeout and always do the up-call. Signed-off-by: Anand Avati Reviewed-by: Niels de Vos Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 225176203a8c..202a9721be93 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -198,7 +198,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) goto invalid; - else if (time_before64(fuse_dentry_time(entry), get_jiffies_64())) { + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + (flags & LOOKUP_REVAL)) { int err; struct fuse_entry_out outarg; struct fuse_req *req; -- cgit v1.2.3 From 7b3d8bf7718a8de8ac6a25ed7332c1c129839400 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Thu, 26 Jun 2014 23:06:52 +0530 Subject: fuse: inode: drop cast This patch removes the cast on data of type void * as it is not needed. The following Coccinelle semantic patch was used for making the change: @r@ expression x; void* e; type T; identifier f; @@ ( *((T *)e) | ((T *)x)[...] | ((T *)x)->f | - (T *) e ) Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 754dcf23de8a..7b8fc7d33def 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1006,7 +1006,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); - if (!parse_fuse_opt((char *) data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev)) goto err; if (is_bdev) { -- cgit v1.2.3 From 233a01fa9c4c7c41238537e8db8434667ff28a2f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Jul 2014 15:28:51 +0200 Subject: fuse: handle large user and group ID If the number in "user_id=N" or "group_id=N" mount options was larger than INT_MAX then fuse returned EINVAL. Fix this to handle all valid uid/gid values. Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/inode.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7b8fc7d33def..8474028d7848 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -478,6 +478,17 @@ static const match_table_t tokens = { {OPT_ERR, NULL} }; +static int fuse_match_uint(substring_t *s, unsigned int *res) +{ + int err = -ENOMEM; + char *buf = match_strdup(s); + if (buf) { + err = kstrtouint(buf, 10, res); + kfree(buf); + } + return err; +} + static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) { char *p; @@ -488,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) while ((p = strsep(&opt, ",")) != NULL) { int token; int value; + unsigned uv; substring_t args[MAX_OPT_ARGS]; if (!*p) continue; @@ -511,18 +523,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_USER_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), value); + d->user_id = make_kuid(current_user_ns(), uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; break; case OPT_GROUP_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), value); + d->group_id = make_kgid(current_user_ns(), uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; -- cgit v1.2.3 From c55a01d360afafcd52bc405c044a6ebf5de436d5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Jul 2014 15:28:51 +0200 Subject: fuse: avoid scheduling while atomic As reported by Richard Sharpe, an attempt to use fuse_notify_inval_entry() triggers complains about scheduling while atomic: BUG: scheduling while atomic: fuse.hf/13976/0x10000001 This happens because fuse_notify_inval_entry() attempts to allocate memory with GFP_KERNEL, holding "struct fuse_copy_state" mapped by kmap_atomic(). Introduced by commit 58bda1da4b3c "fuse/dev: use atomic maps" Fix by moving the map/unmap to just cover the actual memcpy operation. Original patch from Maxim Patlasov Reported-by: Richard Sharpe Signed-off-by: Miklos Szeredi Cc: # v3.15+ --- fs/fuse/dev.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aac71ce373e4..75fa055012b2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -643,9 +643,8 @@ struct fuse_copy_state { unsigned long seglen; unsigned long addr; struct page *pg; - void *mapaddr; - void *buf; unsigned len; + unsigned offset; unsigned move_pages:1; }; @@ -666,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; - if (!cs->write) { - kunmap_atomic(cs->mapaddr); - } else { - kunmap_atomic(cs->mapaddr); + if (cs->write) buf->len = PAGE_SIZE - cs->len; - } cs->currbuf = NULL; - cs->mapaddr = NULL; - } else if (cs->mapaddr) { - kunmap_atomic(cs->mapaddr); + } else if (cs->pg) { if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); } put_page(cs->pg); - cs->mapaddr = NULL; } + cs->pg = NULL; } /* @@ -691,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) */ static int fuse_copy_fill(struct fuse_copy_state *cs) { - unsigned long offset; + struct page *page; int err; unlock_request(cs->fc, cs->req); @@ -706,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = kmap_atomic(buf->page); + cs->pg = buf->page; + cs->offset = buf->offset; cs->len = buf->len; - cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; cs->nr_segs--; } else { - struct page *page; - if (cs->nr_segs == cs->pipe->buffers) return -EIO; @@ -726,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap_atomic(page); - cs->buf = cs->mapaddr; + cs->pg = page; + cs->offset = 0; cs->len = PAGE_SIZE; cs->pipebufs++; cs->nr_segs++; @@ -740,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->iov++; cs->nr_segs--; } - err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + err = get_user_pages_fast(cs->addr, 1, cs->write, &page); if (err < 0) return err; BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->pg = page; + cs->offset = cs->addr % PAGE_SIZE; + cs->len = min(PAGE_SIZE - cs->offset, cs->seglen); cs->seglen -= cs->len; cs->addr += cs->len; } @@ -760,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { + void *pgaddr = kmap_atomic(cs->pg); + void *buf = pgaddr + cs->offset; + if (cs->write) - memcpy(cs->buf, *val, ncpy); + memcpy(buf, *val, ncpy); else - memcpy(*val, cs->buf, ncpy); + memcpy(*val, buf, ncpy); + + kunmap_atomic(pgaddr); *val += ncpy; } *size -= ncpy; cs->len -= ncpy; - cs->buf += ncpy; + cs->offset += ncpy; return ncpy; } @@ -874,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = kmap_atomic(buf->page); - cs->buf = cs->mapaddr + buf->offset; + cs->pg = buf->page; + cs->offset = buf->offset; err = lock_request(cs->fc, cs->req); if (err) -- cgit v1.2.3 From c3a4561796cffae6996264876ffca147b5c3709a Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sun, 6 Jul 2014 11:34:43 +0800 Subject: nfsd: Fix bad reserving space for encoding rdattr_error Introduced by commit 561f0ed498 (nfsd4: allow large readdirs). Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2fc7abebeb9b..b56b1cc02718 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2641,7 +2641,7 @@ nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { __be32 *p; - p = xdr_reserve_space(xdr, 6); + p = xdr_reserve_space(xdr, 20); if (!p) return NULL; *p++ = htonl(2); -- cgit v1.2.3 From 8bc6f60e3f7f31c4ce370b4b27b8f4b355b7f07e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 11 Jun 2014 18:32:23 +0800 Subject: f2fs: remove unused variables in f2fs_sm_info Remove unused variables in struct f2fs_sm_info. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/segment.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e51c732b0dd9..7ef7acdb9350 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -342,9 +342,6 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct list_head wblist_head; /* list of under-writeback pages */ - spinlock_t wblist_lock; /* lock for checkpoint */ - block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f25f0e07e26f..b22d5a0652bb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1885,8 +1885,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) /* init sm info */ sbi->sm_info = sm_info; - INIT_LIST_HEAD(&sm_info->wblist_head); - spin_lock_init(&sm_info->wblist_lock); sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); -- cgit v1.2.3 From d6b7d4b31dfd5a454a71c445b8086bc098237334 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 12 Jun 2014 13:23:41 +0800 Subject: f2fs: check lower bound nid value in check_nid_range This patch add lower bound verification for nid in check_nid_range, so nids reserved like 0, node, meta passed by caller could be checked there. And then check_nid_range could be used in f2fs_nfs_get_inode for simplifying code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/inode.c | 1 + fs/f2fs/super.c | 4 +--- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7ef7acdb9350..58df97e174d0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -641,7 +641,8 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) */ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - WARN_ON((nid >= NM_I(sbi)->max_nid)); + if (unlikely(nid < F2FS_ROOT_INO(sbi))) + return -EINVAL; if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index adc622c6bdce..2cf6962f6cc8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -78,6 +78,7 @@ static int do_read_inode(struct inode *inode) if (check_nid_range(sbi, inode->i_ino)) { f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", (unsigned long) inode->i_ino); + WARN_ON(1); return -EINVAL; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b2b18637cb9e..8f96d9372ade 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -689,9 +689,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (unlikely(ino < F2FS_ROOT_INO(sbi))) - return ERR_PTR(-ESTALE); - if (unlikely(ino >= NM_I(sbi)->max_nid)) + if (check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* -- cgit v1.2.3 From 90d72459ccb47335a4348947506fd091e63f7cf8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Jun 2014 10:34:00 +0800 Subject: f2fs: fix error path in init_inode_metadata If we fail in this path: ->init_inode_metadata ->make_empty_dir ->get_new_data_page ->grab_cache_page return -ENOMEM We will bug on in error path of init_inode_metadata when call remove_inode_page because i_block = 2 (one inode block will be released later & one dentry block). We should release the dentry block in init_inode_metadata to avoid this BUG_ON, and avoid leak of dentry block resource, because we never have second chance to release that block in ->evict_inode as in upper error path we make this inode 'bad'. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 966acb039e3b..a4addd72ebbd 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -376,11 +376,11 @@ static struct page *init_inode_metadata(struct inode *inode, put_error: f2fs_put_page(page, 1); +error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0); remove_dirty_dir_inode(inode); -error: remove_inode_page(inode); return ERR_PTR(err); } -- cgit v1.2.3 From dd4d961fe7d1cb1cba6e9d8d132475d4917ba864 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Jun 2014 14:13:13 +0800 Subject: f2fs: release new entry page correctly in error path of f2fs_rename This patch correct releasing code of new_page to avoid BUG_ON in error patch of f2fs_rename. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 9138c32aa698..aa3ddb228f98 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -474,7 +474,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; put_out_dir: - f2fs_put_page(new_page, 1); + kunmap(new_page); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) { kunmap(old_dir_page); -- cgit v1.2.3 From b2c0829912493df596706a1a036c67beb1bd6ff5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 30 Jun 2014 18:09:55 +0900 Subject: f2fs: do checkpoint for the renamed inode If an inode is renamed, it should be registered as file_lost_pino to conduct checkpoint at f2fs_sync_file. Otherwise, the inode cannot be recovered due to no dent_mark in the following scenario. Note that, this scenario is from xfstests/322. 1. create "a" 2. fsync "a" 3. rename "a" to "b" 4. fsync "b" 5. Sudden power-cut After recovery is done, "b" should be seen. However, the result shows "a", since the recovery procedure does not enter recover_dentry due to no dent_mark. The reason is like below. - The nid of "a" is checkpointed during #2, f2fs_sync_file. - The inode page for "b" produced by #3 is written without dent_mark by sync_node_pages. So, this patch fixes this bug by assinging file_lost_pino to the "a"'s inode. If the pino is lost, f2fs_sync_file conducts checkpoint, and then recovers the latest pino and its dentry information for further recovery. Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index aa3ddb228f98..a6bdddc33ce2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -417,9 +417,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); @@ -448,6 +445,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); @@ -457,9 +458,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); -- cgit v1.2.3 From 2743f865543c0c4a5e12fc13edb2bf89a6e9687c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 28 Jun 2014 01:00:41 +0900 Subject: f2fs: check bdi->dirty_exceeded when trying to skip data writes If we don't check the current backing device status, balance_dirty_pages can fall into infinite pausing routine. This can be occurred when a lot of directories make a small number of dirty dentry pages including files. Reported-by: Brian Chadwick Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9dfb9a042fd2..4b697ccc9b0c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -42,6 +42,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12; res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1); } -- cgit v1.2.3 From 50e1f8d22199b557337b3d1ec8520e4c5aa5c76e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Jul 2014 09:39:32 +0800 Subject: f2fs: avoid to access NULL pointer in issue_flush_thread Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=75861 Denis 2014-05-10 11:28:59 UTC reported: "F2FS-fs (mmcblk0p28): mounting.. Unable to handle kernel NULL pointer dereference at virtual address 00000018 ... [] (_raw_spin_lock+0x3c/0x70) from [] (issue_flush_thread+0x50/0x17c) [] (issue_flush_thread+0x50/0x17c) from [] (kthread+0x98/0xa4) [] (kthread+0x98/0xa4) from [] (kernel_thread_exit+0x0/0x8)" This patch assign cmd_control_info in sm_info before issue_flush_thread is being created, so this make sure that issue flush thread will have no chance to access invalid info in fcc. Signed-off-by: Chao Yu Reviewed-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b22d5a0652bb..d04613df710a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,14 +272,15 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; spin_lock_init(&fcc->issue_lock); init_waitqueue_head(&fcc->flush_wait_queue); + sbi->sm_info->cmd_control_info = fcc; fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; return err; } - sbi->sm_info->cmd_control_info = fcc; return err; } -- cgit v1.2.3 From 4237ba43b65aa989674c89fc4f2fe46eebc501ee Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Jul 2014 10:50:19 +0200 Subject: fuse: restructure ->rename2() Make ->rename2() universal, i.e. able to handle zero flags. This is to make future change of the API easier. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 202a9721be93..0c6048247a34 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -815,13 +815,6 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) -{ - return fuse_rename_common(olddir, oldent, newdir, newent, 0, - FUSE_RENAME, sizeof(struct fuse_rename_in)); -} - static int fuse_rename2(struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) @@ -832,17 +825,30 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent, if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; - if (fc->no_rename2 || fc->minor < 23) - return -EINVAL; + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; - err = fuse_rename_common(olddir, oldent, newdir, newent, flags, - FUSE_RENAME2, sizeof(struct fuse_rename2_in)); - if (err == -ENOSYS) { - fc->no_rename2 = 1; - err = -EINVAL; + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); } + return err; +} +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename2(olddir, oldent, newdir, newent, 0); } static int fuse_link(struct dentry *entry, struct inode *newdir, -- cgit v1.2.3 From f9ae9cf5d72b3926ca48ea60e15bdbb840f42372 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Jul 2014 13:55:40 -0400 Subject: ext4: revert commit which was causing fs corruption after journal replays Commit 007649375f6af2 ("ext4: initialize multi-block allocator before checking block descriptors") causes the block group descriptor's count of the number of free blocks to become inconsistent with the number of free blocks in the allocation bitmap. This is a harmless form of fs corruption, but it causes the kernel to potentially remount the file system read-only, or to panic, depending on the file systems's error behavior. Thanks to Eric Whitney for his tireless work to reproduce and to find the guilty commit. Fixes: 007649375f6af2 ("ext4: initialize multi-block allocator before checking block descriptors" Cc: stable@vger.kernel.org # 3.15 Reported-by: David Jander Reported-by: Matteo Croce Tested-by: Eric Whitney Suggested-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6297c07c937f..6df7bc611dbd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3879,38 +3879,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - - /* - * set up enough so that it can read an inode, - * and create new inode for buddy allocator - */ - sbi->s_gdb_count = db_count; - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); - goto failed_mount2; - } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2a; + goto failed_mount2; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2a; + goto failed_mount2; } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3945,6 +3926,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; + /* + * set up enough so that it can read an inode + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4134,13 +4123,21 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount5; + goto failed_mount4a; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); + goto failed_mount4a; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); goto failed_mount5; } @@ -4217,8 +4214,11 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_release_system_zone(sb); + ext4_mb_release(sb); failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4242,14 +4242,11 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); -failed_mount2a: - ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: - ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { -- cgit v1.2.3 From 3f1f9b851311a76226140b55b1ea22111234a7c2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Jul 2014 15:32:24 -0400 Subject: ext4: fix a potential deadlock in __ext4_es_shrink() This fixes the following lockdep complaint: [ INFO: possible circular locking dependency detected ] 3.16.0-rc2-mm1+ #7 Tainted: G O ------------------------------------------------------- kworker/u24:0/4356 is trying to acquire lock: (&(&sbi->s_es_lru_lock)->rlock){+.+.-.}, at: [] __ext4_es_shrink+0x4f/0x2e0 but task is already holding lock: (&ei->i_es_lock){++++-.}, at: [] ext4_es_insert_extent+0x71/0x180 which lock already depends on the new lock. Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&ei->i_es_lock); lock(&(&sbi->s_es_lru_lock)->rlock); lock(&ei->i_es_lock); lock(&(&sbi->s_es_lru_lock)->rlock); *** DEADLOCK *** 6 locks held by kworker/u24:0/4356: #0: ("writeback"){.+.+.+}, at: [] process_one_work+0x180/0x560 #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [] process_one_work+0x180/0x560 #2: (&type->s_umount_key#22){++++++}, at: [] grab_super_passive+0x44/0x90 #3: (jbd2_handle){+.+...}, at: [] start_this_handle+0x189/0x5f0 #4: (&ei->i_data_sem){++++..}, at: [] ext4_map_blocks+0x132/0x550 #5: (&ei->i_es_lock){++++-.}, at: [] ext4_es_insert_extent+0x71/0x180 stack backtrace: CPU: 0 PID: 4356 Comm: kworker/u24:0 Tainted: G O 3.16.0-rc2-mm1+ #7 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Workqueue: writeback bdi_writeback_workfn (flush-253:0) ffffffff8213dce0 ffff880014b07538 ffffffff815df0bb 0000000000000007 ffffffff8213e040 ffff880014b07588 ffffffff815db3dd ffff880014b07568 ffff880014b07610 ffff88003b868930 ffff88003b868908 ffff88003b868930 Call Trace: [] dump_stack+0x4e/0x68 [] print_circular_bug+0x1fb/0x20c [] __lock_acquire+0x163e/0x1d00 [] ? retint_restore_args+0xe/0xe [] ? __slab_alloc+0x4a8/0x4ce [] ? __ext4_es_shrink+0x4f/0x2e0 [] lock_acquire+0x87/0x120 [] ? __ext4_es_shrink+0x4f/0x2e0 [] ? ext4_es_free_extent+0x5d/0x70 [] _raw_spin_lock+0x39/0x50 [] ? __ext4_es_shrink+0x4f/0x2e0 [] ? kmem_cache_alloc+0x18b/0x1a0 [] __ext4_es_shrink+0x4f/0x2e0 [] ext4_es_insert_extent+0xc8/0x180 [] ext4_map_blocks+0x1c4/0x550 [] ext4_writepages+0x6d4/0xd00 ... Reported-by: Minchan Kim Signed-off-by: Theodore Ts'o Reported-by: Minchan Kim Cc: stable@vger.kernel.org Cc: Zheng Liu --- fs/ext4/extents_status.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3f5c188953a4..0b7e28e7eaa4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -966,10 +966,10 @@ retry: continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei) + if (ei->i_es_lru_nr == 0 || ei == locked_ei || + !write_trylock(&ei->i_es_lock)) continue; - write_lock(&ei->i_es_lock); shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); -- cgit v1.2.3 From bf40c92635d63fcc574c52649f7cda13e0418ac1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 12 Jul 2014 16:11:42 -0400 Subject: ext4: fix potential null pointer dereference in ext4_free_inode Fix potential null pointer dereferencing problem caused by e43bb4e612 ("ext4: decrement free clusters/inodes counters when block group declared bad") Reported-by: Dan Carpenter Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Theodore Ts'o Reviewed-by: Lukas Czerner --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0840bf321cdb..5b87fc36aab8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -338,7 +338,7 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, -- cgit v1.2.3 From 27f1b36326bc8b6911e7052bc4b80a10234f0ec5 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 10 Jul 2014 15:32:43 +0400 Subject: fuse: release temporary page if fuse_writepage_locked() failed tmp_page to be freed if fuse_write_file_get() returns NULL. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 96d513e01a5d..35b6f31ecc38 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1722,7 +1722,7 @@ static int fuse_writepage_locked(struct page *page) error = -EIO; req->ff = fuse_write_file_get(fc, fi); if (!req->ff) - goto err_free; + goto err_nofile; fuse_write_fill(req, req->ff, page_offset(page), 0); @@ -1750,6 +1750,8 @@ static int fuse_writepage_locked(struct page *page) return 0; +err_nofile: + __free_page(tmp_page); err_free: fuse_request_free(req); err: -- cgit v1.2.3 From f2b3455e47c72bef80fe584adb9bb9bc5afdd99c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 23 Jun 2014 18:35:15 +0200 Subject: fuse: replace count*size kzalloc by kcalloc kcalloc manages count*sizeof overflow. Signed-off-by: Fabian Frederick Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 35b6f31ecc38..1570dad1915d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1992,8 +1992,8 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kzalloc(sizeof(struct page *) * - FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) goto out; -- cgit v1.2.3 From 263782c1c95bbddbb022dc092fd89a36bb8d5577 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Mon, 14 Jul 2014 12:49:26 -0400 Subject: aio: protect reqs_available updates from changes in interrupt handlers As of commit f8567a3845ac05bb28f3c1b478ef752762bd39ef it is now possible to have put_reqs_available() called from irq context. While put_reqs_available() is per cpu, it did not protect itself from interrupts on the same CPU. This lead to aio_complete() corrupting the available io requests count when run under a heavy O_DIRECT workloads as reported by Robert Elliott. Fix this by disabling irq updates around the per cpu batch updates of reqs_available. Many thanks to Robert and folks for testing and tracking this down. Reported-by: Robert Elliot Tested-by: Robert Elliot Signed-off-by: Benjamin LaHaise Cc: Jens Axboe , Christoph Hellwig Cc: stable@vger.kenel.org --- fs/aio.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 955947ef3e02..1c9c5f0a9e2b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -830,16 +830,20 @@ void exit_aio(struct mm_struct *mm) static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } + local_irq_restore(flags); preempt_enable(); } @@ -847,10 +851,12 @@ static bool get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -869,6 +875,7 @@ static bool get_reqs_available(struct kioctx *ctx) ret = true; kcpu->reqs_available--; out: + local_irq_restore(flags); preempt_enable(); return ret; } -- cgit v1.2.3 From d68aab6b8f572406aa93b45ef6483934dd3b54a6 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Wed, 4 Jun 2014 12:22:13 +0800 Subject: quota: missing lock in dqcache_shrink_scan() Commit 1ab6c4997e04 (fs: convert fs shrinkers to new scan/count API) accidentally removed locking from quota shrinker. Fix it - dqcache_shrink_scan() should use dq_list_lock to protect the scan on free_dquots list. CC: stable@vger.kernel.org Fixes: 1ab6c4997e04a00c50c6d786c2f046adc0d1f5de Signed-off-by: Niu Yawei Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9cd5f63715c0..7f30bdc57d13 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -702,6 +702,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct dquot *dquot; unsigned long freed = 0; + spin_lock(&dq_list_lock); head = free_dquots.prev; while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); @@ -713,6 +714,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed++; head = free_dquots.prev; } + spin_unlock(&dq_list_lock); return freed; } -- cgit v1.2.3