From 547edce3ba234a5161c6e1f4389f4a0d2f8451aa Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 8 Nov 2016 11:30:58 +1100 Subject: ext4: tell DAX the size of allocation holes When DAX calls _ext4_get_block() and the file offset points to a hole we currently don't set bh->b_size. This is current worked around via buffer_size_valid() in fs/dax.c. _ext4_get_block() has the hole size information from ext4_map_blocks(), so populate bh->b_size so we can remove buffer_size_valid() in a later patch. Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Signed-off-by: Dave Chinner --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c064727ed62..3d58b2b477e8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -767,6 +767,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; + } else if (ret == 0) { + /* hole case, need to fill in bh->b_size */ + bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } -- cgit v1.2.3 From b50f7b268bad8088dfe7579a65cd910d8cc5c40f Mon Sep 17 00:00:00 2001 From: David Gstir Date: Sun, 13 Nov 2016 22:20:45 +0100 Subject: fscrypt: Allow fscrypt_decrypt_page() to function with non-writeback pages Some filesystem might pass pages which do not have page->mapping->host set to the encrypted inode. We want the caller to explicitly pass the corresponding inode. Signed-off-by: David Gstir Signed-off-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c064727ed62..4b7b842ec024 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1166,7 +1166,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = fscrypt_decrypt_page(page); + err = fscrypt_decrypt_page(page->mapping->host, page); return err; } #endif @@ -3743,7 +3743,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - WARN_ON_ONCE(fscrypt_decrypt_page(page)); + WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, + page)); } } if (ext4_should_journal_data(inode)) { -- cgit v1.2.3 From 7821d4dd4589ce5af54f3e46d04a29439ba3c2e5 Mon Sep 17 00:00:00 2001 From: David Gstir Date: Sun, 13 Nov 2016 22:20:46 +0100 Subject: fscrypt: Enable partial page encryption Not all filesystems work on full pages, thus we should allow them to hand partial pages to fscrypt for en/decryption. Signed-off-by: David Gstir Signed-off-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 ++++-- fs/ext4/page-io.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b7b842ec024..1d498c5e2990 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1166,7 +1166,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = fscrypt_decrypt_page(page->mapping->host, page); + err = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0); return err; } #endif @@ -3743,8 +3744,9 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); + BUG_ON(!PageLocked(page)); WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, - page)); + page, PAGE_SIZE, 0)); } } if (ext4_should_journal_data(inode)) { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0094923e5ebf..3d1d3d0f4303 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -470,7 +470,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = fscrypt_encrypt_page(inode, page, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { -- cgit v1.2.3 From 9c4bb8a3a9b4de21753053d667310c2b7cb39916 Mon Sep 17 00:00:00 2001 From: David Gstir Date: Sun, 13 Nov 2016 22:20:48 +0100 Subject: fscrypt: Let fs select encryption index/tweak Avoid re-use of page index as tweak for AES-XTS when multiple parts of same page are encrypted. This will happen on multiple (partial) calls of fscrypt_encrypt_page on same page. page->index is only valid for writeback pages. Signed-off-by: David Gstir Signed-off-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 ++-- fs/ext4/page-io.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1d498c5e2990..1485ac273bfb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1167,7 +1167,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, page_zero_new_buffers(page, from, to); else if (decrypt) err = fscrypt_decrypt_page(page->mapping->host, page, - PAGE_SIZE, 0); + PAGE_SIZE, 0, page->index); return err; } #endif @@ -3746,7 +3746,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, BUG_ON(blocksize != PAGE_SIZE); BUG_ON(!PageLocked(page)); WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, - page, PAGE_SIZE, 0)); + page, PAGE_SIZE, 0, page->index)); } } if (ext4_should_journal_data(inode)) { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3d1d3d0f4303..902a3e3059b3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -470,7 +470,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, + page->index, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { -- cgit v1.2.3 From 2c98eb5ea249767bbc11cf4e70e91d5b0458ed13 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 13 Nov 2016 22:02:26 -0500 Subject: ext4: allow ext4_truncate() to return an error This allows us to properly propagate errors back up to ext4_truncate()'s callers. This also means we no longer have to silently ignore some errors (e.g., when trying to add the inode to the orphan inode list). Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 41 ++++++++++++++++++++++++++--------------- fs/ext4/ioctl.c | 7 +++++-- fs/ext4/super.c | 6 ++++-- 4 files changed, 36 insertions(+), 20 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 282a51b07c57..be2282dcde7d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2491,7 +2491,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); -extern void ext4_truncate(struct inode *); +extern int ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 48fbdfc43c10..585153057c2a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -261,8 +261,15 @@ void ext4_evict_inode(struct inode *inode) "couldn't mark inode dirty (err %d)", err); goto stop_handle; } - if (inode->i_blocks) - ext4_truncate(inode); + if (inode->i_blocks) { + err = ext4_truncate(inode); + if (err) { + ext4_error(inode->i_sb, + "couldn't truncate inode %lu (err %d)", + inode->i_ino, err); + goto stop_handle; + } + } /* * ext4_ext_truncate() doesn't reserve any slop when it @@ -4097,10 +4104,11 @@ int ext4_inode_attach_jinode(struct inode *inode) * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ -void ext4_truncate(struct inode *inode) +int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; + int err = 0; handle_t *handle; struct address_space *mapping = inode->i_mapping; @@ -4114,7 +4122,7 @@ void ext4_truncate(struct inode *inode) trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) - return; + return 0; ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); @@ -4126,13 +4134,13 @@ void ext4_truncate(struct inode *inode) ext4_inline_data_truncate(inode, &has_inline); if (has_inline) - return; + return 0; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { if (ext4_inode_attach_jinode(inode) < 0) - return; + return 0; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4141,10 +4149,8 @@ void ext4_truncate(struct inode *inode) credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ext4_std_error(inode->i_sb, PTR_ERR(handle)); - return; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); @@ -4158,7 +4164,8 @@ void ext4_truncate(struct inode *inode) * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ - if (ext4_orphan_add(handle, inode)) + err = ext4_orphan_add(handle, inode); + if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); @@ -4191,6 +4198,7 @@ out_stop: ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); + return err; } /* @@ -5205,12 +5213,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); - if (shrink) - ext4_truncate(inode); + if (shrink) { + rc = ext4_truncate(inode); + if (rc) + error = rc; + } up_write(&EXT4_I(inode)->i_mmap_sem); } - if (!rc) { + if (!error) { setattr_copy(inode, attr); mark_inode_dirty(inode); } @@ -5222,7 +5233,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); - if (!rc && (ia_valid & ATTR_MODE)) + if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(inode, inode->i_mode); err_out: diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae8ebbc97..99862a3726fc 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -248,8 +248,11 @@ static int ext4_ioctl_setflags(struct inode *inode, err = -EOPNOTSUPP; goto flags_out; } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); + } else if (oldflags & EXT4_EOFBLOCKS_FL) { + err = ext4_truncate(inode); + if (err) + goto flags_out; + } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 20da99da0a34..e4f61c39328a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2330,7 +2330,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; - int nr_orphans = 0, nr_truncates = 0; + int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -2412,7 +2412,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); + ret = ext4_truncate(inode); + if (ret) + ext4_std_error(inode->i_sb, ret); inode_unlock(inode); nr_truncates++; } else { -- cgit v1.2.3 From d0abb36db44faaf8f8aa148ca206fe2404042dec Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 13 Nov 2016 22:02:28 -0500 Subject: ext4: allow ext4_ext_truncate() to return an error Return errors to the caller instead of declaring the file system corrupted. Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 15 +++++++-------- fs/ext4/inode.c | 4 +++- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index be2282dcde7d..54211c7876f8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3128,7 +3128,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c930a0110fb4..d3b119499c53 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4631,7 +4631,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(handle_t *handle, struct inode *inode) +int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; @@ -4645,7 +4645,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode) /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); + if (err) + return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -4657,12 +4659,9 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - if (err) { - ext4_std_error(inode->i_sb, err); - return; - } - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - ext4_std_error(inode->i_sb, err); + if (err) + return err; + return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 585153057c2a..79f46f4b43b8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4173,11 +4173,13 @@ int ext4_truncate(struct inode *inode) ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(handle, inode); + err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); + if (err) + goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); -- cgit v1.2.3 From 1566a48aaa10c6bb29b9a69dd8279f9a4fc41e35 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 13 Nov 2016 22:02:29 -0500 Subject: ext4: don't lock buffer in ext4_commit_super if holding spinlock If there is an error reported in mballoc via ext4_grp_locked_error(), the code is holding a spinlock, so ext4_commit_super() must not try to lock the buffer head, or else it will trigger a BUG: BUG: sleeping function called from invalid context at ./include/linux/buffer_head.h:358 in_atomic(): 1, irqs_disabled(): 0, pid: 993, name: mount CPU: 0 PID: 993 Comm: mount Not tainted 4.9.0-rc1-clouder1 #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 ffff880006423548 ffffffff81318c89 ffffffff819ecdd0 0000000000000166 ffff880006423558 ffffffff810810b0 ffff880006423580 ffffffff81081153 ffff880006e5a1a0 ffff88000690e400 0000000000000000 ffff8800064235c0 Call Trace: [] dump_stack+0x67/0x9e [] ___might_sleep+0xf0/0x140 [] __might_sleep+0x53/0xb0 [] ext4_commit_super+0x19c/0x290 [] __ext4_grp_locked_error+0x14a/0x230 [] ? __might_sleep+0x53/0xb0 [] ext4_mb_generate_buddy+0x1de/0x320 Since ext4_grp_locked_error() calls ext4_commit_super with sync == 0 (and it is the only caller which does so), avoid locking and unlocking the buffer in this case. This can result in races with ext4_commit_super() if there are other problems (which is what commit 4743f83990614 was trying to address), but a Warning is better than BUG. Fixes: 4743f83990614 Cc: stable@vger.kernel.org # 4.9 Reported-by: Nikolay Borisov Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e4f61c39328a..ff6f3ab09c7e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4537,7 +4537,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); - lock_buffer(sbh); + if (sync) + lock_buffer(sbh); if (buffer_write_io_error(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -4553,8 +4554,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) set_buffer_uptodate(sbh); } mark_buffer_dirty(sbh); - unlock_buffer(sbh); if (sync) { + unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); if (error) -- cgit v1.2.3 From 69e43e8cc971a79dd1ee5d4343d8e63f82725123 Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 14 Nov 2016 21:04:37 -0500 Subject: ext4: fix mballoc breakage with 64k block size 'border' variable is set to a value of 2 times the block size of the underlying filesystem. With 64k block size, the resulting value won't fit into a 16-bit variable. Hence this commit changes the data type of 'border' to 'unsigned int'. Fixes: c9de560ded61f Signed-off-by: Chandan Rajendra Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f418f55c2bbe..a937ac7ef99f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; - unsigned short border; + unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); -- cgit v1.2.3 From 30a9d7afe70ed6bd9191d3000e2ef1a34fb58493 Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 14 Nov 2016 21:26:26 -0500 Subject: ext4: fix stack memory corruption with 64k block size The number of 'counters' elements needed in 'struct sg' is super_block->s_blocksize_bits + 2. Presently we have 16 'counters' elements in the array. This is insufficient for block sizes >= 32k. In such cases the memcpy operation performed in ext4_mb_seq_groups_show() would cause stack memory corruption. Fixes: c9de560ded61f Signed-off-by: Chandan Rajendra Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a937ac7ef99f..7ae43c59bc79 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[16]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; -- cgit v1.2.3 From eeca7ea1baa939c97d58ba821f8c6e683e4388f2 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Mon, 14 Nov 2016 21:40:10 -0500 Subject: ext4: use current_time() for inode timestamps CURRENT_TIME_SEC and CURRENT_TIME are not y2038 safe. current_time() will be transitioned to be y2038 safe along with vfs. current_time() returns timestamps according to the granularities set in the super_block. The granularity check in ext4_current_time() to call current_time() or CURRENT_TIME_SEC is not required. Use current_time() directly to obtain timestamps unconditionally, and remove ext4_current_time(). Quota files are assumed to be on the same filesystem. Hence, use current_time() for these files as well. Signed-off-by: Deepa Dinamani Signed-off-by: Theodore Ts'o Reviewed-by: Arnd Bergmann --- fs/ext4/acl.c | 2 +- fs/ext4/ext4.h | 6 ------ fs/ext4/extents.c | 10 +++++----- fs/ext4/ialloc.c | 2 +- fs/ext4/inline.c | 4 ++-- fs/ext4/inode.c | 6 +++--- fs/ext4/ioctl.c | 8 ++++---- fs/ext4/namei.c | 24 +++++++++++++----------- fs/ext4/super.c | 2 +- fs/ext4/xattr.c | 2 +- 10 files changed, 31 insertions(+), 35 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dfa519979038..fd389935ecd1 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -196,7 +196,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 54211c7876f8..53d6d463ac4d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1532,12 +1532,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } -static inline struct timespec ext4_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d3b119499c53..65dbd2bcf775 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4724,7 +4724,7 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (new_size) { if (epos > new_size) epos = new_size; @@ -4852,7 +4852,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); @@ -4877,7 +4877,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_dio; } - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); if (new_size) { ext4_update_inode_size(inode, new_size); } else { @@ -5567,7 +5567,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -5677,7 +5677,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 170421edfdfe..088afe07ddda 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1039,7 +1039,7 @@ got: /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - ext4_current_time(inode); + current_time(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f74d5ee2cdec..cfa87bd49c5a 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1028,7 +1028,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -1971,7 +1971,7 @@ out: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 79f46f4b43b8..e821ee28fa33 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4039,7 +4039,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); @@ -4195,7 +4195,7 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); @@ -5170,7 +5170,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * update c/mtime in shrink case below */ if (!shrink) { - inode->i_mtime = ext4_current_time(inode); + inode->i_mtime = current_time(inode); inode->i_ctime = inode->i_mtime; } down_write(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 99862a3726fc..fc1cd37ba2d9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -153,7 +153,7 @@ static long swap_inode_boot_loader(struct super_block *sb, swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + inode->i_ctime = inode_bl->i_ctime = current_time(inode); spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -275,7 +275,7 @@ static int ext4_ioctl_setflags(struct inode *inode, } ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -371,7 +371,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) @@ -503,7 +503,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 104f8bfba718..eadba919f26b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1941,7 +1941,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -2987,7 +2987,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); @@ -3050,13 +3050,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; - dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); end_unlink: @@ -3254,7 +3254,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_inc_count(handle, inode); ihold(inode); @@ -3381,7 +3381,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, ent->de->file_type = file_type; ent->dir->i_version++; ent->dir->i_ctime = ent->dir->i_mtime = - ext4_current_time(ent->dir); + current_time(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3651,7 +3651,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = ext4_current_time(old.inode); + old.inode->i_ctime = current_time(old.inode); ext4_mark_inode_dirty(handle, old.inode); if (!whiteout) { @@ -3663,9 +3663,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new.inode) { ext4_dec_count(handle, new.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + new.inode->i_ctime = current_time(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -3723,6 +3723,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; + struct timespec ctime; if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && @@ -3823,8 +3824,9 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = ext4_current_time(old.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + ctime = current_time(old.inode); + old.inode->i_ctime = ctime; + new.inode->i_ctime = ctime; ext4_mark_inode_dirty(handle, old.inode); ext4_mark_inode_dirty(handle, new.inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ff6f3ab09c7e..35ccbdc2d64e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5354,7 +5354,7 @@ static int ext4_quota_off(struct super_block *sb, int type) handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto out; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d77be9e9f535..9c1bb5cd89d2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1249,7 +1249,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (!value) ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); -- cgit v1.2.3 From 88e0387769c4bc36c93d25329f6151fb3d6474b2 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 14 Nov 2016 21:48:35 -0500 Subject: ext4: allow inode expansion for nojournal file systems Runs of xfstest ext4/022 on nojournal file systems result in failures because the inodes of some of its test files do not expand as expected. The cause is a conditional in ext4_mark_inode_dirty() that prevents inode expansion unless the test file system has a journal. Remove this unnecessary restriction. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e821ee28fa33..b1b4c85bbae3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5474,18 +5474,20 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) return err; - if (ext4_handle_valid(handle) && - EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* - * We need extra buffer credits since we may write into EA block + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ - if ((jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + if (!ext4_handle_valid(handle) || + jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { ret = ext4_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); -- cgit v1.2.3 From d5c8dab6a8a1e328b976140ee7dc8e66957aaf61 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 14 Nov 2016 21:56:48 -0500 Subject: ext4: remove parameter from ext4_xattr_ibody_set() The parameter "handle" isn't used. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 9c1bb5cd89d2..1846e9168f80 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1109,7 +1109,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, +static int ext4_xattr_ibody_set(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1216,7 +1216,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1227,7 +1227,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1242,8 +1242,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; if (!is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(handle, inode, &i, - &is); + error = ext4_xattr_ibody_set(inode, &i, &is); } } } @@ -1384,7 +1383,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); + error = ext4_xattr_ibody_set(inode, &i, is); if (error) goto out; -- cgit v1.2.3 From 9e47a4c9fc58032ee135bf76516809c7624b1551 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Nov 2016 13:00:24 -0500 Subject: ext4: sanity check the block and cluster size at mount time If the block size or cluster size is insane, reject the mount. This is important for security reasons (although we shouldn't be just depending on this check). Ref: http://www.securityfocus.com/archive/1/539661 Ref: https://bugzilla.redhat.com/show_bug.cgi?id=1332506 Reported-by: Borislav Petkov Reported-by: Nikolay Borisov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 53d6d463ac4d..bdf1e5ee8642 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -235,6 +235,7 @@ struct ext4_io_submit { #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 #define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 35ccbdc2d64e..0f9ae4ce33d6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3567,7 +3567,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d", blocksize); + "Unsupported filesystem blocksize %d (%d log_block_size)", + blocksize, le32_to_cpu(es->s_log_block_size)); + goto failed_mount; + } + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); goto failed_mount; } @@ -3699,6 +3707,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "block size (%d)", clustersize, blocksize); goto failed_mount; } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto failed_mount; + } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); sbi->s_clusters_per_group = -- cgit v1.2.3 From 5aee0f8a3f42c94c5012f1673420aee96315925a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Nov 2016 13:24:26 -0500 Subject: ext4: fix in-superblock mount options processing Fix a large number of problems with how we handle mount options in the superblock. For one, if the string in the superblock is long enough that it is not null terminated, we could run off the end of the string and try to interpret superblocks fields as characters. It's unlikely this will cause a security problem, but it could result in an invalid parse. Also, parse_options is destructive to the string, so in some cases if there is a comma-separated string, it would be modified in the superblock. (Fortunately it only happens on file systems with a 1k block size.) Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0f9ae4ce33d6..404e6f3c1bed 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3303,7 +3303,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); ext4_fsblk_t block; ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; @@ -3322,16 +3322,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out_free_orig; + if ((data && !orig_data) || !sbi) + goto out_free_base; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - goto out_free_orig; - } + if (!sbi->s_blockgroup_lock) + goto out_free_base; + sb->s_fs_info = sbi; sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; @@ -3477,11 +3475,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - sbi->s_es->s_mount_opts); + if (sbi->s_es->s_mount_opts[0]) { + char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, + sizeof(sbi->s_es->s_mount_opts), + GFP_KERNEL); + if (!s_mount_opts) + goto failed_mount; + if (!parse_options(s_mount_opts, sb, &journal_devnum, + &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); + } + kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, @@ -4162,7 +4168,9 @@ no_journal: if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + "Opts: %.*s%s%s", descr, + (int) sizeof(sbi->s_es->s_mount_opts), + sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); if (es->s_error_count) @@ -4241,8 +4249,8 @@ failed_mount: out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); +out_free_base: kfree(sbi); -out_free_orig: kfree(orig_data); return err ? err : ret; } -- cgit v1.2.3 From cd6bb35bf7f6d7d922509bf50265383a0ceabe96 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Nov 2016 13:28:30 -0500 Subject: ext4: use more strict checks for inodes_per_block on mount Centralize the checks for inodes_per_block and be more strict to make sure the inodes_per_block_group can't end up being zero. Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org --- fs/ext4/super.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 404e6f3c1bed..689c02df1af4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3668,12 +3668,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext4; sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); @@ -3756,13 +3760,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_cluster_ratio = clustersize / blocksize; - if (sbi->s_inodes_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); -- cgit v1.2.3 From c48ae41bafe31e9a66d8be2ced4e42a6b57fa814 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Nov 2016 13:37:47 -0500 Subject: ext4: add sanity checking to count_overhead() The commit "ext4: sanity check the block and cluster size at mount time" should prevent any problems, but in case the superblock is modified while the file system is mounted, add an extra safety check to make sure we won't overrun the allocated buffer. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 689c02df1af4..2d8a49d74f56 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3195,10 +3195,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_set_bit(s++, buf); count++; } - for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { - ext4_set_bit(EXT4_B2C(sbi, s++), buf); - count++; + j = ext4_bg_num_gdb(sb, grp); + if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_error(sb, "Invalid number of block group " + "descriptor blocks: %d", j); + j = EXT4_BLOCKS_PER_GROUP(sb) - s; } + count += j; + for (; j > 0; j--) + ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; -- cgit v1.2.3 From 213bcd9ccbf04b709e4764ad89aaaa66a47785f0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Nov 2016 17:29:51 -0500 Subject: ext4: factor out checks from ext4_file_write_iter() Factor out checks of 'from' and whether we are overwriting out of ext4_file_write_iter() so that the function is easier to follow. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 97 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 47 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2a822d30e73f..9facb4dc5c70 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -88,6 +88,51 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) return 0; } +/* Is IO overwriting allocated and initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) +{ + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, blklen; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = pos >> blkbits; + map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of the blocks have been preallocated, + * regardless of whether they have been initialized or not. To exclude + * unwritten extents, we need to check m_flags. + */ + return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + return ret; + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) + return -EFBIG; + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); + } + return iov_iter_count(from); +} + static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -98,7 +143,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; inode_lock(inode); - ret = generic_write_checks(iocb, from); + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -114,53 +159,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { - ret = -EFBIG; - goto out; - } - iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); - } - iocb->private = &overwrite; - if (o_direct) { - size_t length = iov_iter_count(from); - loff_t pos = iocb->ki_pos; - - /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - pos + length <= i_size_read(inode)) { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; - int err, len; - - map.m_lblk = pos >> blkbits; - map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); - len = map.m_len; - - err = ext4_map_blocks(NULL, inode, &map, 0); - /* - * 'err==len' means that all of blocks has - * been preallocated no matter they are - * initialized or not. For excluding - * unwritten extents, we need to check - * m_flags. There are two conditions that - * indicate for initialized extents. 1) If we - * hit extent cache, EXT4_MAP_MAPPED flag is - * returned; 2) If we do a real lookup, - * non-flags are returned. So we should check - * these two conditions. - */ - if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) - overwrite = 1; - } - } + /* Check whether we do a DIO overwrite or not */ + if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && + ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) + overwrite = 1; ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); -- cgit v1.2.3 From a3caa24b703794507bf2e0a68bdc800b90f5e70b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Nov 2016 17:32:59 -0500 Subject: ext4: only set S_DAX if DAX is really supported Currently we have S_DAX set inode->i_flags for a regular file whenever ext4 is mounted with dax mount option. However in some cases we cannot really do DAX - e.g. when inode is marked to use data journalling, when inode data is being encrypted, or when inode is stored inline. Make sure S_DAX flag is appropriately set/cleared in these cases. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 10 ++++++++++ fs/ext4/inode.c | 9 ++++++++- fs/ext4/super.c | 6 ++++++ 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cfa87bd49c5a..9b67f75bdcf7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle, EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get cleared + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -442,6 +447,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get set. + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b1b4c85bbae3..733fda1c3646 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4368,7 +4368,9 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && + !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && + !ext4_encrypted_inode(inode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); @@ -5641,6 +5643,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); + /* + * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. + * E.g. S_DAX may get cleared / set. + */ + ext4_set_inode_flags(inode); jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2d8a49d74f56..cbfaee175fa5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1126,6 +1126,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - e.g. S_DAX may get disabled + */ + ext4_set_inode_flags(inode); } return res; } @@ -1140,6 +1144,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* Update inode->i_flags - e.g. S_DAX may get disabled */ + ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); -- cgit v1.2.3 From 364443cbcfe70f927b6a0dc0d410b4d4318bc1ca Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Nov 2016 17:36:06 -0500 Subject: ext4: convert DAX reads to iomap infrastructure Implement basic iomap_begin function that handles reading and use it for DAX reads. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/file.c | 38 +++++++++++++++++++++++++++++++++++++- fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bdf1e5ee8642..da82de650350 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3266,6 +3266,8 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); } +extern struct iomap_ops ext4_iomap_ops; + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 9facb4dc5c70..1f25c644cb12 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -31,6 +31,42 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + inode_lock_shared(inode); + /* + * Recheck under inode lock - at this point we are sure it cannot + * change anymore + */ + if (!IS_DAX(inode)) { + inode_unlock_shared(inode); + /* Fallback to buffered IO in case we cannot support DAX */ + return generic_file_read_iter(iocb, to); + } + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} +#endif + +static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + if (!iov_iter_count(to)) + return 0; /* skip atime */ + +#ifdef CONFIG_FS_DAX + if (IS_DAX(file_inode(iocb->ki_filp))) + return ext4_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release @@ -690,7 +726,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read_iter = generic_file_read_iter, + .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 733fda1c3646..5e0526fed165 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -3318,6 +3319,59 @@ int ext4_dax_get_block(struct inode *inode, sector_t iblock, clear_buffer_new(bh_result); return 0; } + +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) +{ + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long last_block = (offset + length - 1) >> blkbits; + struct ext4_map_blocks map; + int ret; + + if (flags & IOMAP_WRITE) + return -EIO; + + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; + + map.m_lblk = first_block; + map.m_len = last_block - first_block + 1; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = (u64)map.m_len << blkbits; + } else { + if (map.m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + } else { + WARN_ON_ONCE(1); + return -EIO; + } + iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); + iomap->length = (u64)map.m_len << blkbits; + } + + if (map.m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + return 0; +} + +struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, +}; + #else /* Just define empty function, it will never get called. */ int ext4_dax_get_block(struct inode *inode, sector_t iblock, -- cgit v1.2.3 From 47e6935136b1f9fbda59cd929409f8e7cee4a1e4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Nov 2016 18:08:05 -0500 Subject: ext4: use iomap for zeroing blocks in DAX mode Use iomap infrastructure for zeroing blocks when in DAX mode. ext4_iomap_begin() handles read requests just fine and that's all that is needed for iomap_zero_range(). Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5e0526fed165..6d186ca2c34b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3859,8 +3859,10 @@ static int ext4_block_zero_page_range(handle_t *handle, if (length > max || length < 0) length = max; - if (IS_DAX(inode)) - return dax_zero_page_range(inode, from, length, ext4_get_block); + if (IS_DAX(inode)) { + return iomap_zero_range(inode, from, length, NULL, + &ext4_iomap_ops); + } return __ext4_block_zero_page_range(handle, mapping, from, length); } -- cgit v1.2.3 From 776722e85d3b0936253ecc3d14db4fba37f191ba Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Nov 2016 18:09:11 -0500 Subject: ext4: DAX iomap write support Implement DAX writes using the new iomap infrastructure instead of overloading the direct IO path. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 40 ++++++++++++++++++ fs/ext4/inode.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 160 insertions(+), 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1f25c644cb12..1953fe34f9fe 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -169,6 +169,41 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return iov_iter_count(from); } +#ifdef CONFIG_FS_DAX +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + bool overwrite = false; + + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + ret = file_update_time(iocb->ki_filp); + if (ret) + goto out; + + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + overwrite = true; + downgrade_write(&inode->i_rwsem); + } + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); +out: + if (!overwrite) + inode_unlock(inode); + else + inode_unlock_shared(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +#endif + static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -178,6 +213,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int overwrite = 0; ssize_t ret; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6d186ca2c34b..3941cee21e4c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3329,18 +3329,79 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct ext4_map_blocks map; int ret; - if (flags & IOMAP_WRITE) - return -EIO; - if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; map.m_lblk = first_block; map.m_len = last_block - first_block + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; + if (!(flags & IOMAP_WRITE)) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + } else { + int dio_credits; + handle_t *handle; + int retries = 0; + + /* Trim mapping request to maximum we can map at once for DIO */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); +retry: + /* + * Either we allocate blocks and then we don't get unwritten + * extent so we have reserved enough credits, or the blocks + * are already allocated and unwritten and in that case + * extent conversion fits in the credits as well. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) { + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return ret; + } + /* For DAX writes we need to zero out unwritten extents */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + /* + * We are protected by i_mmap_sem or i_rwsem so we know + * block cannot go away from under us even though we + * dropped i_data_sem. Convert extent to written and + * write zeros there. + */ + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CONVERT | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) { + ext4_journal_stop(handle); + return ret; + } + } + + /* + * If we added blocks beyond i_size we need to make sure they + * will get truncated if we crash before updating i_size in + * ext4_iomap_end(). + */ + if (first_block + map.m_len > + (inode->i_size + (1 << blkbits) - 1) >> blkbits) { + int err; + + err = ext4_orphan_add(handle, inode); + if (err < 0) { + ext4_journal_stop(handle); + return err; + } + } + ext4_journal_stop(handle); + } iomap->flags = 0; iomap->bdev = inode->i_sb->s_bdev; @@ -3368,8 +3429,61 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return 0; } +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ + int ret = 0; + handle_t *handle; + int blkbits = inode->i_blkbits; + bool truncate = false; + + if (!(flags & IOMAP_WRITE)) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto orphan_del; + } + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + if (iomap->offset + iomap->length > + ALIGN(inode->i_size, 1 << blkbits)) { + ext4_lblk_t written_blk, end_blk; + + written_blk = (offset + written) >> blkbits; + end_blk = (offset + length) >> blkbits; + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + } + /* + * Remove inode from orphan list if we were extending a inode and + * everything went fine. + */ + if (!truncate && inode->i_nlink && + !list_empty(&EXT4_I(inode)->i_orphan)) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + if (truncate) { + ext4_truncate_failed_write(inode); +orphan_del: + /* + * If truncate failed early the inode might still be on the + * orphan list; we need to make sure the inode is removed from + * the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret; +} + struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, }; #else -- cgit v1.2.3 From 96f8ba3dd632aff684cc7c67d9f4af435be0341c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Nov 2016 18:10:09 -0500 Subject: ext4: avoid split extents for DAX writes Currently mapping of blocks for DAX writes happen with EXT4_GET_BLOCKS_PRE_IO flag set. That has a result that each ext4_map_blocks() call creates a separate written extent, although it could be merged to the neighboring extents in the extent tree. The reason for using this flag is that in case the extent is unwritten, we need to convert it to written one and zero it out. However this "convert mapped range to written" operation is already implemented by ext4_map_blocks() for the case of data writes into unwritten extent. So just use flags for that mode of operation, simplify the code, and avoid unnecessary split extents. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3941cee21e4c..29237f25ddbe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3359,7 +3359,6 @@ retry: return PTR_ERR(handle); ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO); if (ret < 0) { ext4_journal_stop(handle); @@ -3368,22 +3367,6 @@ retry: goto retry; return ret; } - /* For DAX writes we need to zero out unwritten extents */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - /* - * We are protected by i_mmap_sem or i_rwsem so we know - * block cannot go away from under us even though we - * dropped i_data_sem. Convert extent to written and - * write zeros there. - */ - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CONVERT | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) { - ext4_journal_stop(handle); - return ret; - } - } /* * If we added blocks beyond i_size we need to make sure they -- cgit v1.2.3 From e2ae766c1b030271b5099b25674e2131d1d1e8c1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Nov 2016 18:51:24 -0500 Subject: ext4: convert DAX faults to iomap infrastructure Convert DAX faults to use iomap infrastructure. We would not have to start transaction in ext4_dax_fault() anymore since ext4_iomap_begin takes care of that but so far we do that to avoid lock inversion of transaction start with DAX entry lock which gets acquired in dax_iomap_fault() before calling ->iomap_begin handler. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 9 +++++---- fs/ext4/inode.c | 14 +++++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1953fe34f9fe..b5f184493c57 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -275,7 +275,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = dax_fault(vma, vmf, ext4_dax_get_block); + result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); if (write) { if (!IS_ERR(handle)) @@ -309,9 +309,10 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; - else - result = dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block); + else { + result = dax_iomap_pmd_fault(vma, addr, pmd, flags, + &ext4_iomap_ops); + } if (write) { if (!IS_ERR(handle)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 29237f25ddbe..9de9a5a5d2a4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3369,12 +3369,16 @@ retry: } /* - * If we added blocks beyond i_size we need to make sure they + * If we added blocks beyond i_size, we need to make sure they * will get truncated if we crash before updating i_size in - * ext4_iomap_end(). + * ext4_iomap_end(). For faults we don't need to do that (and + * even cannot because for orphan list operations inode_lock is + * required) - if we happen to instantiate block beyond i_size, + * it is because we race with truncate which has already added + * the inode to the orphan list. */ - if (first_block + map.m_len > - (inode->i_size + (1 << blkbits) - 1) >> blkbits) { + if (!(flags & IOMAP_FAULT) && first_block + map.m_len > + (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { int err; err = ext4_orphan_add(handle, inode); @@ -3420,7 +3424,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; - if (!(flags & IOMAP_WRITE)) + if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); -- cgit v1.2.3 From 0bd2d5ec3d7655a849928f04597a0ceea0329176 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Nov 2016 18:53:30 -0500 Subject: ext4: rip out DAX handling from direct IO path Reads and writes for DAX inodes should no longer end up in direct IO code. Rip out the support and add a warning. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- fs/ext4/inode.c | 97 +++++++++------------------------------------------------ 2 files changed, 15 insertions(+), 84 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index da82de650350..6673e88011ec 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2452,8 +2452,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9de9a5a5d2a4..861f848159e8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3280,46 +3280,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -/* - * Get block function for DAX IO and mmap faults. It takes care of converting - * unwritten extents to written ones and initializes new / converted blocks - * to zeros. - */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); - if (!create) - return _ext4_get_block(inode, iblock, bh_result, 0); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) - return ret; - - if (buffer_unwritten(bh_result)) { - /* - * We are protected by i_mmap_sem or i_mutex so we know block - * cannot go away from under us even though we dropped - * i_data_sem. Convert extent to written and write zeros there. - */ - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_CONVERT | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) - return ret; - } - /* - * At least for now we have to clear BH_New so that DAX code - * doesn't attempt to zero blocks again in a racy way. - */ - clear_buffer_new(bh_result); - return 0; -} - static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3473,14 +3433,6 @@ struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; -#else -/* Just define empty function, it will never get called. */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - BUG(); - return 0; -} #endif static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3602,19 +3554,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (IS_DAX(inode)) { - /* - * We can avoid zeroing for aligned DAX writes beyond EOF. Other - * writes need zeroing either because they can race with page - * faults or because they use partial blocks. - */ - if (round_down(offset, 1<i_blkbits) >= inode->i_size && - ext4_aligned_io(inode, offset, count)) - get_block_func = ext4_dio_get_block; - else - get_block_func = ext4_dax_get_block; - dio_flags = DIO_LOCKING; - } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; @@ -3628,14 +3568,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) #ifdef CONFIG_EXT4_FS_ENCRYPTION BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, get_block_func, - ext4_end_io_dio, dio_flags); - } else - ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, - get_block_func, - ext4_end_io_dio, NULL, dio_flags); + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + get_block_func, ext4_end_io_dio, NULL, + dio_flags); if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { @@ -3704,6 +3639,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3712,19 +3648,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) * we are protected against page writeback as well. */ inode_lock_shared(inode); - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); - } else { - size_t count = iov_iter_count(iter); - - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, - NULL, NULL, 0); - } + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, ext4_dio_get_block, NULL, NULL, 0); out_unlock: inode_unlock_shared(inode); return ret; @@ -3753,6 +3682,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ext4_has_inline_data(inode)) return 0; + /* DAX uses iomap path now */ + if (WARN_ON_ONCE(IS_DAX(inode))) + return 0; + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == READ) ret = ext4_direct_IO_read(iocb, iter); -- cgit v1.2.3 From d086630e193b5837aa3432a65cda5751aa11b425 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 21 Nov 2016 11:51:44 -0500 Subject: ext4: remove unused function ext4_aligned_io() The last user of ext4_aligned_io() was the DAX path in ext4_direct_IO_write(). This usage was removed by Jan Kara's patch entitled "ext4: Rip out DAX handling from direct IO path". Signed-off-by: Ross Zwisler Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6673e88011ec..aff204f040fc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3257,13 +3257,6 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } -static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) -{ - int blksize = 1 << inode->i_blkbits; - - return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); -} - extern struct iomap_ops ext4_iomap_ops; #endif /* __KERNEL__ */ -- cgit v1.2.3 From 2f8f5e76c7da787153f3b9791a2cb4fb384e0e67 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 21 Nov 2016 11:52:44 -0500 Subject: ext4: avoid lockdep warning when inheriting encryption context On a lockdep-enabled kernel, xfstests generic/027 fails due to a lockdep warning when run on ext4 mounted with -o test_dummy_encryption: xfs_io/4594 is trying to acquire lock: (jbd2_handle ){++++.+}, at: [] jbd2_log_wait_commit+0x5/0x11b but task is already holding lock: (jbd2_handle ){++++.+}, at: [] start_this_handle+0x354/0x3d8 The abbreviated call stack is: [] ? jbd2_log_wait_commit+0x5/0x11b [] jbd2_log_wait_commit+0x40/0x11b [] ? jbd2_log_wait_commit+0x5/0x11b [] ? __jbd2_journal_force_commit+0x76/0xa6 [] __jbd2_journal_force_commit+0x91/0xa6 [] jbd2_journal_force_commit_nested+0xe/0x18 [] ext4_should_retry_alloc+0x72/0x79 [] ext4_xattr_set+0xef/0x11f [] ext4_set_context+0x3a/0x16b [] fscrypt_inherit_context+0xe3/0x103 [] __ext4_new_inode+0x12dc/0x153a [] ext4_create+0xb7/0x161 When a file is created in an encrypted directory, ext4_set_context() is called to set an encryption context on the new file. This calls ext4_xattr_set(), which contains a retry loop where the journal is forced to commit if an ENOSPC error is encountered. If the task actually were to wait for the journal to commit in this case, then it would deadlock because a handle remains open from __ext4_new_inode(), so the running transaction can't be committed yet. Fortunately, __jbd2_journal_force_commit() avoids the deadlock by not allowing the running transaction to be committed while the current task has it open. However, the above lockdep warning is still triggered. This was a false positive which was introduced by: 1eaa566d368b: jbd2: track more dependencies on transaction commit Fix the problem by passing the handle through the 'fs_data' argument to ext4_set_context(), then using ext4_xattr_set_handle() instead of ext4_xattr_set(). And in the case where no journal handle is specified and ext4_set_context() has to open one, add an ENOSPC retry loop since in that case it is the outermost transaction. Signed-off-by: Eric Biggers --- fs/ext4/ialloc.c | 3 +-- fs/ext4/super.c | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 12 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 088afe07ddda..e57e8d90ea54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1115,8 +1115,7 @@ got: } if (encrypt) { - /* give pointer to avoid set_context with journal ops. */ - err = fscrypt_inherit_context(dir, inode, &encrypt, true); + err = fscrypt_inherit_context(dir, inode, handle, true); if (err) goto fail_free_drop; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cbfaee175fa5..bb3a8edc75db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1114,14 +1114,22 @@ static int ext4_prepare_context(struct inode *inode) static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { - handle_t *handle; - int res, res2; + handle_t *handle = fs_data; + int res, res2, retries = 0; - /* fs_data is null when internally used. */ - if (fs_data) { - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + /* + * If a journal handle was specified, then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ + + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, @@ -1134,14 +1142,15 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, return res; } +retry: handle = ext4_journal_start(inode, EXT4_HT_MISC, ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) return PTR_ERR(handle); - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); /* Update inode->i_flags - e.g. S_DAX may get disabled */ @@ -1151,6 +1160,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (!res) res = res2; return res; -- cgit v1.2.3 From 4f5a763c9a0d4b179064d51dddde8421fdbb76c7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 22 Nov 2016 23:21:58 -0500 Subject: ext4: Add select for CONFIG_FS_IOMAP When ext4 is compiled with DAX support, it now needs the iomap code. Add appropriate select to Kconfig. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ext4') diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e38039fd96ff..7b90691e98c4 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -37,6 +37,7 @@ config EXT4_FS select CRC16 select CRYPTO select CRYPTO_CRC32C + select FS_IOMAP if FS_DAX help This is the next generation of the ext3 filesystem. -- cgit v1.2.3 From 9060dd2c5036b12132f9b97e3486ca6422d5bdfc Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 26 Nov 2016 14:24:51 -0500 Subject: ext4: fix mmp use after free during unmount In ext4_put_super, we call brelse on the buffer head containing the ext4 superblock, but then try to use it when we stop the mmp thread, because when the thread shuts down it does: write_mmp_block ext4_mmp_csum_set ext4_has_metadata_csum WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb)...) which reaches into sb->s_fs_info->s_es->s_feature_ro_compat, which lives in the superblock buffer s_sbh which we just released. Fix this by moving the brelse down to a point where we are no longer using it. Reported-by: Wang Shu Signed-off-by: Eric Sandeen Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bb3a8edc75db..a526956e49e7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -863,7 +863,6 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_free_rwsem(&sbi->s_journal_flag_rwsem); - brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -895,6 +894,7 @@ static void ext4_put_super(struct super_block *sb) } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); + brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the -- cgit v1.2.3 From f8011d93a2d125b812e8b90bedb5be2a00149ac4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 29 Nov 2016 11:13:13 -0500 Subject: ext4: add EXT4_JOURNAL_DATA_FL and EXT4_EXTENTS_FL to modifiable mask Add EXT4_JOURNAL_DATA_FL and EXT4_EXTENTS_FL to EXT4_FL_USER_MODIFIABLE to recognize that they are modifiable by userspace. So far we got away without having them there because ext4_ioctl_setflags() treats them in a special way. But it was really confusing like that. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/ioctl.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index aff204f040fc..6e8b8e3fa4b7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -397,7 +397,7 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index fc1cd37ba2d9..834a1c10b2c2 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -268,6 +268,9 @@ static int ext4_ioctl_setflags(struct inode *inode, for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; + /* These flags get special treatment later */ + if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) + continue; if (mask & flags) ext4_set_inode_flag(inode, i); else -- cgit v1.2.3 From d14e7683ecf06aa4365ee96adf74494114341dbe Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 29 Nov 2016 11:18:39 -0500 Subject: ext4: be more strict when verifying flags set via SETFLAGS ioctls Currently we just silently ignore flags that we don't understand (or that cannot be manipulated) through EXT4_IOC_SETFLAGS and EXT4_IOC_FSSETXATTR ioctls. This makes it problematic for the unused flags to be used in future (some app may be inadvertedly setting them and we won't notice until the flag gets used). Also this is inconsistent with other filesystems like XFS or BTRFS which return EOPNOTSUPP when they see a flag they cannot set. ext4 has the additional problem that there are flags which are returned by EXT4_IOC_GETFLAGS ioctl but which cannot be modified via EXT4_IOC_SETFLAGS. So we have to be careful to ignore value of these flags and not fail the ioctl when they are set (as e.g. chattr(1) passes flags returned from EXT4_IOC_GETFLAGS to EXT4_IOC_SETFLAGS without any masking and thus we'd break this utility). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/ioctl.c | 28 +++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6e8b8e3fa4b7..f9958f661789 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -399,6 +399,7 @@ struct flex_groups { #define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 834a1c10b2c2..2faeb3cdbbb3 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -415,6 +415,10 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) return xflags; } +#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) { @@ -459,12 +463,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; + if (flags & ~EXT4_FL_USER_VISIBLE) + return -EOPNOTSUPP; + /* + * chattr(1) grabs flags via GETFLAGS, modifies the result and + * passes that to SETFLAGS. So we cannot easily make SETFLAGS + * more restrictive than just silently masking off visible but + * not settable flags as we always did. + */ + flags &= EXT4_FL_USER_MODIFIABLE; + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); err = ext4_ioctl_setflags(inode, flags); inode_unlock(inode); @@ -871,13 +885,17 @@ resizefs_out: if (!inode_owner_or_capable(inode)) return -EACCES; + if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_xflags_to_iflags(fa.fsx_xflags); - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); -- cgit v1.2.3 From 6dcc693bc57f198bd85a7881eb59a915366fae1e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 1 Dec 2016 11:46:40 -0500 Subject: ext4: warn when page is dirtied without buffers Warn when a page is dirtied without buffers (as that will likely lead to a crash in ext4_writepages()) or when it gets newly dirtied without the page being locked (as there is nothing that prevents buffers to get stripped just before calling set_page_dirty() under memory pressure). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 861f848159e8..7d95b6174980 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3714,6 +3714,13 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } +static int ext4_set_page_dirty(struct page *page) +{ + WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); + WARN_ON_ONCE(!page_has_buffers(page)); + return __set_page_dirty_buffers(page); +} + static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, @@ -3721,6 +3728,7 @@ static const struct address_space_operations ext4_aops = { .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3753,6 +3761,7 @@ static const struct address_space_operations ext4_da_aops = { .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_da_invalidatepage, .releasepage = ext4_releasepage, -- cgit v1.2.3 From 05ac5aa18abd7db341e54df4ae2b4c98ea0e43b7 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 1 Dec 2016 11:49:12 -0500 Subject: ext4: fix inode checksum calculation problem if i_extra_size is small We've fixed the race condition problem in calculating ext4 checksum value in commit b47820edd163 ("ext4: avoid modifying checksum fields directly during checksum veficationon"). However, by this change, when calculating the checksum value of inode whose i_extra_size is less than 4, we couldn't calculate the checksum value in a proper way. This problem was found and reported by Nix, Thank you. Reported-by: Nix Signed-off-by: Daeho Jeong Signed-off-by: Youngjin Gil Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7d95b6174980..b48ca0392b9c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -72,10 +72,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, - EXT4_INODE_SIZE(inode->i_sb) - - offset); } + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; -- cgit v1.2.3 From 35997d1ce8b01f9f002dd080c59e22430ead6db1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Dec 2016 11:54:18 -0500 Subject: ext4: get rid of ext4_sb_has_crypto() ext4_sb_has_crypto() just called through to ext4_has_feature_encrypt(), and all callers except one were already using the latter. So remove it and switch its one caller to ext4_has_feature_encrypt(). Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ----- fs/ext4/ioctl.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f9958f661789..7446d390d051 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2272,11 +2272,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -static inline int ext4_sb_has_crypto(struct super_block *sb) -{ - return ext4_has_feature_encrypt(sb); -} - static inline bool ext4_encrypted_inode(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2faeb3cdbbb3..560afe08a5ca 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -806,7 +806,7 @@ resizefs_out: struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; - if (!ext4_sb_has_crypto(sb)) + if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); -- cgit v1.2.3 From ba679017ef0f47d31c50df731441fdd39bf4bf13 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Dec 2016 11:55:51 -0500 Subject: ext4: disable pwsalt ioctl when encryption disabled by config On a CONFIG_EXT4_FS_ENCRYPTION=n kernel, the ioctls to get and set encryption policies were disabled but EXT4_IOC_GET_ENCRYPTION_PWSALT was not. But there's no good reason to expose the pwsalt ioctl if the kernel doesn't support encryption. The pwsalt ioctl was also disabled pre-4.8 (via ext4_sb_has_crypto() previously returning 0 when encryption was disabled by config) and seems to have been enabled by mistake when ext4 encryption was refactored to use fs/crypto/. So let's disable it again. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 560afe08a5ca..fcc9510a819c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -191,6 +191,7 @@ journal_err_out: return err; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; @@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16]) return 0; return 1; } +#endif static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) @@ -802,6 +804,7 @@ resizefs_out: #endif } case EXT4_IOC_GET_ENCRYPTION_PWSALT: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; @@ -836,6 +839,9 @@ resizefs_out: sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; +#else + return -EOPNOTSUPP; +#endif } case EXT4_IOC_GET_ENCRYPTION_POLICY: { #ifdef CONFIG_EXT4_FS_ENCRYPTION -- cgit v1.2.3 From 2dc8d9e19b0d891b0d3675b5ac82be9be3875e36 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Dec 2016 14:43:33 -0500 Subject: ext4: forbid i_extra_isize not divisible by 4 i_extra_isize not divisible by 4 is problematic for several reasons: - It causes the in-inode xattr space to be misaligned, but the xattr header and entries are not declared __packed to express this possibility. This may cause poor performance or incorrect code generation on some platforms. - When validating the xattr entries we can read past the end of the inode if the size available for xattrs is not a multiple of 4. - It allows the nonsensical i_extra_isize=1, which doesn't even leave enough room for i_extra_isize itself. Therefore, update ext4_iget() to consider i_extra_isize not divisible by 4 to be an error, like the case where i_extra_isize is too large. This also matches the rule recently added to e2fsck for determining whether an inode has valid i_extra_isize. This patch shouldn't have any noticeable effect on non-corrupted/non-malicious filesystems, since the size of ext4_inode has always been a multiple of 4. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b48ca0392b9c..e3e197898c66 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4572,10 +4572,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", - EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, - EXT4_INODE_SIZE(inode->i_sb)); + EXT4_INODE_SIZE(inode->i_sb) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, + "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } @@ -4693,6 +4695,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ + BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { -- cgit v1.2.3 From 290ab230016f187c3551d8380ea742889276d03a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Dec 2016 14:51:58 -0500 Subject: ext4: don't read out of bounds when checking for in-inode xattrs With i_extra_isize equal to or close to the available space, it was possible for us to read past the end of the inode when trying to detect or validate in-inode xattrs. Fix this by checking for the needed extra space first. This patch shouldn't have any noticeable effect on non-corrupted/non-malicious filesystems. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/inode.c | 4 +++- fs/ext4/xattr.c | 5 ++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3e197898c66..59a518ad6bb2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4527,7 +4527,9 @@ static inline void ext4_iget_extra_inode(struct inode *inode, { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= + EXT4_INODE_SIZE(inode->i_sb) && + *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); ext4_find_inline_data_nolock(inode); } else diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1846e9168f80..59c9ec7eabae 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -231,13 +231,12 @@ static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { - struct ext4_xattr_entry *entry = IFIRST(header); int error = -EFSCORRUPTED; - if (((void *) header >= end) || + if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; - error = ext4_xattr_check_names(entry, end, entry); + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, -- cgit v1.2.3 From d7614cc16146e3f0b4c33e71875c19607602aed5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Dec 2016 14:57:29 -0500 Subject: ext4: correctly detect when an xattr value has an invalid size It was possible for an xattr value to have a very large size, which would then pass validation on 32-bit architectures due to a pointer wraparound. Fix this by validating the size in a way which avoids pointer wraparound. It was also possible that a value's size would fit in the available space but its padded size would not. This would cause an out-of-bounds memory write in ext4_xattr_set_entry when replacing the xattr value. For example, if an xattr value of unpadded size 253 bytes went until the very end of the inode or block, then using setxattr(2) to replace this xattr's value with 256 bytes would cause a write to the 3 bytes past the end of the inode or buffer, and the new xattr value would be incorrectly truncated. Fix this by requiring that the padded size fit in the available space rather than the unpadded size. This patch shouldn't have any noticeable effect on non-corrupted/non-malicious filesystems. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 59c9ec7eabae..5a94fa52b74f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -185,6 +185,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, { struct ext4_xattr_entry *e = entry; + /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) @@ -192,15 +193,29 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, e = next; } + /* Check the values */ while (!IS_LAST_ENTRY(entry)) { if (entry->e_value_block != 0) return -EFSCORRUPTED; - if (entry->e_value_size != 0 && - (value_start + le16_to_cpu(entry->e_value_offs) < - (void *)e + sizeof(__u32) || - value_start + le16_to_cpu(entry->e_value_offs) + - le32_to_cpu(entry->e_value_size) > end)) - return -EFSCORRUPTED; + if (entry->e_value_size != 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); + u32 size = le32_to_cpu(entry->e_value_size); + void *value; + + /* + * The value cannot overlap the names, and the value + * with padding cannot extend beyond 'end'. Check both + * the padded and unpadded sizes, since the size may + * overflow to 0 when adding padding. + */ + if (offs > end - value_start) + return -EFSCORRUPTED; + value = value_start + offs; + if (value < (void *)e + sizeof(u32) || + size > end - value || + EXT4_XATTR_SIZE(size) > end - value) + return -EFSCORRUPTED; + } entry = EXT4_XATTR_NEXT(entry); } -- cgit v1.2.3 From 3a4b77cd47bb837b8557595ec7425f281f2ca1fe Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 1 Dec 2016 15:08:37 -0500 Subject: ext4: validate s_first_meta_bg at mount time Ralf Spenneberg reported that he hit a kernel crash when mounting a modified ext4 image. And it turns out that kernel crashed when calculating fs overhead (ext4_calculate_overhead()), this is because the image has very large s_first_meta_bg (debug code shows it's 842150400), and ext4 overruns the memory in count_overhead() when setting bitmap buffer, which is PAGE_SIZE. ext4_calculate_overhead(): buf = get_zeroed_page(GFP_NOFS); <=== PAGE_SIZE buffer blks = count_overhead(sb, i, buf); count_overhead(): for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { <=== j = 842150400 ext4_set_bit(EXT4_B2C(sbi, s++), buf); <=== buffer overrun count++; } This can be reproduced easily for me by this script: #!/bin/bash rm -f fs.img mkdir -p /mnt/ext4 fallocate -l 16M fs.img mke2fs -t ext4 -O bigalloc,meta_bg,^resize_inode -F fs.img debugfs -w -R "ssv first_meta_bg 842150400" fs.img mount -o loop fs.img /mnt/ext4 Fix it by validating s_first_meta_bg first at mount time, and refusing to mount if its value exceeds the largest possible meta_bg number. Reported-by: Ralf Spenneberg Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/super.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a526956e49e7..32c0debbaa92 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3842,6 +3842,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) >= db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + goto failed_mount; + } + } sbi->s_group_desc = ext4_kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); -- cgit v1.2.3 From 4db0d88e2ebc4f47092adc01f9885a43ad748995 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 2 Dec 2016 12:12:53 -0500 Subject: ext4: fix reading new encrypted symlinks on no-journal file systems On a filesystem with no journal, a symlink longer than about 32 characters (exact length depending on padding for encryption) could not be followed or read immediately after being created in an encrypted directory. This happened because when the symlink data went through the delayed allocation path instead of the journaling path, the symlink was incorrectly detected as a "fast" symlink rather than a "slow" symlink until its data was written out. To fix this, disable delayed allocation for symlinks, since there is no benefit for delayed allocation anyway. Reported-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 59a518ad6bb2..a1eac0054203 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2902,7 +2902,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb)) { + if (ext4_nonda_switch(inode->i_sb) || + S_ISLNK(inode->i_mode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); -- cgit v1.2.3 From ab04df78181b271dca096a8050877469889e3c8c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 3 Dec 2016 16:20:53 -0500 Subject: ext4: fix checks for data=ordered and journal_async_commit options Combination of data=ordered mode and journal_async_commit mount option is invalid. However the check in parse_options() fails to detect the case where we simply end up defaulting to data=ordered mode and we detect the problem only on remount which triggers hard to understand failure to remount the filesystem. Fix the checking of mount options to take into account also the default mode by moving the check somewhat later in the mount sequence. Reported-by: Wolfgang Walter Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 32c0debbaa92..f185b9a5a024 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1901,12 +1901,6 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && - test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " - "in data=ordered mode"); - return 0; - } return 1; } @@ -4004,6 +3998,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + goto failed_mount_wq; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; @@ -4897,6 +4899,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + err = -EINVAL; + goto restore_opts; + } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { -- cgit v1.2.3 From 011c88e36c26a08590b2ebe214ac854289f85f26 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 3 Dec 2016 16:46:58 -0500 Subject: ext4: remove another test in ext4_alloc_file_blocks() Before commit c3fe493ccdb1 ('ext4: remove unneeded test in ext4_alloc_file_blocks()') then it was possible for "depth" to be -1 but now, it's not possible that it is negative. Signed-off-by: Dan Carpenter Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 65dbd2bcf775..3e1014fe835e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4700,7 +4700,7 @@ retry: /* * Recalculate credits when extent tree depth changes. */ - if (depth >= 0 && depth != ext_depth(inode)) { + if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } -- cgit v1.2.3 From 7e6e1ef48fc02f3ac5d0edecbb0c6087cd758d58 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 10 Dec 2016 09:55:01 -0500 Subject: ext4: reject inodes with negative size Don't load an inode with a negative size; this causes integer overflow problems in the VFS. [ Added EXT4_ERROR_INODE() to mark file system as corrupted. -TYT] Fixes: a48380f769df (ext4: rename i_dir_acl to i_size_high) Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a1eac0054203..f9f892212308 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4553,6 +4553,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; + loff_t size; int block; uid_t i_uid; gid_t i_gid; @@ -4655,6 +4656,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); + if ((size = i_size_read(inode)) < 0) { + EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; -- cgit v1.2.3 From 578620f451f836389424833f1454eeeb2ffc9e9f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 10 Dec 2016 09:56:01 -0500 Subject: ext4: return -ENOMEM instead of success We should set the error code if kzalloc() fails. Fixes: 67cf5b09a46f ("ext4: add the basic function for inline data support") Signed-off-by: Dan Carpenter Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/inline.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9b67f75bdcf7..437df6a1a841 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -341,8 +341,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); - if (!value) + if (!value) { + error = -ENOMEM; goto out; + } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); -- cgit v1.2.3 From 73b92a2a5e97d17cc4d5c4fe9d724d3273fb6fd2 Mon Sep 17 00:00:00 2001 From: Sergey Karamov Date: Sat, 10 Dec 2016 17:54:58 -0500 Subject: ext4: do not perform data journaling when data is encrypted Currently data journalling is incompatible with encryption: enabling both at the same time has never been supported by design, and would result in unpredictable behavior. However, users are not precluded from turning on both features simultaneously. This change programmatically replaces data journaling for encrypted regular files with ordered data journaling mode. Background: Journaling encrypted data has not been supported because it operates on buffer heads of the page in the page cache. Namely, when the commit happens, which could be up to five seconds after caching, the commit thread uses the buffer heads attached to the page to copy the contents of the page to the journal. With encryption, it would have been required to keep the bounce buffer with ciphertext for up to the aforementioned five seconds, since the page cache can only hold plaintext and could not be used for journaling. Alternatively, it would be required to setup the journal to initiate a callback at the commit time to perform deferred encryption - in this case, not only would the data have to be written twice, but it would also have to be encrypted twice. This level of complexity was not justified for a mode that in practice is very rarely used because of the overhead from the data journalling. Solution: If data=journaled has been set as a mount option for a filesystem, or if journaling is enabled on a regular file, do not perform journaling if the file is also encrypted, instead fall back to the data=ordered mode for the file. Rationale: The intent is to allow seamless and proper filesystem operation when journaling and encryption have both been enabled, and have these two conflicting features gracefully resolved by the filesystem. Fixes: 4461471107b7 Signed-off-by: Sergey Karamov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ext4_jbd2.h | 14 ++++++++------ fs/ext4/super.c | 5 +++++ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b1d52c14098e..f97611171023 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC)) + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - else - BUG(); + BUG(); } static inline int ext4_should_journal_data(struct inode *inode) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f185b9a5a024..79af71d4fccd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3530,6 +3530,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dax"); goto failed_mount; } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { -- cgit v1.2.3 From db717d8e26c2d1b0dba3e08668a1e6a7f665adde Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 26 Nov 2016 19:07:49 -0500 Subject: fscrypto: move ioctl processing more fully into common code Multiple bugs were recently fixed in the "set encryption policy" ioctl. To make it clear that fscrypt_process_policy() and fscrypt_get_policy() implement ioctls and therefore their implementations must take standard security and correctness precautions, rename them to fscrypt_ioctl_set_policy() and fscrypt_ioctl_get_policy(). Make the latter take in a struct file * to make it consistent with the former. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/ioctl.c | 34 +++++----------------------------- 2 files changed, 7 insertions(+), 31 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 282a51b07c57..bd8bc3be93a5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2338,8 +2338,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae8ebbc97..70083863ce51 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -765,22 +765,12 @@ resizefs_out: } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); - case EXT4_IOC_SET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; + case EXT4_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - if (copy_from_user(&policy, - (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - return fscrypt_process_policy(filp, &policy); -#else - return -EOPNOTSUPP; -#endif - } case EXT4_IOC_GET_ENCRYPTION_PWSALT: { int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -817,23 +807,9 @@ resizefs_out: return -EFAULT; return 0; } - case EXT4_IOC_GET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; - int err = 0; - - if (!ext4_encrypted_inode(inode)) - return -ENOENT; - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; -#else - return -EOPNOTSUPP; -#endif - } + case EXT4_IOC_GET_ENCRYPTION_POLICY: + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; -- cgit v1.2.3 From bd7b8290388dd58a8c0a3710b171e58ef952ca4d Mon Sep 17 00:00:00 2001 From: David Gstir Date: Tue, 6 Dec 2016 23:53:56 +0100 Subject: fscrypt: Cleanup page locking requirements for fscrypt_{decrypt,encrypt}_page() Rename the FS_CFLG_INPLACE_ENCRYPTION flag to FS_CFLG_OWN_PAGES which, when set, indicates that the fs uses pages under its own control as opposed to writeback pages which require locking and a bounce buffer for encryption. Signed-off-by: David Gstir Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1485ac273bfb..fb2b514f675b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3744,7 +3744,6 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - BUG_ON(!PageLocked(page)); WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, page, PAGE_SIZE, 0, page->index)); } -- cgit v1.2.3