From 684666e51585f3b136a3f505f8173d7580bc52cd Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 23 Jan 2017 18:50:23 +0100 Subject: jfs: atomically read inode size See i_size_read() comments in include/linux/fs.h Signed-off-by: Fabian Frederick Signed-off-by: Dave Kleikamp --- fs/jfs/resize.c | 4 ++-- fs/jfs/super.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index bd9b641ada2c..7ddcb445a3d9 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -98,7 +98,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) goto out; } - VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; if (VolumeSize) { if (newLVSize > VolumeSize) { @@ -211,7 +211,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) txQuiesce(sb); /* Reset size of direct inode */ - sbi->direct_inode->i_size = sb->s_bdev->bd_inode->i_size; + sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode); if (sbi->mntflag & JFS_INLINELOG) { /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2be7c9ce6663..51881eb8ccff 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -283,7 +283,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_resize_nosize: { - *newLVSize = sb->s_bdev->bd_inode->i_size >> + *newLVSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; if (*newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); @@ -549,7 +549,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) goto out_unload; } inode->i_ino = 0; - inode->i_size = sb->s_bdev->bd_inode->i_size; + inode->i_size = i_size_read(sb->s_bdev->bd_inode); inode->i_mapping->a_ops = &jfs_metapage_aops; hlist_add_fake(&inode->i_hash); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); -- cgit v1.2.3 From 68227c03cba84a24faf8a7277d2b1a03c8959c2c Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 7 Jun 2017 12:26:49 +0200 Subject: fuse: initialize the flock flag in fuse_file on allocation Before the patch, the flock flag could remain uninitialized for the lifespan of the fuse_file allocation. Unless set to true in fuse_file_flock(), it would remain in an indeterminate state until read in an if statement in fuse_release_common(). This could consequently lead to taking an unexpected branch in the code. The bug was discovered by a runtime instrumentation designed to detect use of uninitialized memory in the kernel. Signed-off-by: Mateusz Jurczyk Fixes: 37fb3a30b462 ("fuse: fix flock") Cc: # v3.1+ Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ee4fdc3da9e..76eac2a554c4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -46,7 +46,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); if (unlikely(!ff)) return NULL; -- cgit v1.2.3 From 9bcf66c72d726322441ec82962994e69157613e4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2017 15:31:10 +0200 Subject: jfs: Don't clear SGID when inheriting ACLs When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of __jfs_set_acl() into jfs_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: stable@vger.kernel.org CC: jfs-discussion@lists.sourceforge.net Signed-off-by: Jan Kara Signed-off-by: Dave Kleikamp --- fs/jfs/acl.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 7bc186f4ed4d..1be45c8d460d 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -77,13 +77,6 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (rc) - return rc; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); - } break; case ACL_TYPE_DEFAULT: ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; @@ -118,9 +111,17 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); + if (type == ACL_TYPE_ACCESS && acl) { + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) + goto end_tx; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } rc = __jfs_set_acl(tid, inode, type, acl); if (!rc) rc = txCommit(tid, 1, &inode, 0); +end_tx: txEnd(tid); mutex_unlock(&JFS_IP(inode)->commit_mutex); return rc; -- cgit v1.2.3 From f070e5ac9bc7de71c34402048ce5526dccbd347c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Wed, 12 Jul 2017 06:55:35 -0300 Subject: jfs: preserve i_mode if __jfs_set_acl() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When changing a file's acl mask, __jfs_set_acl() will first set the group bits of i_mode to the value of the mask, and only then set the actual extended attribute representing the new acl. If the second part fails (due to lack of space, for example) and the file had no acl attribute to begin with, the system will from now on assume that the mask permission bits are actual group permission bits, potentially granting access to the wrong users. Prevent this by only changing the inode mode after the acl has been set. Signed-off-by: Ernesto A. Fernández Signed-off-by: Dave Kleikamp --- fs/jfs/acl.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 1be45c8d460d..2e71b6e7e646 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -108,19 +108,26 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int rc; tid_t tid; + int update_mode = 0; + umode_t mode = inode->i_mode; tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); if (type == ACL_TYPE_ACCESS && acl) { - rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + rc = posix_acl_update_mode(inode, &mode, &acl); if (rc) goto end_tx; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); + update_mode = 1; } rc = __jfs_set_acl(tid, inode, type, acl); - if (!rc) + if (!rc) { + if (update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } rc = txCommit(tid, 1, &inode, 0); + } end_tx: txEnd(tid); mutex_unlock(&JFS_IP(inode)->commit_mutex); -- cgit v1.2.3 From e33bf72361bdd764c827e160249a3e06d2a8e2fe Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Jul 2017 20:34:02 +0100 Subject: Btrfs: fix dir item validation when replaying xattr deletes We were passing an incorrect slot number to the function that validates directory items when we are replaying xattr deletes from a log tree. The correct slot is stored at variable 'i' and not at 'path->slots[0]', so the call to the validation function was only correct for the first iteration of the loop, when 'i == path->slots[0]'. After this fix, the fstest generic/066 passes again. Fixes: 8ee8c2d62d5f ("btrfs: Verify dir_item in replay_xattr_deletes") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f20ef211a73d..3a11ae63676e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2153,8 +2153,7 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; - ret = verify_dir_item(fs_info, path->nodes[0], - path->slots[0], di); + ret = verify_dir_item(fs_info, path->nodes[0], i, di); if (ret) { ret = -EIO; goto out; -- cgit v1.2.3 From 1e86eabe73b73c82e1110c746ed3ec6d5e1c0a0d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2017 14:30:45 -0700 Subject: xfs: check _btree_check_block value Check the _btree_check_block return value for the firstrec and lastrec functions, since we have the ability to signal that the repositioning did not succeed. Fixes-coverity-id: 114067 Fixes-coverity-id: 114068 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_btree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 4da85fff69ad..e0bcc4a59efd 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -728,7 +728,8 @@ xfs_btree_firstrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ @@ -757,7 +758,8 @@ xfs_btree_lastrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ -- cgit v1.2.3 From 4c1a67bd3606540b9b42caff34a1d5cd94b1cf65 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2017 14:30:51 -0700 Subject: xfs: set firstfsb to NULLFSBLOCK before feeding it to _bmapi_write We must initialize the firstfsb parameter to _bmapi_write so that it doesn't incorrectly treat stack garbage as a restriction on which AGs it can search for free space. Fixes-coverity-id: 1402025 Fixes-coverity-id: 1415167 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_bmap.c | 9 +++++++++ fs/xfs/xfs_reflink.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0a9880777c9c..ee118ceb702f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6499,6 +6499,15 @@ xfs_bmap_finish_one( xfs_fsblock_t firstfsb; int error = 0; + /* + * firstfsb is tied to the transaction lifetime and is used to + * ensure correct AG locking order and schedule work item + * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us + * to only making one bmap call per transaction, so it should + * be safe to have it as a local variable here. + */ + firstfsb = NULLFSBLOCK; + trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ab2270a87196..d9b3d57a1921 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -329,7 +329,7 @@ xfs_reflink_convert_cow_extent( xfs_filblks_t count_fsb, struct xfs_defer_ops *dfops) { - xfs_fsblock_t first_block; + xfs_fsblock_t first_block = NULLFSBLOCK; int nimaps = 1; if (imap->br_state == XFS_EXT_NORM) -- cgit v1.2.3 From 10479e2dea83d4c421ad05dfc55d918aa8dfc0cd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2017 14:30:57 -0700 Subject: xfs: check _alloc_read_agf buffer pointer before using In some circumstances, _alloc_read_agf can return an error code of zero but also a null AGF buffer pointer. Check for this and jump out. Fixes-coverity-id: 1415250 Fixes-coverity-id: 1415320 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_refcount.c | 4 ++++ fs/xfs/xfs_reflink.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 900ea231f9a3..45b1c3b4e047 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1638,6 +1638,10 @@ xfs_refcount_recover_cow_leftovers( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) goto out_trans; + if (!agbp) { + error = -ENOMEM; + goto out_trans; + } cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d9b3d57a1921..f45fbf0db9bb 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -170,6 +170,8 @@ xfs_reflink_find_shared( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) return error; + if (!agbp) + return -ENOMEM; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); -- cgit v1.2.3 From 144439376bfcd1d178e37bc08e27a58f82719bdb Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 19 Jul 2017 23:25:51 -0400 Subject: btrfs: fix lockup in find_free_extent with read-only block groups If we have a block group that is all of the following: 1) uncached in memory 2) is read-only 3) has a disk cache state that indicates we need to recreate the cache AND the file system has enough free space fragmentation such that the request for an extent of a given size can't be honored; AND have a single CPU core; AND it's the block group with the highest starting offset such that there are no opportunities (like reading from disk) for the loop to yield the CPU; We can end up with a lockup. The root cause is simple. Once we're in the position that we've read in all of the other block groups directly and none of those block groups can honor the request, there are no more opportunities to sleep. We end up trying to start a caching thread which never gets run if we only have one core. This *should* present as a hung task waiting on the caching thread to make some progress, but it doesn't. Instead, it degrades into a busy loop because of the placement of the read-only check. During the first pass through the loop, block_group->cached will be set to BTRFS_CACHE_STARTED and have_caching_bg will be set. Then we hit the read-only check and short circuit the loop. We're not yet in LOOP_CACHING_WAIT, so we skip that loop back before going through the loop again for other raid groups. Then we move to LOOP_CACHING_WAIT state. During the this pass through the loop, ->cached will still be BTRFS_CACHE_STARTED, which means it's not cached, so we'll enter cache_block_group, do a lot of nothing, and return, and also set have_caching_bg again. Then we hit the read-only check and short circuit the loop. The same thing happens as before except now we DO trigger the LOOP_CACHING_WAIT && have_caching_bg check and loop back up to the top. We do this forever. There are two fixes in this patch since they address the same underlying bug. The first is to add a cond_resched to the end of the loop to ensure that the caching thread always has an opportunity to run. This will fix the soft lockup issue, but find_free_extent will still loop doing nothing until the thread has completed. The second is to move the read-only check to the top of the loop. We're never going to return an allocation within a read-only block group so we may as well skip it early. The check for ->cached == BTRFS_CACHE_ERROR would cause the same problem except that BTRFS_CACHE_ERROR is considered a "done" state and we won't re-set have_caching_bg again. Many thanks to Stephan Kulow for his excellent help in the testing process. Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 375f8c728d91..a6635f07b8f1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7589,6 +7589,10 @@ search: u64 offset; int cached; + /* If the block group is read-only, we can skip it entirely. */ + if (unlikely(block_group->ro)) + continue; + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; @@ -7624,8 +7628,6 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; - if (unlikely(block_group->ro)) - goto loop; /* * Ok we want to try and use the cluster allocator, so @@ -7839,6 +7841,7 @@ loop: failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); btrfs_release_block_group(block_group, delalloc); + cond_resched(); } up_read(&space_info->groups_sem); -- cgit v1.2.3 From 17024ad0a0fdfcfe53043afb969b813d3e020c21 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 20 Jul 2017 15:10:35 -0700 Subject: Btrfs: fix early ENOSPC due to delalloc If a lot of metadata is reserved for outstanding delayed allocations, we rely on shrink_delalloc() to reclaim metadata space in order to fulfill reservation tickets. However, shrink_delalloc() has a shortcut where if it determines that space can be overcommitted, it will stop early. This made sense before the ticketed enospc system, but now it means that shrink_delalloc() will often not reclaim enough space to fulfill any tickets, leading to an early ENOSPC. (Reservation tickets don't care about being able to overcommit, they need every byte accounted for.) Fix it by getting rid of the shortcut so that shrink_delalloc() reclaims all of the metadata it is supposed to. This fixes early ENOSPCs we were seeing when doing a btrfs receive to populate a new filesystem, as well as early ENOSPCs Christoph saw when doing a big cp -r onto Btrfs. Fixes: 957780eb2788 ("Btrfs: introduce ticketed enospc infrastructure") Tested-by: Christoph Anton Mitterer Cc: stable@vger.kernel.org Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a6635f07b8f1..e3b0b4196d3d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4825,10 +4825,6 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(fs_info, space_info, orig, flush, false)) { - spin_unlock(&space_info->lock); - break; - } if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { spin_unlock(&space_info->lock); -- cgit v1.2.3 From 0e4324a4c36b3eb5cd1f71cbbc38d888f919ebfc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 21 Jul 2017 11:28:24 +0300 Subject: btrfs: round down size diff when shrinking/growing device Further testing showed that the fix introduced in 7dfb8be11b5d ("btrfs: Round down values which are written for total_bytes_size") was insufficient and it could still lead to discrepancies between the total_bytes in the super block and the device total bytes. So this patch also ensures that the difference between old/new sizes when shrinking/growing is also rounded down. This ensure that we won't be subtracting/adding a non-sectorsize multiples to the superblock/device total sizees. Fixes: 7dfb8be11b5d ("btrfs: Round down values which are written for total_bytes_size") Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c95f018d4a1e..b3c30719bf5d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2702,7 +2702,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); - diff = new_size - device->total_bytes; + diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); if (new_size <= device->total_bytes || device->is_tgtdev_for_dev_replace) { @@ -4406,7 +4406,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 diff; new_size = round_down(new_size, fs_info->sectorsize); - diff = old_size - new_size; + diff = round_down(old_size - new_size, fs_info->sectorsize); if (device->is_tgtdev_for_dev_replace) return -EINVAL; -- cgit v1.2.3 From cfaf2d034360166e569a4929dd83ae9698bed856 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Jul 2017 08:33:25 -0700 Subject: xfs: fix quotacheck dquot id overflow infinite loop If a dquot has an id of U32_MAX, the next lookup index increment overflows the uint32_t back to 0. This starts the lookup sequence over from the beginning, repeats indefinitely and results in a livelock. Update xfs_qm_dquot_walk() to explicitly check for the lookup overflow and exit the loop. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_qm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6ce948c436d5..15751dc2a27d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -111,6 +111,9 @@ restart: skipped = 0; break; } + /* we're done if id overflows back to zero */ + if (!next_index) + break; } if (skipped) { -- cgit v1.2.3 From 6215894e11de224183c89b001f5363912442b489 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 21 Jul 2017 11:04:23 -0700 Subject: xfs: check that dir block entries don't off the end of the buffer When we're checking the entries in a directory buffer, make sure that the entry length doesn't push us off the end of the buffer. Found via xfs/388 writing ones to the length fields. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_dir2_data.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index d478065b9544..8727a43115ef 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -136,6 +136,8 @@ __xfs_dir3_data_check( */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= + p + be16_to_cpu(dup->length)); XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); @@ -164,6 +166,8 @@ __xfs_dir3_data_check( XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= + p + ops->data_entsize(dep->namelen)); XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); -- cgit v1.2.3 From 5b094d6dac0451ad89b1dc088395c7b399b7e9e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jul 2017 11:16:51 -0700 Subject: xfs: fix multi-AG deadlock in xfs_bunmapi Just like in the allocator we must avoid touching multiple AGs out of order when freeing blocks, as freeing still locks the AGF and can cause the same AB-BA deadlocks as in the allocation path. Signed-off-by: Christoph Hellwig Reported-by: Nikolay Borisov Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ee118ceb702f..c09c16b1ad3b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5435,6 +5435,7 @@ __xfs_bunmapi( xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; + xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5534,6 +5535,17 @@ __xfs_bunmapi( */ del = got; wasdel = isnullstartblock(del.br_startblock); + + /* + * Make sure we don't touch multiple AGF headers out of order + * in a single transaction, as that could cause AB-BA deadlocks. + */ + if (!wasdel) { + agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); + if (prev_agno != NULLAGNUMBER && prev_agno > agno) + break; + prev_agno = agno; + } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; -- cgit v1.2.3 From 1e6f209515a08de29ec53b653eac73b50efd949c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 25 Jul 2017 16:10:47 -0400 Subject: NFS: Use raw NFS access mask in nfs4_opendata_access() Commit bd8b2441742b ("NFS: Store the raw NFS access mask in the inode's access cache") changed how the access results are stored after an access() call. An NFS v4 OPEN might have access bits returned with the opendata, so we should use the NFS4_ACCESS values when determining the return value in nfs4_opendata_access(). Fixes: bd8b2441742b ("NFS: Store the raw NFS access mask in the inode's access cache") Reported-by: Eryu Guan Signed-off-by: Anna Schumaker Tested-by: Takashi Iwai --- fs/nfs/nfs4proc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e1a26c653e78..583c2b38c908 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2236,7 +2236,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, int openflags) { struct nfs_access_entry cache; - u32 mask; + u32 mask, flags; /* access call failed or for some reason the server doesn't * support any access modes -- defer access call until later */ @@ -2250,16 +2250,20 @@ static int nfs4_opendata_access(struct rpc_cred *cred, */ if (openflags & __FMODE_EXEC) { /* ONLY check for exec rights */ - mask = MAY_EXEC; + if (S_ISDIR(state->inode->i_mode)) + mask = NFS4_ACCESS_LOOKUP; + else + mask = NFS4_ACCESS_EXECUTE; } else if ((fmode & FMODE_READ) && !opendata->file_created) - mask = MAY_READ; + mask = NFS4_ACCESS_READ; cache.cred = cred; cache.jiffies = jiffies; nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache); - if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) + flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP; + if ((mask & ~cache.mask & flags) == 0) return 0; return -EACCES; -- cgit v1.2.3 From 442ce0499c0535f8972b68fa1fda357357a5c953 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 24 Jul 2017 13:18:50 +1000 Subject: NFS: invalidate file size when taking a lock. Prior to commit ca0daa277aca ("NFS: Cache aggressively when file is open for writing"), NFS would revalidate, or invalidate, the file size when taking a lock. Since that commit it only invalidates the file content. If the file size is changed on the server while wait for the lock, the client will have an incorrect understanding of the file size and could corrupt data. This particularly happens when writing beyond the (supposed) end of file and can be easily be demonstrated with posix_fallocate(). If an application opens an empty file, waits for a write lock, and then calls posix_fallocate(), glibc will determine that the underlying filesystem doesn't support fallocate (assuming version 4.1 or earlier) and will write out a '0' byte at the end of each 4K page in the region being fallocated that is after the end of the file. NFS will (usually) detect that these writes are beyond EOF and will expand them to cover the whole page, and then will merge the pages. Consequently, NFS will write out large blocks of zeroes beyond where it thought EOF was. If EOF had moved, the pre-existing part of the file will be over-written. Locking should have protected against this, but it doesn't. This patch restores the use of nfs_zap_caches() which invalidated the cached attributes. When posix_fallocate() asks for the file size, the request will go to the server and get a correct answer. cc: stable@vger.kernel.org (v4.8+) Fixes: ca0daa277aca ("NFS: Cache aggressively when file is open for writing") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5713eb32a45e..d264363559db 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -750,7 +750,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) */ nfs_sync_mapping(filp->f_mapping); if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - nfs_zap_mapping(inode, filp->f_mapping); + nfs_zap_caches(inode); out: return status; } -- cgit v1.2.3 From 6ba80d4348bd8cce2a0a6bbc21e5e1e760de42a9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 24 Jul 2017 13:18:50 +1000 Subject: NFS: Optimize fallocate by refreshing mapping when needed. posix_fallocate() will allocate space in an NFS file by considering the last byte of every 4K block. If it is before EOF, it will read the byte and if it is zero, a zero is written out. If it is after EOF, the zero is unconditionally written. For the blocks beyond EOF, if NFS believes its cache is valid, it will expand these writes to write full pages, and then will merge the pages. This results if (typically) 1MB writes. If NFS believes its cache is not valid (particularly if NFS_INO_INVALID_DATA or NFS_INO_REVAL_PAGECACHE are set - see nfs_write_pageuptodate()), it will send the individual 1-byte writes. This results in (typically) 256 times as many RPC requests, and can be substantially slower. Currently nfs_revalidate_mapping() is only used when reading a file or mmapping a file, as these are times when the content needs to be up-to-date. Writes don't generally need the cache to be up-to-date, but writes beyond EOF can benefit, particularly in the posix_fallocate() case. So this patch calls nfs_revalidate_mapping() when writing beyond EOF - i.e. when there is a gap between the end of the file and the start of the write. If the cache is thought to be out of date (as happens after taking a file lock), this will cause a GETATTR, and the two flags mentioned above will be cleared. With this, posix_fallocate() on a newly locked file does not generate excessive tiny writes. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d264363559db..af330c31f627 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -617,6 +617,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result) goto out; } + if (iocb->ki_pos > i_size_read(inode)) + nfs_revalidate_mapping(inode, file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); -- cgit v1.2.3 From b7dbcc0e433f0f61acb89ed9861ec996be4f2b38 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 28 Jul 2017 12:33:54 -0400 Subject: NFSv4.1: Fix a race where CB_NOTIFY_LOCK fails to wake a waiter nfs4_retry_setlk() sets the task's state to TASK_INTERRUPTIBLE within the same region protected by the wait_queue's lock after checking for a notification from CB_NOTIFY_LOCK callback. However, after releasing that lock, a wakeup for that task may race in before the call to freezable_schedule_timeout_interruptible() and set TASK_WAKING, then freezable_schedule_timeout_interruptible() will set the state back to TASK_INTERRUPTIBLE before the task will sleep. The result is that the task will sleep for the entire duration of the timeout. Since we've already set TASK_INTERRUPTIBLE in the locked section, just use freezable_schedule_timout() instead. Fixes: a1d617d8f134 ("nfs: allow blocking locks to be awoken by lock callbacks") Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 583c2b38c908..93c1d7352238 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6495,7 +6495,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irqrestore(&q->lock, flags); - freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT); } finish_wait(q, &wait); -- cgit v1.2.3 From 1e21196c8e3e210cbef130e23e40c4adc3d9ce10 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Sun, 30 Jul 2017 22:26:40 -0400 Subject: ext4: correct comment references to ext4_ext_direct_IO() Commit 914f82a32d0268847 "ext4: refactor direct IO code" deleted ext4_ext_direct_IO(), but references to that function remain in comments. Update them to refer to ext4_direct_IO_write(). Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Reviewed-by: Jan Kara --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3c600f02673f..2e6c02258ee2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -892,7 +892,7 @@ static int ext4_dio_get_block_unwritten_async(struct inode *inode, /* * Get block function for non-AIO DIO writes when we create unwritten extent if * blocks are not allocated yet. The extent will be converted to written - * after IO is complete from ext4_ext_direct_IO() function. + * after IO is complete by ext4_direct_IO_write(). */ static int ext4_dio_get_block_unwritten_sync(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -907,7 +907,7 @@ static int ext4_dio_get_block_unwritten_sync(struct inode *inode, /* * Mark inode as having pending DIO writes to unwritten extents. - * ext4_ext_direct_IO() checks this flag and converts extents to + * ext4_direct_IO_write() checks this flag and converts extents to * written. */ if (!ret && buffer_unwritten(bh_result)) -- cgit v1.2.3 From a627b0a7c15ee4d2c87a86d5be5c8167382e8d0d Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Sun, 30 Jul 2017 22:30:11 -0400 Subject: ext4: remove unused metadata accounting variables Two variables in ext4_inode_info, i_reserved_meta_blocks and i_allocated_meta_blocks, are unused. Removing them saves a little memory per in-memory inode and cleans up clutter in several tracepoints. Adjust tracepoint output from ext4_alloc_da_blocks() for consistency and fix a typo and whitespace near these changes. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 6 ++---- fs/ext4/super.c | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9ebde0cd632e..dcbcd9d444d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -961,7 +961,7 @@ struct ext4_inode_info { /* * i_block_group is the number of the block group which contains * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to + * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ @@ -1049,10 +1049,8 @@ struct ext4_inode_info { ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ - /* In case of bigalloc, these refer to clusters rather than blocks */ + /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0886fe82e9c4..d61a70e2193a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -978,8 +978,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_reserved_meta_blocks = 0; - ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); -- cgit v1.2.3 From 397e434176bb62bc6068d2210af1d876c6212a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Sun, 30 Jul 2017 22:43:41 -0400 Subject: ext4: preserve i_mode if __ext4_set_acl() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When changing a file's acl mask, __ext4_set_acl() will first set the group bits of i_mode to the value of the mask, and only then set the actual extended attribute representing the new acl. If the second part fails (due to lack of space, for example) and the file had no acl attribute to begin with, the system will from now on assume that the mask permission bits are actual group permission bits, potentially granting access to the wrong users. Prevent this by only changing the inode mode after the acl has been set. Signed-off-by: Ernesto A. Fernández Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/acl.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 09441ae07a5b..2985cd0a640d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -189,16 +189,17 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + int update_mode = 0; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + update_mode = 1; } break; @@ -221,8 +222,14 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, value, size, xattr_flags); kfree(value); - if (!error) + if (!error) { set_cached_acl(inode, type, acl); + if (update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + ext4_mark_inode_dirty(handle, inode); + } + } return error; } -- cgit v1.2.3 From a3bb2d5587521eea6dab2d05326abb0afb460abd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 30 Jul 2017 23:33:01 -0400 Subject: ext4: Don't clear SGID when inheriting ACLs When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of __ext4_set_acl() into ext4_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: stable@vger.kernel.org Signed-off-by: Theodore Ts'o Signed-off-by: Jan Kara Reviewed-by: Andreas Gruenbacher --- fs/ext4/acl.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 2985cd0a640d..46ff2229ff5e 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -189,18 +189,10 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; - int update_mode = 0; - umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &mode, &acl); - if (error) - return error; - update_mode = 1; - } break; case ACL_TYPE_DEFAULT: @@ -224,11 +216,6 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, kfree(value); if (!error) { set_cached_acl(inode, type, acl); - if (update_mode) { - inode->i_mode = mode; - inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); - } } return error; @@ -240,6 +227,8 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; + umode_t mode = inode->i_mode; + int update_mode = 0; error = dquot_initialize(inode); if (error) @@ -254,7 +243,20 @@ retry: if (IS_ERR(handle)) return PTR_ERR(handle); + if ((type == ACL_TYPE_ACCESS) && acl) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + goto out_stop; + update_mode = 1; + } + error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); + if (!error && update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + ext4_mark_inode_dirty(handle, inode); + } +out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; -- cgit v1.2.3 From 191eac33009e6a6d31e87cfa425a20d0e79704b4 Mon Sep 17 00:00:00 2001 From: Emoly Liu Date: Mon, 31 Jul 2017 00:40:22 -0400 Subject: ext4: error should be cleared if ea_inode isn't added to the cache For Lustre, if ea_inode fails in hash validation but passes parent inode and generation checks, it won't be added to the cache as well as the error "-EFSCORRUPTED" should be cleared, otherwise it will cause "Structure needs cleaning" when running getfattr command. Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-9723 Cc: stable@vger.kernel.org Fixes: dec214d00e0d78a08b947d7dccdfdb84407a9f4d Signed-off-by: Emoly Liu Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Reviewed-by: tahsin@google.com --- fs/ext4/xattr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index cff4f41ced61..de217a094733 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -451,6 +451,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, } /* Do not add ea_inode to the cache. */ ea_inode_cache = NULL; + err = 0; } else if (err) goto out; -- cgit v1.2.3 From 9c5d58fb9e1df8c9ead4daf9e770ce61e4256e02 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 31 Jul 2017 00:55:34 -0400 Subject: ext4: convert swap_inode_data() over to use swap() on most of the fields For some odd reason, it forces a byte-by-byte copy of each field. A plain old swap() on most of these fields would be more efficient. We do need to retain the memswap of i_data however as that field is an array. Signed-off-by: Theodore Ts'o Signed-off-by: Jeff Layton Reviewed-by: Jan Kara --- fs/ext4/ioctl.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 42b3a73143cf..4f4a8391585c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -64,18 +64,16 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); - memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); - memswap(&inode1->i_version, &inode2->i_version, - sizeof(inode1->i_version)); - memswap(&inode1->i_blocks, &inode2->i_blocks, - sizeof(inode1->i_blocks)); - memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); - memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); - memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + swap(inode1->i_flags, inode2->i_flags); + swap(inode1->i_version, inode2->i_version); + swap(inode1->i_blocks, inode2->i_blocks); + swap(inode1->i_bytes, inode2->i_bytes); + swap(inode1->i_atime, inode2->i_atime); + swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); - memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + swap(ei1->i_flags, ei2->i_flags); + swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); -- cgit v1.2.3 From fd40559c8657418385e42f797e0b04bfc0add748 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 16:02:47 -0400 Subject: NFSv4: Fix EXCHANGE_ID corrupt verifier issue The verifier is allocated on the stack, but the EXCHANGE_ID RPC call was changed to be asynchronous by commit 8d89bd70bc939. If we interrrupt the call to rpc_wait_for_completion_task(), we can therefore end up transmitting random stack contents in lieu of the verifier. Fixes: 8d89bd70bc939 ("NFS setup async exchange_id") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 11 ++++------- fs/nfs/nfs4xdr.c | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 93c1d7352238..c458a1e28b91 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7460,7 +7460,7 @@ static void nfs4_exchange_id_done(struct rpc_task *task, void *data) cdata->res.server_scope = NULL; } /* Save the EXCHANGE_ID verifier session trunk tests */ - memcpy(clp->cl_confirm.data, cdata->args.verifier->data, + memcpy(clp->cl_confirm.data, cdata->args.verifier.data, sizeof(clp->cl_confirm.data)); } out: @@ -7497,7 +7497,6 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = { static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, u32 sp4_how, struct rpc_xprt *xprt) { - nfs4_verifier verifier; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], .rpc_cred = cred, @@ -7521,8 +7520,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, return -ENOMEM; } - if (!xprt) - nfs4_init_boot_verifier(clp, &verifier); + nfs4_init_boot_verifier(clp, &calldata->args.verifier); status = nfs4_init_uniform_client_string(clp); if (status) @@ -7563,9 +7561,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, task_setup_data.rpc_xprt = xprt; task_setup_data.flags = RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; - calldata->args.verifier = &clp->cl_confirm; - } else { - calldata->args.verifier = &verifier; + memcpy(calldata->args.verifier.data, clp->cl_confirm.data, + sizeof(calldata->args.verifier.data)); } calldata->args.client = clp; #ifdef CONFIG_NFS_V4_1_MIGRATION diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fa3eb361d4f8..37c8af003275 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1785,7 +1785,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, int len = 0; encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); - encode_nfs4_verifier(xdr, args->verifier); + encode_nfs4_verifier(xdr, &args->verifier); encode_string(xdr, strlen(args->client->cl_owner_id), args->client->cl_owner_id); -- cgit v1.2.3 From d9cb73300ac0001832c048a489c592263fc28193 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 16:02:48 -0400 Subject: NFSv4: Fix double frees in nfs4_test_session_trunk() rpc_clnt_add_xprt() expects the callback function to be synchronous, and expects to release the transport and switch references itself. Fixes: 04fa2c6bb51b1 ("NFS pnfs data server multipath session trunking") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4client.c | 3 --- fs/nfs/nfs4proc.c | 16 +++------------- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 50566acb5469..e9bea90dc017 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -660,9 +660,6 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope)) goto out_err; - /* Session trunking passed, add the xprt */ - rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt); - pr_info("NFS: %s: Session trunking succeeded for %s\n", clp->cl_hostname, xprt->address_strings[RPC_DISPLAY_ADDR]); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c458a1e28b91..3e46be8c9be1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7473,10 +7473,6 @@ static void nfs4_exchange_id_release(void *data) struct nfs41_exchange_id_data *cdata = (struct nfs41_exchange_id_data *)data; - if (cdata->xprt) { - xprt_put(cdata->xprt); - rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); - } nfs_put_client(cdata->args.client); kfree(cdata->res.impl_id); kfree(cdata->res.server_scope); @@ -7505,7 +7501,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, .rpc_client = clp->cl_rpcclient, .callback_ops = &nfs4_exchange_id_call_ops, .rpc_message = &msg, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_TIMEOUT, }; struct nfs41_exchange_id_data *calldata; struct rpc_task *task; @@ -7559,8 +7555,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (xprt) { calldata->xprt = xprt; task_setup_data.rpc_xprt = xprt; - task_setup_data.flags = - RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; + task_setup_data.flags |= RPC_TASK_SOFTCONN; memcpy(calldata->args.verifier.data, clp->cl_confirm.data, sizeof(calldata->args.verifier.data)); } @@ -7581,12 +7576,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (IS_ERR(task)) return PTR_ERR(task); - if (!xprt) { - status = rpc_wait_for_completion_task(task); - if (!status) - status = calldata->rpc_status; - } else /* session trunking test */ - status = calldata->rpc_status; + status = calldata->rpc_status; rpc_put_task(task); out: -- cgit v1.2.3 From 9d95aa4bada24be35bb94827a55e1d6e243d866e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 2 Aug 2017 13:32:15 -0700 Subject: userfaultfd_zeropage: return -ENOSPC in case mm has gone In the non-cooperative userfaultfd case, the process exit may race with outstanding mcopy_atomic called by the uffd monitor. Returning -ENOSPC instead of -EINVAL when mm is already gone will allow uffd monitor to distinguish this case from other error conditions. Unfortunately I overlooked userfaultfd_zeropage when updating userfaultd_copy(). Link: http://lkml.kernel.org/r/1501136819-21857-1-git-send-email-rppt@linux.vnet.ibm.com Fixes: 96333187ab162 ("userfaultfd_copy: return -ENOSPC in case mm has gone") Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Pavel Emelyanov Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cadcd12a3d35..2d8c2d848668 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1643,6 +1643,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, uffdio_zeropage.range.len); mmput(ctx->mm); + } else { + return -ENOSPC; } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; -- cgit v1.2.3 From 5a18b64e3f02125be1c0ef777501ae38aafe2a24 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 2 Aug 2017 13:32:24 -0700 Subject: userfaultfd: non-cooperative: flush event_wqh at release time There may still be threads waiting on event_wqh at the time the userfault file descriptor is closed. Flush the events wait-queue to prevent waiting threads from hanging. Link: http://lkml.kernel.org/r/1501398127-30419-1-git-send-email-rppt@linux.vnet.ibm.com Fixes: 9cd75c3cd4c3d ("userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor") Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Pavel Emelyanov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 2d8c2d848668..06ea26b8c996 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -854,6 +854,9 @@ wakeup: __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range); spin_unlock(&ctx->fault_pending_wqh.lock); + /* Flush pending events that may still wait on event_wqh */ + wake_up_all(&ctx->event_wqh); + wake_up_poll(&ctx->fd_wqh, POLLHUP); userfaultfd_ctx_put(ctx); return 0; -- cgit v1.2.3 From 19ec8e48582670c021e998b9deb88e39a842ff45 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 2 Aug 2017 13:32:30 -0700 Subject: ocfs2: don't clear SGID when inheriting ACLs When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of ocfs2_set_acl() into ocfs2_iop_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Also posix_acl_chmod() that is calling ocfs2_set_acl() takes care of updating mode itself. Fixes: 073931017b4 ("posix_acl: Clear SGID bit when setting file permissions") Link: http://lkml.kernel.org/r/20170801141252.19675-3-jack@suse.cz Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index dc22ba8c710f..e50a387959bf 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -240,18 +240,6 @@ int ocfs2_set_acl(handle_t *handle, switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - umode_t mode; - - ret = posix_acl_update_mode(inode, &mode, &acl); - if (ret) - return ret; - - ret = ocfs2_acl_set_mode(inode, di_bh, - handle, mode); - if (ret) - return ret; - } break; case ACL_TYPE_DEFAULT: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; @@ -289,7 +277,19 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); if (had_lock < 0) return had_lock; + if (type == ACL_TYPE_ACCESS && acl) { + umode_t mode; + + status = posix_acl_update_mode(inode, &mode, &acl); + if (status) + goto unlock; + + status = ocfs2_acl_set_mode(inode, bh, NULL, mode); + if (status) + goto unlock; + } status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); +unlock: ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; -- cgit v1.2.3 From 61c12b49e1c9c77d7a1bcc161de540d0fd21cf0c Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Wed, 12 Jul 2017 19:26:58 -0700 Subject: fuse: Dont call set_page_dirty_lock() for ITER_BVEC pages for async_dio Commit 8fba54aebbdf ("fuse: direct-io: don't dirty ITER_BVEC pages") fixes the ITER_BVEC page deadlock for direct io in fuse by checking in fuse_direct_io(), whether the page is a bvec page or not, before locking it. However, this check is missed when the "async_dio" mount option is enabled. In this case, set_page_dirty_lock() is called from the req->end callback in request_end(), when the fuse thread is returning from userspace to respond to the read request. This will cause the same deadlock because the bvec condition is not checked in this path. Here is the stack of the deadlocked thread, while returning from userspace: [13706.656686] INFO: task glusterfs:3006 blocked for more than 120 seconds. [13706.657808] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [13706.658788] glusterfs D ffffffff816c80f0 0 3006 1 0x00000080 [13706.658797] ffff8800d6713a58 0000000000000086 ffff8800d9ad7000 ffff8800d9ad5400 [13706.658799] ffff88011ffd5cc0 ffff8800d6710008 ffff88011fd176c0 7fffffffffffffff [13706.658801] 0000000000000002 ffffffff816c80f0 ffff8800d6713a78 ffffffff816c790e [13706.658803] Call Trace: [13706.658809] [] ? bit_wait_io_timeout+0x80/0x80 [13706.658811] [] schedule+0x3e/0x90 [13706.658813] [] schedule_timeout+0x1b5/0x210 [13706.658816] [] ? gup_pud_range+0x1db/0x1f0 [13706.658817] [] ? kvm_clock_read+0x1e/0x20 [13706.658819] [] ? kvm_clock_get_cycles+0x9/0x10 [13706.658822] [] ? ktime_get+0x52/0xc0 [13706.658824] [] io_schedule_timeout+0xa4/0x110 [13706.658826] [] bit_wait_io+0x36/0x50 [13706.658828] [] __wait_on_bit_lock+0x76/0xb0 [13706.658831] [] ? lock_request+0x46/0x70 [fuse] [13706.658834] [] __lock_page+0xaa/0xb0 [13706.658836] [] ? wake_atomic_t_function+0x40/0x40 [13706.658838] [] set_page_dirty_lock+0x58/0x60 [13706.658841] [] fuse_release_user_pages+0x58/0x70 [fuse] [13706.658844] [] ? fuse_aio_complete+0x190/0x190 [fuse] [13706.658847] [] fuse_aio_complete_req+0x29/0x90 [fuse] [13706.658849] [] request_end+0xd9/0x190 [fuse] [13706.658852] [] fuse_dev_do_write+0x336/0x490 [fuse] [13706.658854] [] fuse_dev_write+0x6e/0xa0 [fuse] [13706.658857] [] ? security_file_permission+0x23/0x90 [13706.658859] [] do_iter_readv_writev+0x60/0x90 [13706.658862] [] ? fuse_dev_splice_write+0x350/0x350 [fuse] [13706.658863] [] do_readv_writev+0x171/0x1f0 [13706.658866] [] ? try_to_wake_up+0x210/0x210 [13706.658868] [] vfs_writev+0x41/0x50 [13706.658870] [] SyS_writev+0x56/0xf0 [13706.658872] [] ? syscall_trace_leave+0xf1/0x160 [13706.658874] [] system_call_fastpath+0x12/0x71 Fix this by making should_dirty a fuse_io_priv parameter that can be checked in fuse_aio_complete_req(). Reported-by: Tiger Yang Signed-off-by: Ashish Samant Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 6 +++--- fs/fuse/fuse_i.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 76eac2a554c4..810ed4f99e38 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -609,7 +609,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) struct fuse_io_priv *io = req->io; ssize_t pos = -1; - fuse_release_user_pages(req, !io->write); + fuse_release_user_pages(req, io->should_dirty); if (io->write) { if (req->misc.write.in.size != req->misc.write.out.size) @@ -1316,7 +1316,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; - bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1346,6 +1345,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, inode_unlock(inode); } + io->should_dirty = !write && iter_is_iovec(iter); while (count) { size_t nres; fl_owner_t owner = current->files; @@ -1360,7 +1360,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, should_dirty); + fuse_release_user_pages(req, io->should_dirty); if (req->out.h.error) { err = req->out.h.error; break; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1bd7ffdad593..bd4d2a3e1ec1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -249,6 +249,7 @@ struct fuse_io_priv { size_t size; __u64 offset; bool write; + bool should_dirty; int err; struct kiocb *iocb; struct file *file; -- cgit v1.2.3 From ea7bd56fa309d10a41b1041827a63c0746c47554 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 2 Aug 2017 12:37:16 -0700 Subject: xfs: Fix leak of discard bio The bio describing discard operation is allocated by __blkdev_issue_discard() which returns us a reference to it. That reference is never released and thus we leak this bio. Drop the bio reference once it completes in xlog_discard_endio(). CC: stable@vger.kernel.org Fixes: 4560e78f40cb55bd2ea8f1ef4001c5baa88531c7 Signed-off-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index fbe72b134bef..43aa42a3a5d3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -539,6 +539,7 @@ xlog_discard_endio( INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); queue_work(xfs_discard_wq, &ctx->discard_endio_work); + bio_put(bio); } static void -- cgit v1.2.3 From 56bdf855e676f1f2ed7033f288f57dfd315725ba Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 3 Aug 2017 13:19:13 -0700 Subject: xfs: Fix per-inode DAX flag inheritance According to the commit that implemented per-inode DAX flag: commit 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") the flag is supposed to act as "inherit flag". Currently this only works in the situations where parent directory already has a flag in di_flags set, otherwise inheritance does not work. This is because setting the XFS_DIFLAG2_DAX flag is done in a wrong branch designated for di_flags, not di_flags2. Fix this by moving the code to branch designated for setting di_flags2, which does test for flags in di_flags2. Fixes: 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") Signed-off-by: Lukas Czerner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ceef77c0416a..ff48f0096810 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -874,7 +874,6 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint64_t di_flags2 = 0; uint di_flags = 0; if (S_ISDIR(mode)) { @@ -911,20 +910,23 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; ip->i_d.di_flags |= di_flags; - ip->i_d.di_flags2 |= di_flags2; } if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && pip->i_d.di_version == 3 && ip->i_d.di_version == 3) { + uint64_t di_flags2 = 0; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: -- cgit v1.2.3 From e45105772db41c5318b2a7ec1c420183183414e3 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sat, 5 Aug 2017 13:11:57 -0400 Subject: ext4: release discard bio after sending discard commands We've changed the discard command handling into parallel manner. But, in this change, I forgot decreasing the usage count of the bio which was used to send discard request. I'm sorry about that. Fixes: a015434480dc ("ext4: send parallel discards on commit completions") Signed-off-by: Daeho Jeong Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/mballoc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 581e357e8406..8779893d74e5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2892,8 +2892,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) break; } - if (discard_bio) + if (discard_bio) { submit_bio_wait(discard_bio); + bio_put(discard_bio); + } } list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) -- cgit v1.2.3 From fcf5ea10992fbac3c7473a1db33d56a139333cd1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 5 Aug 2017 17:43:24 -0400 Subject: ext4: fix SEEK_HOLE/SEEK_DATA for blocksize < pagesize ext4_find_unwritten_pgoff() does not properly handle a situation when starting index is in the middle of a page and blocksize < pagesize. The following command shows the bug on filesystem with 1k blocksize: xfs_io -f -c "falloc 0 4k" \ -c "pwrite 1k 1k" \ -c "pwrite 3k 1k" \ -c "seek -a -r 0" foo In this example, neither lseek(fd, 1024, SEEK_HOLE) nor lseek(fd, 2048, SEEK_DATA) will return the correct result. Fix the problem by neglecting buffers in a page before starting offset. Reported-by: Andreas Gruenbacher Signed-off-by: Theodore Ts'o Signed-off-by: Jan Kara CC: stable@vger.kernel.org # 3.8+ --- fs/ext4/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 58294c9a7e1d..0d7cf0cc9b87 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -537,6 +537,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, lastoff = page_offset(page); bh = head = page_buffers(page); do { + if (lastoff + bh->b_size <= startoff) + goto next; if (buffer_uptodate(bh) || buffer_unwritten(bh)) { if (whence == SEEK_DATA) @@ -551,6 +553,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); goto out; } +next: lastoff += bh->b_size; bh = bh->b_this_page; } while (bh != head); -- cgit v1.2.3 From 381cebfe72e17e495281c07c976797f5a397515a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 5 Aug 2017 19:00:31 -0400 Subject: ext4: silence array overflow warning I get a static checker warning: fs/ext4/ext4.h:3091 ext4_set_de_type() error: buffer overflow 'ext4_type_by_mode' 15 <= 15 It seems unlikely that we would hit this read overflow in real life, but it's also simple enough to make the array 16 bytes instead of 15. Signed-off-by: Dan Carpenter Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index dcbcd9d444d1..2d6d4c952492 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3072,7 +3072,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define S_SHIFT 12 -static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, -- cgit v1.2.3 From c7414892067204fcb8f8ebb4309d0fdd8c7242fe Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Sat, 5 Aug 2017 19:47:34 -0400 Subject: ext4: fix dir_nlink behaviour The dir_nlink feature has been enabled by default for new ext4 filesystems since e2fsprogs-1.41 in 2008, and was automatically enabled by the kernel for older ext4 filesystems since the dir_nlink feature was added with ext4 in kernel 2.6.28+ when the subdirectory count exceeded EXT4_LINK_MAX-1. Automatically adding the file system features such as dir_nlink is generally frowned upon, since it could cause the file system to not be mountable on older kernel, thus preventing the administrator from rolling back to an older kernel if necessary. In this case, the administrator might also want to disable the feature because glibc's fts_read() function does not correctly optimize directory traversal for directories that use st_nlinks field of 1 to indicate that the number of links in the directory are not tracked by the file system, and could fail to traverse the full directory hierarchy. Fortunately, in the past ten years very few users have complained about incomplete file system traversal by glibc's fts_read(). This commit also changes ext4_inc_count() to allow i_nlinks to reach the full EXT4_LINK_MAX links on the parent directory (including "." and "..") before changing i_links_count to be 1. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196405 Signed-off-by: Andreas Dilger Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/namei.c | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2d6d4c952492..d50229e92c55 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2020,7 +2020,8 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) -#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) /* Legal values for the dx_root hash_version field: */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 13f0cadb1238..cc986b824181 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2395,19 +2395,22 @@ out: } /* - * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, - * since this indicates that nlinks count was previously 1. + * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2 + * since this indicates that nlinks count was previously 1 to avoid overflowing + * the 16-bit i_links_count field on disk. Directories with i_nlink == 1 mean + * that subdirectory link counts are not being maintained accurately. + * + * The caller has already checked for i_nlink overflow in case the DIR_LINK + * feature is not enabled and returned -EMLINK. The is_dx() check is a proxy + * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set + * on regular files) and to avoid creating huge/slow non-HTREE directories. */ static void ext4_inc_count(handle_t *handle, struct inode *inode) { inc_nlink(inode); - if (is_dx(inode) && inode->i_nlink > 1) { - /* limit is 16-bit i_links_count */ - if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - set_nlink(inode, 1); - ext4_set_feature_dir_nlink(inode->i_sb); - } - } + if (is_dx(inode) && + (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) + set_nlink(inode, 1); } /* -- cgit v1.2.3 From 2df2c3402fc81918a888e1ec711369f6014471f2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 5 Aug 2017 21:57:46 -0400 Subject: ext4: fix warning about stack corruption After commit 62d1034f53e3 ("fortify: use WARN instead of BUG for now"), we get a warning about possible stack overflow from a memcpy that was not strictly bounded to the size of the local variable: inlined from 'ext4_mb_seq_groups_show' at fs/ext4/mballoc.c:2322:2: include/linux/string.h:309:9: error: '__builtin_memcpy': writing between 161 and 1116 bytes into a region of size 160 overflows the destination [-Werror=stringop-overflow=] We actually had a bug here that would have been found by the warning, but it was already fixed last year in commit 30a9d7afe70e ("ext4: fix stack memory corruption with 64k block size"). This replaces the fixed-length structure on the stack with a variable-length structure, using the correct upper bound that tells the compiler that everything is really fine here. I also change the loop count to check for the same upper bound for consistency, but the existing code is already correct here. Note that while clang won't allow certain kinds of variable-length arrays in structures, this particular instance is fine, as the array is at the end of the structure, and the size is strictly bounded. Signed-off-by: Arnd Bergmann Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8779893d74e5..5a1052627a81 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2295,9 +2295,12 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) int err, buddy_loaded = 0; struct ext4_buddy e4b; struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, + sb->s_blocksize_bits, + EXT4_MAX_BLOCK_LOG_SIZE); struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; + ext4_grpblk_t counters[blocksize_bits + 2]; } sg; group--; @@ -2306,8 +2309,6 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); - i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + - sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { @@ -2319,7 +2320,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) buddy_loaded = 1; } - memcpy(&sg, ext4_get_group_info(sb, group), i); + memcpy(&sg, ext4_get_group_info(sb, group), sizeof(sg)); if (buddy_loaded) ext4_mb_unload_buddy(&e4b); @@ -2327,7 +2328,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) - seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? sg.info.bb_counters[i] : 0); seq_printf(seq, " ]\n"); -- cgit v1.2.3 From 77a2e84d51729da7957d19283523af7b571d61f6 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sat, 5 Aug 2017 22:15:45 -0400 Subject: ext4: remove unused mode parameter ext4_alloc_file_blocks() does not use its mode parameter. Remove it. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e0a8425ff74d..230ca74c4841 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4652,7 +4652,7 @@ retry: static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ext4_lblk_t len, loff_t new_size, - int flags, int mode) + int flags) { struct inode *inode = file_inode(file); handle_t *handle; @@ -4815,7 +4815,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits) >> blkbits, (round_up((offset + len), 1 << blkbits) - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags, mode); + new_size, flags); if (ret) goto out_dio; @@ -4841,7 +4841,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + flags); up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; @@ -4976,8 +4976,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; -- cgit v1.2.3 From ec00022030da5761518476096626338bd67df57a Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sat, 5 Aug 2017 22:41:42 -0400 Subject: ext4: inplace xattr block update fails to deduplicate blocks When an xattr block has a single reference, block is updated inplace and it is reinserted to the cache. Later, a cache lookup is performed to see whether an existing block has the same contents. This cache lookup will most of the time return the just inserted entry so deduplication is not achieved. Running the following test script will produce two xattr blocks which can be observed in "File ACL: " line of debugfs output: mke2fs -b 1024 -I 128 -F -O extent /dev/sdb 1G mount /dev/sdb /mnt/sdb touch /mnt/sdb/{x,y} setfattr -n user.1 -v aaa /mnt/sdb/x setfattr -n user.2 -v bbb /mnt/sdb/x setfattr -n user.1 -v aaa /mnt/sdb/y setfattr -n user.2 -v bbb /mnt/sdb/y debugfs -R 'stat x' /dev/sdb | cat debugfs -R 'stat y' /dev/sdb | cat This patch defers the reinsertion to the cache so that we can locate other blocks with the same contents. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/xattr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index de217a094733..4025666c5991 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1816,9 +1816,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); - if (!error) - ext4_xattr_block_cache_insert(ea_block_cache, - bs->bh); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -1974,6 +1971,7 @@ inserted: } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); + ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { -- cgit v1.2.3 From 9699d4f91d9bd2f70dcc37afe3c9f18145ab2dba Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sun, 6 Aug 2017 00:07:01 -0400 Subject: ext4: make xattr inode reads faster ext4_xattr_inode_read() currently reads each block sequentially while waiting for io operation to complete before moving on to the next block. This prevents request merging in block layer. Add a ext4_bread_batch() function that starts reads for all blocks then optionally waits for them to complete. A similar logic is used in ext4_find_entry(), so update that code to use the new function. Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 43 ++++++++++++++----------------------------- fs/ext4/xattr.c | 51 ++++++++++++++++++++++++++++++++------------------- 4 files changed, 92 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d50229e92c55..a2bb7d2870e4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2461,6 +2461,8 @@ extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2e6c02258ee2..56bca45bcdf4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1015,6 +1015,50 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return ERR_PTR(-EIO); } +/* Read a contiguous batch of blocks. */ +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs) +{ + int i, err; + + for (i = 0; i < bh_count; i++) { + bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); + if (IS_ERR(bhs[i])) { + err = PTR_ERR(bhs[i]); + bh_count = i; + goto out_brelse; + } + } + + for (i = 0; i < bh_count; i++) + /* Note that NULL bhs[i] is valid because of holes. */ + if (bhs[i] && !buffer_uptodate(bhs[i])) + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, + &bhs[i]); + + if (!wait) + return 0; + + for (i = 0; i < bh_count; i++) + if (bhs[i]) + wait_on_buffer(bhs[i]); + + for (i = 0; i < bh_count; i++) { + if (bhs[i] && !buffer_uptodate(bhs[i])) { + err = -EIO; + goto out_brelse; + } + } + return 0; + +out_brelse: + for (i = 0; i < bh_count; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + return err; +} + int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *head, unsigned from, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cc986b824181..c1cf020d1889 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1342,13 +1342,12 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; - ext4_lblk_t start, block, b; + ext4_lblk_t start, block; const u8 *name = d_name->name; - int ra_max = 0; /* Number of bh's in the readahead + size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ - int ra_ptr = 0; /* Current index into readahead + size_t ra_ptr = 0; /* Current index into readahead buffer */ - int num = 0; ext4_lblk_t nblocks; int i, namelen, retval; struct ext4_filename fname; @@ -1411,31 +1410,17 @@ restart: if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; - b = block; - for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { - /* - * Terminate if we reach the end of the - * directory and must wrap, or if our - * search has finished at this block. - */ - if (b >= nblocks || (num && block == start)) { - bh_use[ra_max] = NULL; - break; - } - num++; - bh = ext4_getblk(NULL, dir, b++, 0); - if (IS_ERR(bh)) { - if (ra_max == 0) { - ret = bh; - goto cleanup_and_exit; - } - break; - } - bh_use[ra_max] = bh; - if (bh) - ll_rw_block(REQ_OP_READ, - REQ_META | REQ_PRIO, - 1, &bh); + if (block < start) + ra_max = start - block; + else + ra_max = nblocks - block; + ra_max = min(ra_max, ARRAY_SIZE(bh_use)); + retval = ext4_bread_batch(dir, block, ra_max, + false /* wait */, bh_use); + if (retval) { + ret = ERR_PTR(retval); + ra_max = 0; + goto cleanup_and_exit; } } if ((bh = bh_use[ra_ptr++]) == NULL) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 4025666c5991..5fa912e5d2a6 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -317,28 +317,41 @@ static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { - unsigned long block = 0; - struct buffer_head *bh; - int blocksize = ea_inode->i_sb->s_blocksize; - size_t csize, copied = 0; - void *copy_pos = buf; - - while (copied < size) { - csize = (size - copied) > blocksize ? blocksize : size - copied; - bh = ext4_bread(NULL, ea_inode, block, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); - if (!bh) - return -EFSCORRUPTED; + int blocksize = 1 << ea_inode->i_blkbits; + int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; + int tail_size = (size % blocksize) ?: blocksize; + struct buffer_head *bhs_inline[8]; + struct buffer_head **bhs = bhs_inline; + int i, ret; + + if (bh_count > ARRAY_SIZE(bhs_inline)) { + bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); + if (!bhs) + return -ENOMEM; + } - memcpy(copy_pos, bh->b_data, csize); - brelse(bh); + ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, + true /* wait */, bhs); + if (ret) + goto free_bhs; - copy_pos += csize; - block += 1; - copied += csize; + for (i = 0; i < bh_count; i++) { + /* There shouldn't be any holes in ea_inode. */ + if (!bhs[i]) { + ret = -EFSCORRUPTED; + goto put_bhs; + } + memcpy((char *)buf + blocksize * i, bhs[i]->b_data, + i < bh_count - 1 ? blocksize : tail_size); } - return 0; + ret = 0; +put_bhs: + for (i = 0; i < bh_count; i++) + brelse(bhs[i]); +free_bhs: + if (bhs != bhs_inline) + kfree(bhs); + return ret; } static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, -- cgit v1.2.3 From 3b10fdc6d8bd048f4fb14af5eda2051ace7b8b16 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Sun, 6 Aug 2017 00:27:38 -0400 Subject: ext4: fix forgetten xattr lock protection in ext4_expand_extra_isize We should avoid the contention between the i_extra_isize update and the inline data insertion, so move the xattr trylock in front of i_extra_isize update. Signed-off-by: Miao Xie Reviewed-by: Wang Shilong --- fs/ext4/inode.c | 18 ++++++++++++++++-- fs/ext4/xattr.c | 10 ---------- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 56bca45bcdf4..5dabbf276651 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5713,10 +5713,15 @@ static int ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; + int no_expand; + int error; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; + if (ext4_write_trylock_xattr(inode, &no_expand) == 0) + return 0; + raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); @@ -5728,12 +5733,21 @@ static int ext4_expand_extra_isize(struct inode *inode, EXT4_I(inode)->i_extra_isize, 0, new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; + ext4_write_unlock_xattr(inode, &no_expand); return 0; } /* try to expand with EAs present */ - return ext4_expand_extra_isize_ea(inode, new_extra_isize, - raw_inode, handle); + error = ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); + if (error) { + /* + * Inode size expansion failed; don't try again + */ + no_expand = 1; + } + ext4_write_unlock_xattr(inode, &no_expand); + return error; } /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5fa912e5d2a6..862ba3891398 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2645,10 +2645,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ - int no_expand; - - if (ext4_write_trylock_xattr(inode, &no_expand) == 0) - return 0; retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; @@ -2731,16 +2727,10 @@ shift: EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: - ext4_write_unlock_xattr(inode, &no_expand); return 0; cleanup: brelse(bh); - /* - * Inode size expansion failed; don't try again - */ - no_expand = 1; - ext4_write_unlock_xattr(inode, &no_expand); return error; } -- cgit v1.2.3 From cf0a5e818fe216dbdf5da4e829e157d27ebfc8a4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Sun, 6 Aug 2017 00:40:01 -0400 Subject: ext4: restructure ext4_expand_extra_isize Current ext4_expand_extra_isize just tries to expand extra isize, if someone is holding xattr lock or some check fails, it will give up. So rename its name to ext4_try_to_expand_extra_isize. Besides that, we clean up unnecessary check and move some relative checks into it. Signed-off-by: Miao Xie Signed-off-by: Theodore Ts'o Reviewed-by: Wang Shilong --- fs/ext4/inode.c | 67 ++++++++++++++++++++++++--------------------------------- fs/ext4/xattr.c | 9 +++++++- 2 files changed, 36 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5dabbf276651..713b67e85f73 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5706,21 +5706,35 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, * Expand an inode by new_extra_isize bytes. * Returns 0 on success or negative error number on failure. */ -static int ext4_expand_extra_isize(struct inode *inode, - unsigned int new_extra_isize, - struct ext4_iloc iloc, - handle_t *handle) +static int ext4_try_to_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; int no_expand; int error; - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) - return 0; + if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) + return -EOVERFLOW; + + /* + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if (ext4_handle_valid(handle) && + jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0) + return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) - return 0; + return -EBUSY; raw_inode = ext4_raw_inode(&iloc); @@ -5747,6 +5761,7 @@ static int ext4_expand_extra_isize(struct inode *inode, no_expand = 1; } ext4_write_unlock_xattr(inode, &no_expand); + return error; } @@ -5767,44 +5782,18 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - static unsigned int mnt_count; - int err, ret; + int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) return err; - if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { - /* - * In nojournal mode, we can immediately attempt to expand - * the inode. When journaled, we first need to obtain extra - * buffer credits since we may write into the EA block - * with this same handle. If journal_extend fails, then it will - * only result in a minor loss of functionality for that inode. - * If this is felt to be critical, then e2fsck should be run to - * force a large enough s_min_extra_isize. - */ - if (!ext4_handle_valid(handle) || - jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { - ret = ext4_expand_extra_isize(inode, - sbi->s_want_extra_isize, - iloc, handle); - if (ret) { - if (mnt_count != - le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, - "Unable to expand inode %lu. Delete" - " some EAs or run e2fsck.", - inode->i_ino); - mnt_count = - le16_to_cpu(sbi->s_es->s_mnt_count); - } - } - } - } + + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) + ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, + iloc, handle); + return ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 862ba3891398..7f5f4b63782b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2638,12 +2638,14 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, { struct ext4_xattr_ibody_header *header; struct buffer_head *bh = NULL; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; size_t min_offs; size_t ifree, bfree; int total_ino; void *base, *end; int error = 0, tried_min_extra_isize = 0; - int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ retry: @@ -2731,6 +2733,11 @@ out: cleanup: brelse(bh); + if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", + inode->i_ino); + mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); + } return error; } -- cgit v1.2.3 From b640b2c51b26459fc08f2185a385495b0f509a80 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Sun, 6 Aug 2017 00:55:48 -0400 Subject: ext4: cleanup ext4_expand_extra_isize_ea() Clean up some goto statement, make ext4_expand_extra_isize_ea() clearer. Signed-off-by: Miao Xie Signed-off-by: Theodore Ts'o Reviewed-by: Wang Shilong --- fs/ext4/xattr.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7f5f4b63782b..82a5af9f6668 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2637,7 +2637,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; - struct buffer_head *bh = NULL; + struct buffer_head *bh; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); static unsigned int mnt_count; size_t min_offs; @@ -2651,7 +2651,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) - goto out; + return 0; header = IHDR(inode, raw_inode); @@ -2686,6 +2686,7 @@ retry: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EFSCORRUPTED; + brelse(bh); goto cleanup; } base = BHDR(bh); @@ -2693,11 +2694,11 @@ retry: min_offs = end - base; bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, NULL); + brelse(bh); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; - brelse(bh); goto retry; } error = -ENOSPC; @@ -2715,7 +2716,6 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; - brelse(bh); goto retry; } goto cleanup; @@ -2727,13 +2727,9 @@ shift: EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; - brelse(bh); -out: - return 0; cleanup: - brelse(bh); - if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { + if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", inode->i_ino); mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); -- cgit v1.2.3 From c03b45b853f5829816d871283c792e7527a7ded1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Sun, 6 Aug 2017 01:00:49 -0400 Subject: ext4, project: expand inode extra size if possible When upgrading from old format, try to set project id to old file first time, it will return EOVERFLOW, but if that file is dirtied(touch etc), changing project id will be allowed, this might be confusing for users, we could try to expand @i_extra_isize here too. Reported-by: Zhang Yi Signed-off-by: Miao Xie Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.h | 3 ++ fs/ext4/inode.c | 97 +++++++++++++++++++++++++++++++++++++++++------------ fs/ext4/ioctl.c | 9 +++-- 3 files changed, 85 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index dabad1bc8617..48143e32411c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -227,6 +227,9 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); +int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc); /* * Wrapper functions with which ext4 calls into JBD. */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 713b67e85f73..c774bdc22759 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5702,6 +5702,42 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, return err; } +static int __ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc, + handle_t *handle, int *no_expand) +{ + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + int error; + + raw_inode = ext4_raw_inode(iloc); + + header = IHDR(inode, raw_inode); + + /* No extended attributes present */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize, 0, + new_extra_isize - EXT4_I(inode)->i_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + return 0; + } + + /* try to expand with EAs present */ + error = ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); + if (error) { + /* + * Inode size expansion failed; don't try again + */ + *no_expand = 1; + } + + return error; +} + /* * Expand an inode by new_extra_isize bytes. * Returns 0 on success or negative error number on failure. @@ -5711,8 +5747,6 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode, struct ext4_iloc iloc, handle_t *handle) { - struct ext4_inode *raw_inode; - struct ext4_xattr_ibody_header *header; int no_expand; int error; @@ -5736,32 +5770,53 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode, if (ext4_write_trylock_xattr(inode, &no_expand) == 0) return -EBUSY; - raw_inode = ext4_raw_inode(&iloc); + error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, + handle, &no_expand); + ext4_write_unlock_xattr(inode, &no_expand); - header = IHDR(inode, raw_inode); + return error; +} - /* No extended attributes present */ - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { - memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + - EXT4_I(inode)->i_extra_isize, 0, - new_extra_isize - EXT4_I(inode)->i_extra_isize); - EXT4_I(inode)->i_extra_isize = new_extra_isize; - ext4_write_unlock_xattr(inode, &no_expand); - return 0; +int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc) +{ + handle_t *handle; + int no_expand; + int error, rc; + + if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + brelse(iloc->bh); + return -EOVERFLOW; } - /* try to expand with EAs present */ - error = ext4_expand_extra_isize_ea(inode, new_extra_isize, - raw_inode, handle); + handle = ext4_journal_start(inode, EXT4_HT_INODE, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + brelse(iloc->bh); + return error; + } + + ext4_write_lock_xattr(inode, &no_expand); + + BUFFER_TRACE(iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { - /* - * Inode size expansion failed; don't try again - */ - no_expand = 1; + brelse(iloc->bh); + goto out_stop; } - ext4_write_unlock_xattr(inode, &no_expand); + error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, + handle, &no_expand); + + rc = ext4_mark_iloc_dirty(handle, inode, iloc); + if (!error) + error = rc; + + ext4_write_unlock_xattr(inode, &no_expand); +out_stop: + ext4_journal_stop(handle); return error; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 4f4a8391585c..afb66d4ab5cf 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -349,11 +349,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { - err = -EOVERFLOW; + err = ext4_expand_extra_isize(inode, + EXT4_SB(sb)->s_want_extra_isize, + &iloc); + if (err) + goto out_unlock; + } else { brelse(iloc.bh); - goto out_unlock; } - brelse(iloc.bh); dquot_initialize(inode); -- cgit v1.2.3 From aec51758ce10a9c847a62a48a168f8c804c6e053 Mon Sep 17 00:00:00 2001 From: Jerry Lee Date: Sun, 6 Aug 2017 01:18:31 -0400 Subject: ext4: fix overflow caused by missing cast in ext4_resize_fs() On a 32-bit platform, the value of n_blcoks_count may be wrong during the file system is resized to size larger than 2^32 blocks. This may caused the superblock being corrupted with zero blocks count. Fixes: 1c6bd7173d66 Signed-off-by: Jerry Lee Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org # 3.7+ --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c3ed9021b781..035cd3f4785e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1927,7 +1927,8 @@ retry: n_desc_blocks = o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); - n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_blocks_count = (ext4_fsblk_t)n_group * + EXT4_BLOCKS_PER_GROUP(sb); n_group--; /* set to last group number */ } -- cgit v1.2.3 From 4e5620132144666ab41fc2799fbc055830187b7e Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Sun, 6 Aug 2017 01:33:07 -0400 Subject: ext4: fix copy paste error in ext4_swap_extents() This bug was found by a static code checker tool for copy paste problems. Signed-off-by: Maninder Singh Signed-off-by: Vaneet Narang Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 230ca74c4841..97f0fd06728d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5836,7 +5836,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, if (e1_blk > lblk1) next1 = e1_blk; if (e2_blk > lblk2) - next2 = e1_blk; + next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) goto finish; -- cgit v1.2.3 From 1feb26162bee7b2f110facfec71b5c7bdbc7d14d Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 1 Aug 2017 16:25:01 -0400 Subject: nfs/flexfiles: fix leak of nfs4_ff_ds_version arrays The client was freeing the nfs4_ff_layout_ds, but not the contained nfs4_ff_ds_version array. Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org # v4.0+ Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 6df7a0cf5660..f32c58bbe556 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -32,6 +32,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); + kfree(mirror_ds->ds_versions); kfree_rcu(mirror_ds, id_node.rcu); } -- cgit v1.2.3 From c0ca0e5934b3bd70d456b363a6a989a3b4a692f1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Aug 2017 21:39:28 -0400 Subject: NFSv4: Ignore NFS4ERR_OLD_STATEID in nfs41_check_open_stateid() If the call to TEST_STATEID returns NFS4ERR_OLD_STATEID, then it just means we raced with other calls to OPEN. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3e46be8c9be1..cbf9cdd48a3b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2553,9 +2553,8 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) clear_bit(NFS_O_RDWR_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); stateid->type = NFS4_INVALID_STATEID_TYPE; - } - if (status != NFS_OK) return status; + } if (nfs_open_stateid_recover_openmode(state)) return -NFS4ERR_OPENMODE; return NFS_OK; -- cgit v1.2.3 From d507e2ebd2c7be9138e5cf5c0cb1931c90c42ab1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 10 Aug 2017 15:23:31 -0700 Subject: mm: fix global NR_SLAB_.*CLAIMABLE counter reads As Tetsuo points out: "Commit 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") broke "Slab:" field of /proc/meminfo . It shows nearly 0kB" In addition to /proc/meminfo, this problem also affects the slab counters OOM/allocation failure info dumps, can cause early -ENOMEM from overcommit protection, and miscalculate image size requirements during suspend-to-disk. This is because the patch in question switched the slab counters from the zone level to the node level, but forgot to update the global accessor functions to read the aggregate node data instead of the aggregate zone data. Use global_node_page_state() to access the global slab counters. Fixes: 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") Link: http://lkml.kernel.org/r/20170801134256.5400-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Josef Bacik Cc: Vladimir Davydov Cc: Stefan Agner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8a428498d6b2..509a61668d90 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -106,13 +106,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); show_val_kb(m, "Slab: ", - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)); + global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_SLAB_UNRECLAIMABLE)); show_val_kb(m, "SReclaimable: ", - global_page_state(NR_SLAB_RECLAIMABLE)); + global_node_page_state(NR_SLAB_RECLAIMABLE)); show_val_kb(m, "SUnreclaim: ", - global_page_state(NR_SLAB_UNRECLAIMABLE)); + global_node_page_state(NR_SLAB_UNRECLAIMABLE)); seq_printf(m, "KernelStack: %8lu kB\n", global_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", -- cgit v1.2.3 From b3a81d0841a954a3d3e782059960f53e63b37066 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 10 Aug 2017 15:24:15 -0700 Subject: mm: fix KSM data corruption Nadav reported KSM can corrupt the user data by the TLB batching race[1]. That means data user written can be lost. Quote from Nadav Amit: "For this race we need 4 CPUs: CPU0: Caches a writable and dirty PTE entry, and uses the stale value for write later. CPU1: Runs madvise_free on the range that includes the PTE. It would clear the dirty-bit. It batches TLB flushes. CPU2: Writes 4 to /proc/PID/clear_refs , clearing the PTEs soft-dirty. We care about the fact that it clears the PTE write-bit, and of course, batches TLB flushes. CPU3: Runs KSM. Our purpose is to pass the following test in write_protect_page(): if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) Since it will avoid TLB flush. And we want to do it while the PTE is stale. Later, and before replacing the page, we would be able to change the page. Note that all the operations the CPU1-3 perform canhappen in parallel since they only acquire mmap_sem for read. We start with two identical pages. Everything below regards the same page/PTE. CPU0 CPU1 CPU2 CPU3 ---- ---- ---- ---- Write the same value on page [cache PTE as dirty in TLB] MADV_FREE pte_mkclean() 4 > clear_refs pte_wrprotect() write_protect_page() [ success, no flush ] pages_indentical() [ ok ] Write to page different value [Ok, using stale PTE] replace_page() Later, CPU1, CPU2 and CPU3 would flush the TLB, but that is too late. CPU0 already wrote on the page, but KSM ignored this write, and it got lost" In above scenario, MADV_FREE is fixed by changing TLB batching API including [set|clear]_tlb_flush_pending. Remained thing is soft-dirty part. This patch changes soft-dirty uses TLB batching API instead of flush_tlb_mm and KSM checks pending TLB flush by using mm_tlb_flush_pending so that it will flush TLB to avoid data lost if there are other parallel threads pending TLB flush. [1] http://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com Link: http://lkml.kernel.org/r/20170802000818.4760-8-namit@vmware.com Signed-off-by: Minchan Kim Signed-off-by: Nadav Amit Reported-by: Nadav Amit Tested-by: Nadav Amit Reviewed-by: Andrea Arcangeli Cc: Mel Gorman Cc: Hugh Dickins Cc: "David S. Miller" Cc: Andy Lutomirski Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Nadav Amit Cc: Rik van Riel Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b836fd61ed87..fe8f3265e877 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -16,9 +16,10 @@ #include #include #include +#include #include -#include +#include #include #include "internal.h" @@ -1008,6 +1009,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + struct mmu_gather tlb; int itype; int rv; @@ -1054,6 +1056,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } down_read(&mm->mmap_sem); + tlb_gather_mmu(&tlb, mm, 0, -1); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!(vma->vm_flags & VM_SOFTDIRTY)) @@ -1075,7 +1078,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(mm, 0, -1); - flush_tlb_mm(mm); + tlb_finish_mmu(&tlb, 0, -1); up_read(&mm->mmap_sem); out_mm: mmput(mm); -- cgit v1.2.3 From e86b298bebf7e799e4b7232e9135799f1947552e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 10 Aug 2017 15:24:32 -0700 Subject: userfaultfd: replace ENOSPC with ESRCH in case mm has gone during copy/zeropage When the process exit races with outstanding mcopy_atomic, it would be better to return ESRCH error. When such race occurs the process and it's mm are going away and returning "no such process" to the uffd monitor seems better fit than ENOSPC. Link: http://lkml.kernel.org/r/1502111545-32305-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Pavel Emelyanov Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 06ea26b8c996..b0d5897bc4e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1600,7 +1600,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, uffdio_copy.len); mmput(ctx->mm); } else { - return -ENOSPC; + return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; @@ -1647,7 +1647,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, uffdio_zeropage.range.len); mmput(ctx->mm); } else { - return -ENOSPC; + return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; -- cgit v1.2.3 From 9183976ef1c858c289b09066fd57aae51b86653c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 25 May 2017 06:57:50 -0400 Subject: fuse: set mapping error in writepage_locked when it fails This ensures that we see errors on fsync when writeback fails. Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 810ed4f99e38..ab60051be6e5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1669,6 +1669,7 @@ err_nofile: err_free: fuse_request_free(req); err: + mapping_set_error(page->mapping, error); end_page_writeback(page); return error; } -- cgit v1.2.3 From 8a9d6e964d318533ba3d2901ce153ba317c99a89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 5 Aug 2017 10:59:14 +0200 Subject: pnfs/blocklayout: require 64-bit sector_t The blocklayout code does not compile cleanly for a 32-bit sector_t, and also has no reliable checks for devices sizes, which makes it unsafe to use with a kernel that doesn't support large block devices. Signed-off-by: Christoph Hellwig Reported-by: Arnd Bergmann Fixes: 5c83746a0cf2 ("pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing") Signed-off-by: Anna Schumaker --- fs/nfs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 69d02cf8cf37..5f93cfacb3d1 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -121,6 +121,7 @@ config PNFS_FILE_LAYOUT config PNFS_BLOCK tristate depends on NFS_V4_1 && BLK_DEV_DM + depends on 64BIT || LBDAF default NFS_V4 config PNFS_FLEXFILE_LAYOUT -- cgit v1.2.3