From 00c94ebec5925593c0377b941289224469e72ac7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Apr 2022 18:04:27 -0400 Subject: NFSv4: Don't invalidate inode attributes on delegation return There is no need to declare attributes such as the ctime, mtime and block size invalid when we're just returning a delegation, so it is inappropriate to call nfs_post_op_update_inode_force_wcc(). Instead, just call nfs_refresh_inode() after faking up the change attribute. We know that the GETATTR op occurs before the DELEGRETURN, so we are safe when doing this. Fixes: 0bc2c9b4dca9 ("NFSv4: Don't discard the attributes returned by asynchronous DELEGRETURN") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 16106f805ffa..a79f66432bd3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -363,6 +363,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) { + fattr->pre_change_attr = version; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } +} + static void nfs4_test_and_free_stateid(struct nfs_server *server, nfs4_stateid *stateid, const struct cred *cred) @@ -6553,7 +6561,9 @@ static void nfs4_delegreturn_release(void *calldata) pnfs_roc_release(&data->lr.arg, &data->lr.res, data->res.lr_ret); if (inode) { - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + nfs4_fattr_set_prechange(&data->fattr, + inode_peek_iversion_raw(inode)); + nfs_refresh_inode(inode, &data->fattr); nfs_iput_and_deactive(inode); } kfree(calldata); -- cgit v1.2.3 From e6f9d69648029e48b8f97db09368d419b5e2614a Mon Sep 17 00:00:00 2001 From: Chung-Chiang Cheng Date: Fri, 15 Apr 2022 16:04:05 +0800 Subject: btrfs: export a helper for compression hard check inode_can_compress will be used outside of inode.c to check the availability of setting compression flag by xattr. This patch moves this function as an internal helper and renames it to btrfs_inode_can_compress. Reviewed-by: Nikolay Borisov Signed-off-by: Chung-Chiang Cheng Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 11 +++++++++++ fs/btrfs/inode.c | 15 ++------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 47e72d72f7d0..32131a5d321b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -384,6 +384,17 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) return ret; } +/* + * Check if the inode has flags compatible with compression + */ +static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} + struct btrfs_dio_private { struct inode *inode; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bac68d8e96f..673b9259f6c0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -480,17 +480,6 @@ static noinline int add_async_extent(struct async_chunk *cow, return 0; } -/* - * Check if the inode has flags compatible with compression - */ -static inline bool inode_can_compress(struct btrfs_inode *inode) -{ - if (inode->flags & BTRFS_INODE_NODATACOW || - inode->flags & BTRFS_INODE_NODATASUM) - return false; - return true; -} - /* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. @@ -500,7 +489,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, { struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (!inode_can_compress(inode)) { + if (!btrfs_inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(inode)); @@ -2019,7 +2008,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); - } else if (!inode_can_compress(inode) || + } else if (!btrfs_inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) ret = run_delalloc_zoned(inode, locked_page, start, end, -- cgit v1.2.3 From 0e852ab8974cd2b5946766b2d9baf82c78ace03d Mon Sep 17 00:00:00 2001 From: Chung-Chiang Cheng Date: Fri, 15 Apr 2022 16:04:06 +0800 Subject: btrfs: do not allow compression on nodatacow files Compression and nodatacow are mutually exclusive. A similar issue was fixed by commit f37c563bab429 ("btrfs: add missing check for nocow and compression inode flags"). Besides ioctl, there is another way to enable/disable/reset compression directly via xattr. The following steps will result in a invalid combination. $ touch bar $ chattr +C bar $ lsattr bar ---------------C-- bar $ setfattr -n btrfs.compression -v zstd bar $ lsattr bar --------c------C-- bar To align with the logic in check_fsflags, nocompress will also be unacceptable after this patch, to prevent mix any compression-related options with nodatacow. $ touch bar $ chattr +C bar $ lsattr bar ---------------C-- bar $ setfattr -n btrfs.compression -v zstd bar setfattr: bar: Invalid argument $ setfattr -n btrfs.compression -v no bar setfattr: bar: Invalid argument When both compression and nodatacow are enabled, then btrfs_run_delalloc_range prefers nodatacow and no compression happens. Reported-by: Jayce Lin CC: stable@vger.kernel.org # 5.10.x: e6f9d6964802: btrfs: export a helper for compression hard check CC: stable@vger.kernel.org # 5.10.x Reviewed-by: Filipe Manana Signed-off-by: Chung-Chiang Cheng Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/props.c | 16 +++++++++++----- fs/btrfs/props.h | 3 ++- fs/btrfs/xattr.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 1a6d2d5b4b33..5a6f87744c28 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -17,7 +17,8 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); struct prop_handler { struct hlist_node node; const char *xattr_name; - int (*validate)(const char *value, size_t len); + int (*validate)(const struct btrfs_inode *inode, const char *value, + size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); const char *(*extract)(struct inode *inode); int inheritable; @@ -55,7 +56,8 @@ find_prop_handler(const char *name, return NULL; } -int btrfs_validate_prop(const char *name, const char *value, size_t value_len) +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len) { const struct prop_handler *handler; @@ -69,7 +71,7 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len) if (value_len == 0) return 0; - return handler->validate(value, value_len); + return handler->validate(inode, value, value_len); } int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, @@ -252,8 +254,12 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) return ret; } -static int prop_compression_validate(const char *value, size_t len) +static int prop_compression_validate(const struct btrfs_inode *inode, + const char *value, size_t len) { + if (!btrfs_inode_can_compress(inode)) + return -EINVAL; + if (!value) return 0; @@ -364,7 +370,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. */ - ret = h->validate(value, strlen(value)); + ret = h->validate(BTRFS_I(inode), value, strlen(value)); if (ret) continue; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 40b2c65b518c..2b2ac15ab788 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -13,7 +13,8 @@ void __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, int flags); -int btrfs_validate_prop(const char *name, const char *value, size_t value_len); +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len); int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 99abf41b89b9..9632d0ff2038 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -403,7 +403,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct btrfs_root *root = BTRFS_I(inode)->root; name = xattr_full_name(handler, name); - ret = btrfs_validate_prop(name, value, size); + ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size); if (ret) return ret; -- cgit v1.2.3 From d0e64a981fd841cb0f28fcd6afcac55e6f1e6994 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 21 Apr 2022 10:56:39 +0100 Subject: btrfs: always log symlinks in full mode On Linux, empty symlinks are invalid, and attempting to create one with the system call symlink(2) results in an -ENOENT error and this is explicitly documented in the man page. If we rename a symlink that was created in the current transaction and its parent directory was logged before, we actually end up logging the symlink without logging its content, which is stored in an inline extent. That means that after a power failure we can end up with an empty symlink, having no content and an i_size of 0 bytes. It can be easily reproduced like this: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ mkdir /mnt/testdir $ sync # Create a file inside the directory and fsync the directory. $ touch /mnt/testdir/foo $ xfs_io -c "fsync" /mnt/testdir # Create a symlink inside the directory and then rename the symlink. $ ln -s /mnt/testdir/foo /mnt/testdir/bar $ mv /mnt/testdir/bar /mnt/testdir/baz # Now fsync again the directory, this persist the log tree. $ xfs_io -c "fsync" /mnt/testdir $ mount /dev/sdc /mnt $ stat -c %s /mnt/testdir/baz 0 $ readlink /mnt/testdir/baz $ Fix this by always logging symlinks in full mode (LOG_INODE_ALL), so that their content is also logged. A test case for fstests will follow. CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 09e4f1a04e6f..11399c8eed87 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5804,6 +5804,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&inode->log_mutex); } + /* + * For symlinks, we must always log their content, which is stored in an + * inline extent, otherwise we could end up with an empty symlink after + * log replay, which is invalid on linux (symlink(2) returns -ENOENT if + * one attempts to create an empty symlink). + * We don't need to worry about flushing delalloc, because when we create + * the inline extent when the symlink is created (we never have delalloc + * for symlinks). + */ + if (S_ISLNK(inode->vfs_inode.i_mode)) + inode_only = LOG_INODE_ALL; + /* * Before logging the inode item, cache the value returned by * inode_logged(), because after that we have the need to figure out if @@ -6182,7 +6194,7 @@ again: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) + if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); -- cgit v1.2.3 From 193b4e83986d7ee6caa8ceefb5ee9f58240fbee0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 21 Apr 2022 11:03:09 +0100 Subject: btrfs: do not BUG_ON() on failure to update inode when setting xattr We are doing a BUG_ON() if we fail to update an inode after setting (or clearing) a xattr, but there's really no reason to not instead simply abort the transaction and return the error to the caller. This should be a rare error because we have previously reserved enough metadata space to update the inode and the delayed inode should have already been setup, so an -ENOSPC or -ENOMEM, which are the possible errors, are very unlikely to happen. So replace the BUG_ON()s with a transaction abort. CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/xattr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 9632d0ff2038..367bb87b66df 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -262,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); out: if (start_trans) btrfs_end_transaction(trans); @@ -416,7 +417,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); } btrfs_end_transaction(trans); -- cgit v1.2.3 From 4b73c55fdebd8939f0f6000921075f7f6fa41397 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 21 Apr 2022 11:01:22 +0100 Subject: btrfs: skip compression property for anything other than files and dirs The compression property only has effect on regular files and directories (so that it's propagated to files and subdirectories created inside a directory). For any other inode type (symlink, fifo, device, socket), it's pointless to set the compression property because it does nothing and ends up unnecessarily wasting leaf space due to the pointless xattr (75 or 76 bytes, depending on the compression value). Symlinks in particular are very common (for example, I have almost 10k symlinks under /etc, /usr and /var alone) and therefore it's worth to avoid wasting leaf space with the compression xattr. For example, the compression property can end up on a symlink or character device implicitly, through inheritance from a parent directory $ mkdir /mnt/testdir $ btrfs property set /mnt/testdir compression lzo $ ln -s yadayada /mnt/testdir/lnk $ mknod /mnt/testdir/dev c 0 0 Or explicitly like this: $ ln -s yadayda /mnt/lnk $ setfattr -h -n btrfs.compression -v lzo /mnt/lnk So skip the compression property on inodes that are neither a regular file nor a directory. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/props.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/props.h | 1 + fs/btrfs/xattr.c | 3 +++ 3 files changed, 47 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 5a6f87744c28..1b31481f9e72 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -21,6 +21,7 @@ struct prop_handler { size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); const char *(*extract)(struct inode *inode); + bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -74,6 +75,28 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, return handler->validate(inode, value, value_len); } +/* + * Check if a property should be ignored (not set) for an inode. + * + * @inode: The target inode. + * @name: The property's name. + * + * The caller must be sure the given property name is valid, for example by + * having previously called btrfs_validate_prop(). + * + * Returns: true if the property should be ignored for the given inode + * false if the property must not be ignored for the given inode + */ +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) +{ + const struct prop_handler *handler; + + handler = find_prop_handler(name, NULL); + ASSERT(handler != NULL); + + return handler->ignore(inode); +} + int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, int flags) @@ -316,6 +339,22 @@ static int prop_compression_apply(struct inode *inode, const char *value, return 0; } +static bool prop_compression_ignore(const struct btrfs_inode *inode) +{ + /* + * Compression only has effect for regular files, and for directories + * we set it just to propagate it to new files created inside them. + * Everything else (symlinks, devices, sockets, fifos) is pointless as + * it will do nothing, so don't waste metadata space on a compression + * xattr for anything that is neither a file nor a directory. + */ + if (!S_ISREG(inode->vfs_inode.i_mode) && + !S_ISDIR(inode->vfs_inode.i_mode)) + return true; + + return false; +} + static const char *prop_compression_extract(struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { @@ -336,6 +375,7 @@ static struct prop_handler prop_handlers[] = { .validate = prop_compression_validate, .apply = prop_compression_apply, .extract = prop_compression_extract, + .ignore = prop_compression_ignore, .inheritable = 1 }, }; @@ -362,6 +402,9 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; + if (h->ignore(BTRFS_I(inode))) + continue; + value = h->extract(parent); if (!value) continue; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 2b2ac15ab788..59bea741cfcf 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -15,6 +15,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, int flags); int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, const char *value, size_t value_len); +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 367bb87b66df..85691dc2232f 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -408,6 +408,9 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, if (ret) return ret; + if (btrfs_ignore_prop(BTRFS_I(inode), name)) + return 0; + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); -- cgit v1.2.3 From a196c78b5443fc61af2c0490213b9d125482cbd1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 May 2022 21:19:50 -0600 Subject: io_uring: assign non-fixed early for async work We defer file assignment to ensure that fixed files work with links between a direct accept/open and the links that follow it. But this has the side effect that normal file assignment is then not complete by the time that request submission has been done. For deferred execution, if the file is a regular file, assign it when we do the async prep anyway. Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index e01f595f5b7d..91de361ea9ab 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6947,7 +6947,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_req_prep_async(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].needs_async_setup) + const struct io_op_def *def = &io_op_defs[req->opcode]; + + /* assign early for deferred execution for non-fixed file */ + if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) + req->file = io_file_get_normal(req, req->fd); + if (!def->needs_async_setup) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; -- cgit v1.2.3 From 9f73f1aef98b2fa7252c0a89be64840271ce8ea0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 1 Apr 2022 15:29:37 +0800 Subject: btrfs: force v2 space cache usage for subpage mount [BUG] For a 4K sector sized btrfs with v1 cache enabled and only mounted on systems with 4K page size, if it's mounted on subpage (64K page size) systems, it can cause the following warning on v1 space cache: BTRFS error (device dm-1): csum mismatch on free space cache BTRFS warning (device dm-1): failed to load free space cache for block group 84082688, rebuilding it now Although not a big deal, as kernel can rebuild it without problem, such warning will bother end users, especially if they want to switch the same btrfs seamlessly between different page sized systems. [CAUSE] V1 free space cache is still using fixed PAGE_SIZE for various bitmap, like BITS_PER_BITMAP. Such hard-coded PAGE_SIZE usage will cause various mismatch, from v1 cache size to checksum. Thus kernel will always reject v1 cache with a different PAGE_SIZE with csum mismatch. [FIX] Although we should fix v1 cache, it's already going to be marked deprecated soon. And we have v2 cache based on metadata (which is already fully subpage compatible), and it has almost everything superior than v1 cache. So just force subpage mount to use v2 cache on mount. Reported-by: Matt Corallo CC: stable@vger.kernel.org # 5.15+ Link: https://lore.kernel.org/linux-btrfs/61aa27d1-30fc-c1a9-f0f4-9df544395ec3@bluematt.me/ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 20e70eb88465..3e0acc362233 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3657,6 +3657,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; + /* + * V1 space cache has some hardcoded PAGE_SIZE usage, and is + * going to be deprecated. + * + * Force to use v2 cache for subpage case. + */ + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(fs_info, FREE_SPACE_TREE, + "forcing free space tree for sector size %u with page size %lu", + sectorsize, PAGE_SIZE); + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); -- cgit v1.2.3 From 549577127afeb759c29cdeeff1bdccd86e6c0dbf Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 3 May 2022 14:10:04 -0700 Subject: btrfs: zoned: move non-changing condition check out of the loop btrfs_zone_activate() checks if block_group->alloc_offset == block_group->zone_capacity every time it iterates the loop. But, it is not depending on the index. Move out the check and do it only once. Fixes: f9a912a3c45f ("btrfs: zoned: make zone activation multi stripe capable") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1b1b310c3c51..b5eb794c1e23 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1835,6 +1835,12 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; physical = map->stripes[i].physical; @@ -1842,12 +1848,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) if (device->zone_info->max_active_zones == 0) continue; - /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { - ret = false; - goto out_unlock; - } - if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; -- cgit v1.2.3 From ceb4f60830a7cd38ff47b7dd54e9e06ddbaf413c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 3 May 2022 14:10:05 -0700 Subject: btrfs: zoned: activate block group properly on unlimited active zone device btrfs_zone_activate() checks if it activated all the underlying zones in the loop. However, that check never hit on an unlimited activate zone device (max_active_zones == 0). Fortunately, it still works without ENOSPC because btrfs_zone_activate() returns true in the end, even if block_group->zone_is_active == 0. But, it is confusing to have non zone_is_active block group still usable for allocation. Also, we are wasting CPU time to iterate the loop every time btrfs_zone_activate() is called for the blog groups. Since error case in the loop is handled by out_unlock, we can just set zone_is_active and do the list stuff after the loop. Fixes: f9a912a3c45f ("btrfs: zoned: make zone activation multi stripe capable") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b5eb794c1e23..d31b0eda210f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1853,24 +1853,18 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) ret = false; goto out_unlock; } - - /* Successfully activated all the zones */ - if (i == map->num_stripes - 1) - block_group->zone_is_active = 1; - - } + + /* Successfully activated all the zones */ + block_group->zone_is_active = 1; spin_unlock(&block_group->lock); - if (block_group->zone_is_active) { - /* For the active block group list */ - btrfs_get_block_group(block_group); + /* For the active block group list */ + btrfs_get_block_group(block_group); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&block_group->active_bg_list, - &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); return true; -- cgit v1.2.3 From 750ee454908e90a8792b1e2b157c2948da86e926 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 3 May 2022 11:57:02 +0100 Subject: btrfs: fix assertion failure when logging directory key range item When inserting a key range item (BTRFS_DIR_LOG_INDEX_KEY) while logging a directory, we don't expect the insertion to fail with -EEXIST, because we are holding the directory's log_mutex and we have dropped all existing BTRFS_DIR_LOG_INDEX_KEY keys from the log tree before we started to log the directory. However it's possible that during the logging we attempt to insert the same BTRFS_DIR_LOG_INDEX_KEY key twice, but for this to happen we need to race with insertions of items from other inodes in the subvolume's tree while we are logging a directory. Here's how this can happen: 1) We are logging a directory with inode number 1000 that has its items spread across 3 leaves in the subvolume's tree: leaf A - has index keys from the range 2 to 20 for example. The last item in the leaf corresponds to a dir item for index number 20. All these dir items were created in a past transaction. leaf B - has index keys from the range 22 to 100 for example. It has no keys from other inodes, all its keys are dir index keys for our directory inode number 1000. Its first key is for the dir item with a sequence number of 22. All these dir items were also created in a past transaction. leaf C - has index keys for our directory for the range 101 to 120 for example. This leaf also has items from other inodes, and its first item corresponds to the dir item for index number 101 for our directory with inode number 1000; 2) When we finish processing the items from leaf A at log_dir_items(), we log a BTRFS_DIR_LOG_INDEX_KEY key with an offset of 21 and a last offset of 21, meaning the log is authoritative for the index range from 21 to 21 (a single sequence number). At this point leaf B was not yet modified in the current transaction; 3) When we return from log_dir_items() we have released our read lock on leaf B, and have set *last_offset_ret to 21 (index number of the first item on leaf B minus 1); 4) Some other task inserts an item for other inode (inode number 1001 for example) into leaf C. That resulted in pushing some items from leaf C into leaf B, in order to make room for the new item, so now leaf B has dir index keys for the sequence number range from 22 to 102 and leaf C has the dir items for the sequence number range 103 to 120; 5) At log_directory_changes() we call log_dir_items() again, passing it a 'min_offset' / 'min_key' value of 22 (*last_offset_ret from step 3 plus 1, so 21 + 1). Then btrfs_search_forward() leaves us at slot 0 of leaf B, since leaf B was modified in the current transaction. We have also initialized 'last_old_dentry_offset' to 20 after calling btrfs_previous_item() at log_dir_items(), as it left us at the last item of leaf A, which refers to the dir item with sequence number 20; 6) We then call process_dir_items_leaf() to process the dir items of leaf B, and when we process the first item, corresponding to slot 0, sequence number 22, we notice the dir item was created in a past transaction and its sequence number is greater than the value of *last_old_dentry_offset + 1 (20 + 1), so we decide to log again a BTRFS_DIR_LOG_INDEX_KEY key with an offset of 21 and an end range of 21 (key.offset - 1 == 22 - 1 == 21), which results in an -EEXIST error from insert_dir_log_key(), as we have already inserted that key at step 2, triggering the assertion at process_dir_items_leaf(). The trace produced in dmesg is like the following: assertion failed: ret != -EEXIST, in fs/btrfs/tree-log.c:3857 [198255.980839][ T7460] ------------[ cut here ]------------ [198255.981666][ T7460] kernel BUG at fs/btrfs/ctree.h:3617! [198255.983141][ T7460] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI [198255.984080][ T7460] CPU: 0 PID: 7460 Comm: repro-ghost-dir Not tainted 5.18.0-5314c78ac373-misc-next+ [198255.986027][ T7460] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 [198255.988600][ T7460] RIP: 0010:assertfail.constprop.0+0x1c/0x1e [198255.989465][ T7460] Code: 8b 4c 89 (...) [198255.992599][ T7460] RSP: 0018:ffffc90007387188 EFLAGS: 00010282 [198255.993414][ T7460] RAX: 000000000000003d RBX: 0000000000000065 RCX: 0000000000000000 [198255.996056][ T7460] RDX: 0000000000000001 RSI: ffffffff8b62b180 RDI: fffff52000e70e24 [198255.997668][ T7460] RBP: ffffc90007387188 R08: 000000000000003d R09: ffff8881f0e16507 [198255.999199][ T7460] R10: ffffed103e1c2ca0 R11: 0000000000000001 R12: 00000000ffffffef [198256.000683][ T7460] R13: ffff88813befc630 R14: ffff888116c16e70 R15: ffffc90007387358 [198256.007082][ T7460] FS: 00007fc7f7c24640(0000) GS:ffff8881f0c00000(0000) knlGS:0000000000000000 [198256.009939][ T7460] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [198256.014133][ T7460] CR2: 0000560bb16d0b78 CR3: 0000000140b34005 CR4: 0000000000170ef0 [198256.015239][ T7460] Call Trace: [198256.015674][ T7460] [198256.016313][ T7460] log_dir_items.cold+0x16/0x2c [198256.018858][ T7460] ? replay_one_extent+0xbf0/0xbf0 [198256.025932][ T7460] ? release_extent_buffer+0x1d2/0x270 [198256.029658][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.031114][ T7460] ? lock_acquired+0xbe/0x660 [198256.032633][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.034386][ T7460] ? lock_release+0xcf/0x8a0 [198256.036152][ T7460] log_directory_changes+0xf9/0x170 [198256.036993][ T7460] ? log_dir_items+0xba0/0xba0 [198256.037661][ T7460] ? do_raw_write_unlock+0x7d/0xe0 [198256.038680][ T7460] btrfs_log_inode+0x233b/0x26d0 [198256.041294][ T7460] ? log_directory_changes+0x170/0x170 [198256.042864][ T7460] ? btrfs_attach_transaction_barrier+0x60/0x60 [198256.045130][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.046568][ T7460] ? lock_release+0xcf/0x8a0 [198256.047504][ T7460] ? lock_downgrade+0x420/0x420 [198256.048712][ T7460] ? ilookup5_nowait+0x81/0xa0 [198256.049747][ T7460] ? lock_downgrade+0x420/0x420 [198256.050652][ T7460] ? do_raw_spin_unlock+0xa9/0x100 [198256.051618][ T7460] ? __might_resched+0x128/0x1c0 [198256.052511][ T7460] ? __might_sleep+0x66/0xc0 [198256.053442][ T7460] ? __kasan_check_read+0x11/0x20 [198256.054251][ T7460] ? iget5_locked+0xbd/0x150 [198256.054986][ T7460] ? run_delayed_iput_locked+0x110/0x110 [198256.055929][ T7460] ? btrfs_iget+0xc7/0x150 [198256.056630][ T7460] ? btrfs_orphan_cleanup+0x4a0/0x4a0 [198256.057502][ T7460] ? free_extent_buffer+0x13/0x20 [198256.058322][ T7460] btrfs_log_inode+0x2654/0x26d0 [198256.059137][ T7460] ? log_directory_changes+0x170/0x170 [198256.060020][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.060930][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.061905][ T7460] ? lock_contended+0x770/0x770 [198256.062682][ T7460] ? btrfs_log_inode_parent+0xd04/0x1750 [198256.063582][ T7460] ? lock_downgrade+0x420/0x420 [198256.064432][ T7460] ? preempt_count_sub+0x18/0xc0 [198256.065550][ T7460] ? __mutex_lock+0x580/0xdc0 [198256.066654][ T7460] ? stack_trace_save+0x94/0xc0 [198256.068008][ T7460] ? __kasan_check_write+0x14/0x20 [198256.072149][ T7460] ? __mutex_unlock_slowpath+0x12a/0x430 [198256.073145][ T7460] ? mutex_lock_io_nested+0xcd0/0xcd0 [198256.074341][ T7460] ? wait_for_completion_io_timeout+0x20/0x20 [198256.075345][ T7460] ? lock_downgrade+0x420/0x420 [198256.076142][ T7460] ? lock_contended+0x770/0x770 [198256.076939][ T7460] ? do_raw_spin_lock+0x1c0/0x1c0 [198256.078401][ T7460] ? btrfs_sync_file+0x5e6/0xa40 [198256.080598][ T7460] btrfs_log_inode_parent+0x523/0x1750 [198256.081991][ T7460] ? wait_current_trans+0xc8/0x240 [198256.083320][ T7460] ? lock_downgrade+0x420/0x420 [198256.085450][ T7460] ? btrfs_end_log_trans+0x70/0x70 [198256.086362][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.087544][ T7460] ? lock_release+0xcf/0x8a0 [198256.088305][ T7460] ? lock_downgrade+0x420/0x420 [198256.090375][ T7460] ? dget_parent+0x8e/0x300 [198256.093538][ T7460] ? do_raw_spin_lock+0x1c0/0x1c0 [198256.094918][ T7460] ? lock_downgrade+0x420/0x420 [198256.097815][ T7460] ? do_raw_spin_unlock+0xa9/0x100 [198256.101822][ T7460] ? dget_parent+0xb7/0x300 [198256.103345][ T7460] btrfs_log_dentry_safe+0x48/0x60 [198256.105052][ T7460] btrfs_sync_file+0x629/0xa40 [198256.106829][ T7460] ? start_ordered_ops.constprop.0+0x120/0x120 [198256.109655][ T7460] ? __fget_files+0x161/0x230 [198256.110760][ T7460] vfs_fsync_range+0x6d/0x110 [198256.111923][ T7460] ? start_ordered_ops.constprop.0+0x120/0x120 [198256.113556][ T7460] __x64_sys_fsync+0x45/0x70 [198256.114323][ T7460] do_syscall_64+0x5c/0xc0 [198256.115084][ T7460] ? syscall_exit_to_user_mode+0x3b/0x50 [198256.116030][ T7460] ? do_syscall_64+0x69/0xc0 [198256.116768][ T7460] ? do_syscall_64+0x69/0xc0 [198256.117555][ T7460] ? do_syscall_64+0x69/0xc0 [198256.118324][ T7460] ? sysvec_call_function_single+0x57/0xc0 [198256.119308][ T7460] ? asm_sysvec_call_function_single+0xa/0x20 [198256.120363][ T7460] entry_SYSCALL_64_after_hwframe+0x44/0xae [198256.121334][ T7460] RIP: 0033:0x7fc7fe97b6ab [198256.122067][ T7460] Code: 0f 05 48 (...) [198256.125198][ T7460] RSP: 002b:00007fc7f7c23950 EFLAGS: 00000293 ORIG_RAX: 000000000000004a [198256.126568][ T7460] RAX: ffffffffffffffda RBX: 00007fc7f7c239f0 RCX: 00007fc7fe97b6ab [198256.127942][ T7460] RDX: 0000000000000002 RSI: 000056167536bcf0 RDI: 0000000000000004 [198256.129302][ T7460] RBP: 0000000000000004 R08: 0000000000000000 R09: 000000007ffffeb8 [198256.130670][ T7460] R10: 00000000000001ff R11: 0000000000000293 R12: 0000000000000001 [198256.132046][ T7460] R13: 0000561674ca8140 R14: 00007fc7f7c239d0 R15: 000056167536dab8 [198256.133403][ T7460] Fix this by treating -EEXIST as expected at insert_dir_log_key() and have it update the item with an end offset corresponding to the maximum between the previously logged end offset and the new requested end offset. The end offsets may be different due to dir index key deletions that happened as part of unlink operations while we are logging a directory (triggered when fsyncing some other inode parented by the directory) or during renames which always attempt to log a single dir index deletion. Reported-by: Zygo Blaxell Link: https://lore.kernel.org/linux-btrfs/YmyefE9mc2xl5ZMz@hungrycats.org/ Fixes: 732d591a5d6c12 ("btrfs: stop copying old dir items when logging a directory") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 11399c8eed87..e65633686378 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3721,11 +3721,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - if (ret) + /* + * -EEXIST is fine and can happen sporadically when we are logging a + * directory and have concurrent insertions in the subvolume's tree for + * items from other inodes and that result in pushing off some dir items + * from one leaf to another in order to accommodate for the new items. + * This results in logging the same dir index range key. + */ + if (ret && ret != -EEXIST) return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); + if (ret == -EEXIST) { + const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); + + /* + * btrfs_del_dir_entries_in_log() might have been called during + * an unlink between the initial insertion of this key and the + * current update, or we might be logging a single entry deletion + * during a rename, so set the new last_offset to the max value. + */ + last_offset = max(last_offset, curr_end); + } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); @@ -3849,13 +3867,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, ret = insert_dir_log_key(trans, log, dst_path, ino, *last_old_dentry_offset + 1, key.offset - 1); - /* - * -EEXIST should never happen because when we - * log a directory in full mode (LOG_INODE_ALL) - * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from - * the log tree. - */ - ASSERT(ret != -EEXIST); if (ret < 0) return ret; } @@ -7031,12 +7042,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, /* * Other concurrent task might be logging the old directory, * as it can be triggered when logging other inode that had or - * still has a dentry in the old directory. So take the old - * directory's log_mutex to prevent getting an -EEXIST when - * logging a key to record the deletion, or having that other - * task logging the old directory get an -EEXIST if it attempts - * to log the same key after we just did it. In both cases that - * would result in falling back to a transaction commit. + * still has a dentry in the old directory. We lock the old + * directory's log_mutex to ensure the deletion of the old + * name is persisted, because during directory logging we + * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of + * the old name's dir index item is in the delayed items, so + * it could be missed by an in progress directory logging. */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), -- cgit v1.2.3 From 3e1ad196385c65c1454aceab1226d9a4baca27d5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 3 May 2022 17:35:25 +0200 Subject: btrfs: sysfs: export the balance paused state of exclusive operation The new state allowing device addition with paused balance is not exported to user space so it can't recognize it and actually start the operation. Fixes: efc0e69c2fea ("btrfs: introduce exclusive operation BALANCE_PAUSED state") CC: stable@vger.kernel.org # 5.17 Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 17389a42a3ab..ba78ca5aabbb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -922,6 +922,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; + case BTRFS_EXCLOP_BALANCE_PAUSED: + str = "balance paused\n"; + break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; -- cgit v1.2.3 From ceaf69f8eadcafb323392be88e7a5248c415d423 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sat, 7 May 2022 11:00:28 +0300 Subject: fanotify: do not allow setting dirent events in mask of non-dir Dirent events (create/delete/move) are only reported on watched directory inodes, but in fanotify as well as in legacy inotify, it was always allowed to set them on non-dir inode, which does not result in any meaningful outcome. Until kernel v5.17, dirent events in fanotify also differed from events "on child" (e.g. FAN_OPEN) in the information provided in the event. For example, FAN_OPEN could be set in the mask of a non-dir or the mask of its parent and event would report the fid of the child regardless of the marked object. By contrast, FAN_DELETE is not reported if the child is marked and the child fid was not reported in the events. Since kernel v5.17, with fanotify group flag FAN_REPORT_TARGET_FID, the fid of the child is reported with dirent events, like events "on child", which may create confusion for users expecting the same behavior as events "on child" when setting events in the mask on a child. The desired semantics of setting dirent events in the mask of a child are not clear, so for now, deny this action for a group initialized with flag FAN_REPORT_TARGET_FID and for the new event FAN_RENAME. We may relax this restriction in the future if we decide on the semantics and implement them. Fixes: d61fd650e9d2 ("fanotify: introduce group flag FAN_REPORT_TARGET_FID") Fixes: 8cc3b1ccd930 ("fanotify: wire up FAN_RENAME event") Link: https://lore.kernel.org/linux-fsdevel/20220505133057.zm5t6vumc4xdcnsg@quack3.lan/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220507080028.219826-1-amir73il@gmail.com --- fs/notify/fanotify/fanotify_user.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9b32b76a9c30..a792e21c5309 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1657,6 +1657,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; + /* + * FAN_RENAME is not allowed on non-dir (for now). + * We shouldn't have allowed setting any dirent events in mask of + * non-dir, but because we always allowed it, error only if group + * was initialized with the new flag FAN_REPORT_TARGET_FID. + */ + ret = -ENOTDIR; + if (inode && !S_ISDIR(inode->i_mode) && + ((mask & FAN_RENAME) || + ((mask & FANOTIFY_DIRENT_EVENTS) && + FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) + goto path_put_and_out; + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; -- cgit v1.2.3 From 085d16d5f949b64713d5e960d6c9bbf51bc1d511 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Sun, 8 May 2022 15:54:50 +0300 Subject: nfs: fix broken handling of the softreval mount option Turns out that ever since this mount option was added, passing `softreval` in NFS mount options cancelled all other flags while not affecting the underlying flag `NFS_MOUNT_SOFTREVAL`. Fixes: c74dfe97c104 ("NFS: Add mount option 'softreval'") Signed-off-by: Dan Aloni Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e2d59bb5e6bb..9a16897e8dc6 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -517,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (result.negated) ctx->flags &= ~NFS_MOUNT_SOFTREVAL; else - ctx->flags &= NFS_MOUNT_SOFTREVAL; + ctx->flags |= NFS_MOUNT_SOFTREVAL; break; case Opt_posix: if (result.negated) -- cgit v1.2.3 From 1927e498aee1757b3df755a194cbfc5cc0f2b663 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 9 May 2022 17:34:28 -0700 Subject: procfs: prevent unprivileged processes accessing fdinfo dir The file permissions on the fdinfo dir from were changed from S_IRUSR|S_IXUSR to S_IRUGO|S_IXUGO, and a PTRACE_MODE_READ check was added for opening the fdinfo files [1]. However, the ptrace permission check was not added to the directory, allowing anyone to get the open FD numbers by reading the fdinfo directory. Add the missing ptrace permission check for opening the fdinfo directory. [1] https://lkml.kernel.org/r/20210308170651.919148-1-kaleshsingh@google.com Link: https://lkml.kernel.org/r/20210713162008.1056986-1-kaleshsingh@google.com Fixes: 7bc3fa0172a4 ("procfs: allow reading fdinfo with PTRACE_MODE_READ") Signed-off-by: Kalesh Singh Cc: Kees Cook Cc: Eric W. Biederman Cc: Christian Brauner Cc: Suren Baghdasaryan Cc: Hridya Valsaraju Cc: Jann Horn Signed-off-by: Andrew Morton --- fs/proc/fd.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b31..913bef0d2a36 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ out: return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, -- cgit v1.2.3 From 620239d9a32e9fe27c9204ec11e40058671aeeb6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 25 Apr 2022 15:54:27 -0400 Subject: ceph: fix setting of xattrs on async created inodes Currently when we create a file, we spin up an xattr buffer to send along with the create request. If we end up doing an async create however, then we currently pass down a zero-length xattr buffer. Fix the code to send down the xattr buffer in req->r_pagelist. If the xattrs span more than a page, however give up and don't try to do an async create. Cc: stable@vger.kernel.org URL: https://bugzilla.redhat.com/show_bug.cgi?id=2063929 Fixes: 9a8d03ca2e2c ("ceph: attempt to do async create when possible") Reported-by: John Fortin Reported-by: Sri Ramanujam Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6c9e837aa1d3..8c8226c0feac 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -629,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); - iinfo.xattr_len = ARRAY_SIZE(xattr_buf); - iinfo.xattr_data = xattr_buf; - memset(iinfo.xattr_data, 0, iinfo.xattr_len); + if (req->r_pagelist) { + iinfo.xattr_len = req->r_pagelist->length; + iinfo.xattr_data = req->r_pagelist->mapped_tail; + } else { + /* fake it */ + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + } in.ino = cpu_to_le64(vino.ino); in.snapid = cpu_to_le64(CEPH_NOSNAP); @@ -743,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out_ctx; + /* Async create can't handle more than a page of xattrs */ + if (as_ctx.pagelist && + !list_is_singular(&as_ctx.pagelist->head)) + try_async = false; } else if (!d_in_lookup(dentry)) { /* If it's not being looked up, it's negative */ return -ENOENT; -- cgit v1.2.3 From 642d51fb0775a41dd6bb3d99b5a40c24df131c20 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 5 May 2022 18:53:09 +0800 Subject: ceph: check folio PG_private bit instead of folio->private The pages in the file mapping maybe reclaimed and reused by other subsystems and the page->private maybe used as flags field or something else, if later that pages are used by page caches again the page->private maybe not cleared as expected. Here will check the PG_private bit instead of the folio->private. Cc: stable@vger.kernel.org URL: https://tracker.ceph.com/issues/55421 Signed-off-by: Xiubo Li Reviewed-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index aa25bffd4823..b6edcf89a429 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) if (folio_test_dirty(folio)) { dout("%p dirty_folio %p idx %lu -- already dirty\n", mapping->host, folio, folio->index); - BUG_ON(!folio_get_private(folio)); + VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); return false; } @@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback. */ - BUG_ON(folio_get_private(folio)); + VM_BUG_ON_FOLIO(folio_test_private(folio), folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); @@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, } WARN_ON(!folio_test_locked(folio)); - if (folio_get_private(folio)) { + if (folio_test_private(folio)) { dout("%p invalidate_folio idx %lu full dirty page\n", inode, folio->index); @@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req) /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { - if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { + pr_warn("%s incorrect op %d req %p index %d tid %llu\n", + __func__, req->r_ops[i].op, req, i, req->r_tid); break; + } osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); -- cgit v1.2.3 From 846a3351ddfe4a86eede4bb26a205c3f38ef84d3 Mon Sep 17 00:00:00 2001 From: Jing Xia Date: Tue, 10 May 2022 10:35:14 +0800 Subject: writeback: Avoid skipping inode writeback We have run into an issue that a task gets stuck in balance_dirty_pages_ratelimited() when perform I/O stress testing. The reason we observed is that an I_DIRTY_PAGES inode with lots of dirty pages is in b_dirty_time list and standard background writeback cannot writeback the inode. After studing the relevant code, the following scenario may lead to the issue: task1 task2 ----- ----- fuse_flush write_inode_now //in b_dirty_time writeback_single_inode __writeback_single_inode fuse_write_end filemap_dirty_folio __xa_set_mark:PAGECACHE_TAG_DIRTY lock inode->i_lock if mapping tagged PAGECACHE_TAG_DIRTY inode->i_state |= I_DIRTY_PAGES unlock inode->i_lock __mark_inode_dirty:I_DIRTY_PAGES lock inode->i_lock -was dirty,inode stays in -b_dirty_time unlock inode->i_lock if(!(inode->i_state & I_DIRTY_All)) -not true,so nothing done This patch moves the dirty inode to b_dirty list when the inode currently is not queued in b_io or b_more_io list at the end of writeback_single_inode. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig CC: stable@vger.kernel.org Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option") Signed-off-by: Jing Xia Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220510023514.27399-1-jing.xia@unisoc.com --- fs/fs-writeback.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 591fe9cf1659..1fae0196292a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1712,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode, */ if (!(inode->i_state & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED) && + (inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: -- cgit v1.2.3 From c1ad35dd0548ce947d97aaf92f7f2f9a202951cf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 May 2022 12:36:04 +0200 Subject: udf: Avoid using stale lengthOfImpUse udf_write_fi() uses lengthOfImpUse of the entry it is writing to. However this field has not yet been initialized so it either contains completely bogus value or value from last directory entry at that place. In either case this is wrong and can lead to filesystem corruption or kernel crashes. Reported-by: butt3rflyh4ck CC: stable@vger.kernel.org Fixes: 979a6e28dd96 ("udf: Get rid of 0-length arrays in struct fileIdentDesc") Signed-off-by: Jan Kara --- fs/udf/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 0ed4861b038f..b3d5f97f16cd 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -75,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, if (fileident) { if (adinicb || (offset + lfi < 0)) { - memcpy(udf_get_fi_ident(sfi), fileident, lfi); + memcpy(sfi->impUse + liu, fileident, lfi); } else if (offset >= 0) { memcpy(fibh->ebh->b_data + offset, fileident, lfi); } else { - memcpy(udf_get_fi_ident(sfi), fileident, -offset); + memcpy(sfi->impUse + liu, fileident, -offset); memcpy(fibh->ebh->b_data, fileident - offset, lfi + offset); } @@ -88,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, offset += lfi; if (adinicb || (offset + padlen < 0)) { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen); + memset(sfi->impUse + liu + lfi, 0x00, padlen); } else if (offset >= 0) { memset(fibh->ebh->b_data + offset, 0x00, padlen); } else { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset); + memset(sfi->impUse + liu + lfi, 0x00, -offset); memset(fibh->ebh->b_data, 0x00, padlen + offset); } -- cgit v1.2.3 From d031a8866e709c9d1ee5537a321b6192b4d2dc5b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 14 Apr 2022 17:52:39 +0200 Subject: gfs2: Fix filesystem block deallocation for short writes When a write cannot be carried out in full, gfs2_iomap_end() releases blocks that have been allocated for this write but haven't been used. To compute the end of the allocation, gfs2_iomap_end() incorrectly rounded the end of the attempted write down to the next block boundary to arrive at the end of the allocation. It would have to round up, but the end of the allocation is also available as iomap->offset + iomap->length, so just use that instead. In addition, use round_up() for computing the start of the unused range. Fixes: 64bc06bb32ee ("gfs2: iomap buffered write support") Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 39080b2d6cf8..b6697333bb2b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1153,13 +1153,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (length != written && (iomap->flags & IOMAP_F_NEW)) { /* Deallocate blocks that were just allocated. */ - loff_t blockmask = i_blocksize(inode) - 1; - loff_t end = (pos + length) & ~blockmask; + loff_t hstart = round_up(pos + written, i_blocksize(inode)); + loff_t hend = iomap->offset + iomap->length; - pos = (pos + written + blockmask) & ~blockmask; - if (pos < end) { - truncate_pagecache_range(inode, pos, end - 1); - punch_hole(ip, pos, end - pos); + if (hstart < hend) { + truncate_pagecache_range(inode, hstart, hend - 1); + punch_hole(ip, hstart, hend - hstart); } } -- cgit v1.2.3 From 42e4c3bdcae7833eeeaed7bf0c000c2de17dd291 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 27 Apr 2022 13:53:42 +0200 Subject: gfs2: Variable rename Instead of counting the number of bytes read from the filesystem, functions gfs2_file_direct_read and gfs2_file_read_iter count the number of bytes written into the user buffer. Conversely, functions gfs2_file_direct_write and gfs2_file_buffered_write count the number of bytes read from the user buffer. This is nothing but confusing, so change the read functions to count how many bytes they have read, and the write functions to count how many bytes they have written. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 48f01323c37c..4d36c01727ad 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -807,7 +807,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, struct file *file = iocb->ki_filp; struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -839,11 +839,11 @@ retry_under_glock: pagefault_disable(); to->nofault = true; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, written); + IOMAP_DIO_PARTIAL, read); to->nofault = false; pagefault_enable(); if (ret > 0) - written = ret; + read = ret; if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { size_t leftover; @@ -863,7 +863,7 @@ out_uninit: gfs2_holder_uninit(gh); if (ret < 0) return ret; - return written; + return read; } static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, @@ -873,7 +873,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct inode *inode = file->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); size_t prev_count = 0, window_size = 0; - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -906,13 +906,13 @@ retry_under_glock: from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, read); + IOMAP_DIO_PARTIAL, written); from->nofault = false; if (ret == -ENOTBLK) ret = 0; if (ret > 0) - read = ret; + written = ret; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { size_t leftover; @@ -933,7 +933,7 @@ out_uninit: gfs2_holder_uninit(gh); if (ret < 0) return ret; - return read; + return written; } static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -941,7 +941,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct gfs2_inode *ip; struct gfs2_holder gh; size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -962,7 +962,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (ret >= 0) { if (!iov_iter_count(to)) return ret; - written = ret; + read = ret; } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; @@ -980,7 +980,7 @@ retry_under_glock: ret = generic_file_read_iter(iocb, to); pagefault_enable(); if (ret > 0) - written += ret; + read += ret; if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { size_t leftover; @@ -998,7 +998,7 @@ retry_under_glock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - return written ? written : ret; + return read ? read : ret; } static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, @@ -1012,7 +1012,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct gfs2_holder *statfs_gh = NULL; size_t prev_count = 0, window_size = 0; size_t orig_count = iov_iter_count(from); - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -1050,13 +1050,13 @@ retry_under_glock: current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; - read += ret; + written += ret; } if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); - from->count = orig_count - read; + from->count = orig_count - written; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { size_t leftover; @@ -1077,8 +1077,8 @@ out_uninit: gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); - from->count = orig_count - read; - return read ? read : ret; + from->count = orig_count - written; + return written ? written : ret; } /** -- cgit v1.2.3 From 6d22ff471070e21e24667be70764ee5abdfe5608 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 5 May 2022 12:37:49 +0200 Subject: gfs2: Clean up use of fault_in_iov_iter_{read,write}able No need to store the return value of the fault_in functions in separate variables. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4d36c01727ad..acc0c1d41564 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -846,12 +846,10 @@ retry_under_glock: read = ret; if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_writeable(to, window_size); + window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { + if (window_size) { if (gfs2_holder_queued(gh)) goto retry_under_glock; goto retry; @@ -915,12 +913,10 @@ retry_under_glock: written = ret; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); + window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { + if (window_size) { if (gfs2_holder_queued(gh)) goto retry_under_glock; goto retry; @@ -983,12 +979,10 @@ retry_under_glock: read += ret; if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; - gfs2_holder_allow_demote(&gh); - leftover = fault_in_iov_iter_writeable(to, window_size); + window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); - if (leftover != window_size) { + if (window_size) { if (gfs2_holder_queued(&gh)) goto retry_under_glock; goto retry; @@ -1058,13 +1052,11 @@ retry_under_glock: from->count = orig_count - written; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); + window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - from->count = min(from->count, window_size - leftover); + if (window_size) { + from->count = min(from->count, window_size); if (gfs2_holder_queued(gh)) goto retry_under_glock; goto retry; -- cgit v1.2.3 From 72382264502d9348ead372f82ecc3044de5c82d2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 5 May 2022 12:53:26 +0200 Subject: gfs2: Pull return value test out of should_fault_in_pages Pull the return value test of the previous read or write operation out of should_fault_in_pages(). In a following patch, we'll fault in pages before the I/O and there will be no return value to check. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index acc0c1d41564..ea87bef7314d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -770,7 +770,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? ret : ret1; } -static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, +static inline bool should_fault_in_pages(struct iov_iter *i, size_t *prev_count, size_t *window_size) { @@ -779,8 +779,6 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, if (likely(!count)) return false; - if (ret <= 0 && ret != -EFAULT) - return false; if (!iter_is_iovec(i)) return false; @@ -842,10 +840,12 @@ retry_under_glock: IOMAP_DIO_PARTIAL, read); to->nofault = false; pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) read = ret; - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { + if (should_fault_in_pages(to, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); @@ -855,6 +855,7 @@ retry_under_glock: goto retry; } } +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: @@ -899,20 +900,23 @@ retry: goto out_uninit; /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) - goto out; + goto out_unlock; retry_under_glock: from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, IOMAP_DIO_PARTIAL, written); from->nofault = false; - - if (ret == -ENOTBLK) - ret = 0; + if (ret <= 0) { + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EFAULT) + goto out_unlock; + } if (ret > 0) written = ret; - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { + if (should_fault_in_pages(from, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); @@ -922,7 +926,7 @@ retry_under_glock: goto retry; } } -out: +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: @@ -975,10 +979,12 @@ retry_under_glock: pagefault_disable(); ret = generic_file_read_iter(iocb, to); pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) read += ret; - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { + if (should_fault_in_pages(to, &prev_count, &window_size)) { gfs2_holder_allow_demote(&gh); window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); @@ -988,6 +994,7 @@ retry_under_glock: goto retry; } } +out_unlock: if (gfs2_holder_queued(&gh)) gfs2_glock_dq(&gh); out_uninit: @@ -1050,8 +1057,11 @@ retry_under_glock: if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + from->count = orig_count - written; - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { + if (should_fault_in_pages(from, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); -- cgit v1.2.3 From 324d116c5a5c8204dc00e63f725a3c5ed09afb53 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 5 May 2022 13:32:23 +0200 Subject: gfs2: Align read and write chunks to the page cache Align the chunks that reads and writes are carried out in to the page cache rather than the user buffers. This will be more efficient in general, especially for allocating writes. Optimizing the case that the user buffer is gfs2 backed isn't very useful; we only need to make sure we won't deadlock. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index ea87bef7314d..11c46407d4a8 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -771,6 +771,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, } static inline bool should_fault_in_pages(struct iov_iter *i, + struct kiocb *iocb, size_t *prev_count, size_t *window_size) { @@ -783,15 +784,13 @@ static inline bool should_fault_in_pages(struct iov_iter *i, return false; size = PAGE_SIZE; - offs = offset_in_page(i->iov[0].iov_base + i->iov_offset); + offs = offset_in_page(iocb->ki_pos); if (*prev_count != count || !*window_size) { size_t nr_dirtied; - size = ALIGN(offs + count, PAGE_SIZE); - size = min_t(size_t, size, SZ_1M); nr_dirtied = max(current->nr_dirtied_pause - current->nr_dirtied, 8); - size = min(size, nr_dirtied << PAGE_SHIFT); + size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT); } *prev_count = count; @@ -845,7 +844,7 @@ retry_under_glock: if (ret > 0) read = ret; - if (should_fault_in_pages(to, &prev_count, &window_size)) { + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); @@ -916,7 +915,7 @@ retry_under_glock: if (ret > 0) written = ret; - if (should_fault_in_pages(from, &prev_count, &window_size)) { + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); @@ -984,7 +983,7 @@ retry_under_glock: if (ret > 0) read += ret; - if (should_fault_in_pages(to, &prev_count, &window_size)) { + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { gfs2_holder_allow_demote(&gh); window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); @@ -1061,7 +1060,7 @@ retry_under_glock: goto out_unlock; from->count = orig_count - written; - if (should_fault_in_pages(from, &prev_count, &window_size)) { + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); -- cgit v1.2.3 From fa5dfa645d85910d747f4e0c97f19e5e97d1c270 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 4 May 2022 23:37:30 +0200 Subject: gfs2: buffered write prefaulting In gfs2_file_buffered_write, to increase the likelihood that all the user memory we're trying to write will be resident in memory, carry out the write in chunks and fault in each chunk of user memory before trying to write it. Otherwise, some workloads will trigger frequent short "internal" writes, causing filesystem blocks to be allocated and then partially deallocated again when writing into holes, which is wasteful and breaks reservations. Neither the chunked writes nor any of the short "internal" writes are user visible. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 11c46407d4a8..5eda1bcc85e3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -778,7 +778,7 @@ static inline bool should_fault_in_pages(struct iov_iter *i, size_t count = iov_iter_count(i); size_t size, offs; - if (likely(!count)) + if (!count) return false; if (!iter_is_iovec(i)) return false; @@ -1033,7 +1033,20 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { retry_under_glock: + gfs2_holder_allow_demote(gh); + window_size -= fault_in_iov_iter_readable(from, window_size); + gfs2_holder_disallow_demote(gh); + if (!window_size) { + ret = -EFAULT; + goto out_unlock; + } + if (!gfs2_holder_queued(gh)) + goto retry; + from->count = min(from->count, window_size); + } + if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -1060,17 +1073,8 @@ retry_under_glock: goto out_unlock; from->count = orig_count - written; - if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { - gfs2_holder_allow_demote(gh); - window_size -= fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (window_size) { - from->count = min(from->count, window_size); - if (gfs2_holder_queued(gh)) - goto retry_under_glock; - goto retry; - } - } + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) + goto retry_under_glock; out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); -- cgit v1.2.3 From e1fa9ea85ce89207d2ac0316da35a4a7736801f9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 11 May 2022 18:27:12 +0200 Subject: gfs2: Stop using glock holder auto-demotion for now We're having unresolved issues with the glock holder auto-demotion mechanism introduced in commit dc732906c245. This mechanism was assumed to be essential for avoiding frequent short reads and writes until commit 296abc0d91d8 ("gfs2: No short reads or writes upon glock contention"). Since then, when the inode glock is lost, it is simply re-acquired and the operation is resumed. This means that apart from the performance penalty, we might as well drop the inode glock before faulting in pages, and re-acquire it afterwards. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 5eda1bcc85e3..2556ae1f92ea 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -832,7 +832,6 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); to->nofault = true; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, @@ -845,14 +844,10 @@ retry_under_glock: read = ret; if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { - gfs2_holder_allow_demote(gh); + gfs2_glock_dq(gh); window_size -= fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(gh); - if (window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (window_size) goto retry; - } } out_unlock: if (gfs2_holder_queued(gh)) @@ -900,7 +895,6 @@ retry: /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) goto out_unlock; -retry_under_glock: from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, @@ -916,14 +910,10 @@ retry_under_glock: written = ret; if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { - gfs2_holder_allow_demote(gh); + gfs2_glock_dq(gh); window_size -= fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (window_size) goto retry; - } } out_unlock: if (gfs2_holder_queued(gh)) @@ -974,7 +964,6 @@ retry: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); ret = generic_file_read_iter(iocb, to); pagefault_enable(); @@ -984,14 +973,10 @@ retry_under_glock: read += ret; if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { - gfs2_holder_allow_demote(&gh); + gfs2_glock_dq(&gh); window_size -= fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(&gh); - if (window_size) { - if (gfs2_holder_queued(&gh)) - goto retry_under_glock; + if (window_size) goto retry; - } } out_unlock: if (gfs2_holder_queued(&gh)) @@ -1030,22 +1015,17 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); retry: - ret = gfs2_glock_nq(gh); - if (ret) - goto out_uninit; if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { -retry_under_glock: - gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); if (!window_size) { ret = -EFAULT; - goto out_unlock; + goto out_uninit; } - if (!gfs2_holder_queued(gh)) - goto retry; from->count = min(from->count, window_size); } + ret = gfs2_glock_nq(gh); + if (ret) + goto out_uninit; if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -1073,8 +1053,10 @@ retry_under_glock: goto out_unlock; from->count = orig_count - written; - if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) - goto retry_under_glock; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + goto retry; + } out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); -- cgit v1.2.3