summary | refs | log | tree | commit | diff | stats
path: root/fs/btrfs/ctree.c
diff options
context:
space:
mode:
author: Filipe Manana <fdmanana@suse.com> 2021-06-29 14:43:06 +0100
committer: David Sterba <dsterba@suse.com> 2021-07-07 17:42:41 +0200
commit: 79bd37120b149532af5b21953643ed74af69654f (patch)
tree: e0c94cf20ebb3d4ca34db5c7bb0419c137f91708 /fs/btrfs/ctree.c
parent: 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 (diff)
download: linux-79bd37120b149532af5b21953643ed74af69654f.tar.gz
linux-79bd37120b149532af5b21953643ed74af69654f.tar.bz2
linux-79bd37120b149532af5b21953643ed74af69654f.zip
btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
Commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array due to concurrent allocations") fixed a problem that resulted in exhausting the system chunk array in the superblock when there are many tasks allocating chunks in parallel. Basically too many tasks enter the first phase of chunk allocation without previous tasks having finished their second phase of allocation, resulting in too many system chunks being allocated. That was originally observed when running the fallocate tests of stress-ng on a PowerPC machine, using a node size of 64K. However that commit also introduced a deadlock where a task in phase 1 of the chunk allocation waited for another task that had allocated a system chunk to finish its phase 2, but that other task was waiting on an extent buffer lock held by the first task, therefore resulting in both tasks not making any progress. That change was later reverted by a patch with the subject "btrfs: fix deadlock with concurrent chunk allocations involving system chunks", since there is no simple and short solution to address it and the deadlock is relatively easy to trigger on zoned filesystems, while the system chunk array exhaustion is not so common. This change reworks the chunk allocation to avoid the system chunk array exhaustion. It accomplishes that by making the first phase of chunk allocation do the updates of the device items in the chunk btree and the insertion of the new chunk item in the chunk btree. This is done while under the protection of the chunk mutex (fs_info->chunk_mutex), in the same critical section that checks for available system space, allocates a new system chunk if needed and reserves system chunk space. This way we do not have chunk space reserved until the second phase completes. The same logic is applied to chunk removal as well, since it keeps reserved system space long after it is done updating the chunk btree. 
For direct allocation of system chunks, the previous behaviour remains, because otherwise we would deadlock on extent buffers of the chunk btree.

Changes to the chunk btree are by and large done by chunk allocation and chunk removal, which first reserve chunk system space and then later do changes to the chunk btree. The other remaining cases are uncommon and correspond to adding a device, removing a device and resizing a device. All these other cases do not pre-reserve system space; they modify the chunk btree right away, so they don't hold reserved space for a long period like chunk allocation and chunk removal do.

The diff of this change is huge, but more than half of it is just addition of comments describing both how things work regarding chunk allocation and removal, including both the new behavior and the parts of the old behavior that did not change.

CC: stable@vger.kernel.org # 5.12+
Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Tested-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Tested-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/ctree.c')
-rw-r--r-- fs/btrfs/ctree.c 67
1 files changed, 13 insertions, 54 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc3ca2cbd7d..c5c08c87e130 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}
-static struct extent_buffer *alloc_tree_block_no_bg_flush(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 parent_start,
- const struct btrfs_disk_key *disk_key,
- int level,
- u64 hint,
- u64 empty_size,
- enum btrfs_lock_nesting nest)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *ret;
-
- /*
- * If we are COWing a node/leaf from the extent, chunk, device or free
- * space trees, make sure that we do not finish block group creation of
- * pending block groups. We do this to avoid a deadlock.
- * COWing can result in allocation of a new chunk, and flushing pending
- * block groups (btrfs_create_pending_block_groups()) can be triggered
- * when finishing allocation of a new chunk. Creation of a pending block
- * group modifies the extent, chunk, device and free space trees,
- * therefore we could deadlock with ourselves since we are holding a
- * lock on an extent buffer that btrfs_create_pending_block_groups() may
- * try to COW later.
- * For similar reasons, we also need to delay flushing pending block
- * groups when splitting a leaf or node, from one of those trees, since
- * we are holding a write lock on it and its parent or when inserting a
- * new root node for one of those trees.
- */
- if (root == fs_info->extent_root ||
- root == fs_info->chunk_root ||
- root == fs_info->dev_root ||
- root == fs_info->free_space_root)
- trans->can_flush_pending_bgs = false;
-
- ret = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, disk_key, level,
- hint, empty_size, nest);
- trans->can_flush_pending_bgs = true;
-
- return ret;
-}
-
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size, nest);
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -2458,9 +2416,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -2589,8 +2547,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0, BTRFS_NESTING_SPLIT);
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0,
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3381,10 +3340,10 @@ again:
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0, num_doubles ?
- BTRFS_NESTING_NEW_ROOT :
- BTRFS_NESTING_SPLIT);
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0,
+ num_doubles ? BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);