From 8a1f1e3d1eecf9d2359a2709e276743a67e145db Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 8 Sep 2022 12:31:50 +0100
Subject: btrfs: fix hang during unmount when stopping block group reclaim
 worker

During early unmount, at close_ctree(), we try to stop the block group
reclaim task with cancel_work_sync(), but that may hang if the block group
reclaim task is currently at btrfs_relocate_block_group() waiting for the
flag BTRFS_FS_UNFINISHED_DROPS to be cleared from fs_info->flags. During
unmount we only clear that flag later, after trying to stop the block
group reclaim task.

Fix that by clearing BTRFS_FS_UNFINISHED_DROPS before trying to stop the
block group reclaim task and after setting BTRFS_FS_CLOSING_START, so that
if the reclaim task is waiting on that bit, it will stop immediately after
being woken, because it sees the filesystem is closing (with a call to
btrfs_fs_closing()), and then returns immediately with -EINTR.

Fixes: 31e70e527806c5 ("btrfs: fix hang during unmount when block group reclaim task is running")
CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 129711773ae5..9f844328b1e2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4476,6 +4476,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 
 	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
 
+	/*
+	 * If we had UNFINISHED_DROPS we could still be processing them, so
+	 * clear that bit and wake up relocation so it can stop.
+	 * We must do this before stopping the block group reclaim task, because
+	 * at btrfs_relocate_block_group() we wait for this bit, and after the
+	 * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
+	 * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
+	 * return 1.
+	 */
+	btrfs_wake_unfinished_drop(fs_info);
+
 	/*
 	 * We may have the reclaim task running and relocating a data block group,
 	 * in which case it may create delayed iputs. So stop it before we park
@@ -4494,12 +4505,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	 */
 	kthread_park(fs_info->cleaner_kthread);
 
-	/*
-	 * If we had UNFINISHED_DROPS we could still be processing them, so
-	 * clear that bit and wake up relocation so it can stop.
-	 */
-	btrfs_wake_unfinished_drop(fs_info);
-
 	/* wait for the qgroup rescan worker to stop */
 	btrfs_qgroup_wait_for_completion(fs_info, false);
 
-- 
cgit v1.2.3


From a362bb864b8db4861977d00bd2c3222503ccc34b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 8 Sep 2022 12:31:51 +0100
Subject: btrfs: fix hang during unmount when stopping a space reclaim worker

Often when running generic/562 from fstests we can hang during unmount,
resulting in a trace like this:

  Sep 07 11:52:00 debian9 unknown: run fstests generic/562 at 2022-09-07 11:52:00
  Sep 07 11:55:32 debian9 kernel: INFO: task umount:49438 blocked for more than 120 seconds.
  Sep 07 11:55:32 debian9 kernel:       Not tainted 6.0.0-rc2-btrfs-next-122 #1
  Sep 07 11:55:32 debian9 kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  Sep 07 11:55:32 debian9 kernel: task:umount          state:D stack:    0 pid:49438 ppid: 25683 flags:0x00004000
  Sep 07 11:55:32 debian9 kernel: Call Trace:
  Sep 07 11:55:32 debian9 kernel:  <TASK>
  Sep 07 11:55:32 debian9 kernel:  __schedule+0x3c8/0xec0
  Sep 07 11:55:32 debian9 kernel:  ? rcu_read_lock_sched_held+0x12/0x70
  Sep 07 11:55:32 debian9 kernel:  schedule+0x5d/0xf0
  Sep 07 11:55:32 debian9 kernel:  schedule_timeout+0xf1/0x130
  Sep 07 11:55:32 debian9 kernel:  ? lock_release+0x224/0x4a0
  Sep 07 11:55:32 debian9 kernel:  ? lock_acquired+0x1a0/0x420
  Sep 07 11:55:32 debian9 kernel:  ? trace_hardirqs_on+0x2c/0xd0
  Sep 07 11:55:32 debian9 kernel:  __wait_for_common+0xac/0x200
  Sep 07 11:55:32 debian9 kernel:  ? usleep_range_state+0xb0/0xb0
  Sep 07 11:55:32 debian9 kernel:  __flush_work+0x26d/0x530
  Sep 07 11:55:32 debian9 kernel:  ? flush_workqueue_prep_pwqs+0x140/0x140
  Sep 07 11:55:32 debian9 kernel:  ? trace_clock_local+0xc/0x30
  Sep 07 11:55:32 debian9 kernel:  __cancel_work_timer+0x11f/0x1b0
  Sep 07 11:55:32 debian9 kernel:  ? close_ctree+0x12b/0x5b3 [btrfs]
  Sep 07 11:55:32 debian9 kernel:  ? __trace_bputs+0x10b/0x170
  Sep 07 11:55:32 debian9 kernel:  close_ctree+0x152/0x5b3 [btrfs]
  Sep 07 11:55:32 debian9 kernel:  ? evict_inodes+0x166/0x1c0
  Sep 07 11:55:32 debian9 kernel:  generic_shutdown_super+0x71/0x120
  Sep 07 11:55:32 debian9 kernel:  kill_anon_super+0x14/0x30
  Sep 07 11:55:32 debian9 kernel:  btrfs_kill_super+0x12/0x20 [btrfs]
  Sep 07 11:55:32 debian9 kernel:  deactivate_locked_super+0x2e/0xa0
  Sep 07 11:55:32 debian9 kernel:  cleanup_mnt+0x100/0x160
  Sep 07 11:55:32 debian9 kernel:  task_work_run+0x59/0xa0
  Sep 07 11:55:32 debian9 kernel:  exit_to_user_mode_prepare+0x1a6/0x1b0
  Sep 07 11:55:32 debian9 kernel:  syscall_exit_to_user_mode+0x16/0x40
  Sep 07 11:55:32 debian9 kernel:  do_syscall_64+0x48/0x90
  Sep 07 11:55:32 debian9 kernel:  entry_SYSCALL_64_after_hwframe+0x63/0xcd
  Sep 07 11:55:32 debian9 kernel: RIP: 0033:0x7fcde59a57a7
  Sep 07 11:55:32 debian9 kernel: RSP: 002b:00007ffe914217c8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  Sep 07 11:55:32 debian9 kernel: RAX: 0000000000000000 RBX: 00007fcde5ae8264 RCX: 00007fcde59a57a7
  Sep 07 11:55:32 debian9 kernel: RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000055b57556cdd0
  Sep 07 11:55:32 debian9 kernel: RBP: 000055b57556cba0 R08: 0000000000000000 R09: 00007ffe91420570
  Sep 07 11:55:32 debian9 kernel: R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
  Sep 07 11:55:32 debian9 kernel: R13: 000055b57556cdd0 R14: 000055b57556ccb8 R15: 0000000000000000
  Sep 07 11:55:32 debian9 kernel:  </TASK>

What happens is the following:

1) The cleaner kthread tries to start a transaction to delete an unused
   block group, but the metadata reservation can not be satisfied right
   away, so a reservation ticket is created and it starts the async
   metadata reclaim task (fs_info->async_reclaim_work);

2) Writeback for all the filler inodes with an i_size of 2K starts
   (generic/562 creates a lot of 2K files with the goal of filling
   metadata space). We try to create an inline extent for them, but we
   fail when trying to insert the inline extent with -ENOSPC (at
   cow_file_range_inline()) - since this is not critical, we fallback
   to non-inline mode (back to cow_file_range()), reserve extents, create
   extent maps and create the ordered extents;

3) An unmount starts, enters close_ctree();

4) The async reclaim task is flushing stuff, entering the flush states one
   by one, until it reaches RUN_DELAYED_IPUTS. There it runs all current
   delayed iputs.

   After running the delayed iputs and before calling
   btrfs_wait_on_delayed_iputs(), one or more ordered extents complete,
   and btrfs_add_delayed_iput() is called for each one through
   btrfs_finish_ordered_io() -> btrfs_put_ordered_extent(). This results
   in bumping fs_info->nr_delayed_iputs from 0 to some positive value.

   So the async reclaim task blocks at btrfs_wait_on_delayed_iputs() waiting
   for fs_info->nr_delayed_iputs to become 0;

5) The current transaction is committed by the transaction kthread, we then
   start unpinning extents and end up calling btrfs_try_granting_tickets()
   through unpin_extent_range(), since we released some space.
   This results in satisfying the ticket created by the cleaner kthread at
   step 1, waking up the cleaner kthread;

6) At close_ctree() we ask the cleaner kthread to park;

7) The cleaner kthread starts the transaction, deletes the unused block
   group, and then calls kthread_should_park(), which returns true, so it
   parks. And at this point we have the delayed iputs added by the
   completion of the ordered extents still pending;

8) Then later at close_ctree(), when we call:

       cancel_work_sync(&fs_info->async_reclaim_work);

   We hang forever, since the cleaner was parked and no one else can run
   delayed iputs after that, while the reclaim task is waiting for the
   remaining delayed iputs to be completed.

Fix this by waiting for all ordered extents to complete and running the
delayed iputs before attempting to stop the async reclaim tasks. Note that
we can not wait for ordered extents with btrfs_wait_ordered_roots() (or
other similar functions) because that waits for the BTRFS_ORDERED_COMPLETE
flag to be set on an ordered extent, but the delayed iput is added after
that, when doing the final btrfs_put_ordered_extent(). So instead wait for
the work queues used for executing ordered extent completion to be empty,
which works because we do the final put on an ordered extent at
btrfs_finish_ordered_io() (while we are in the unmount context).

Fixes: d6fd0ae25c6495 ("Btrfs: fix missing delayed iputs on unmount")
CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9f844328b1e2..28ba93d4cf23 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4527,6 +4527,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	/* clear out the rbtree of defraggable inodes */
 	btrfs_cleanup_defrag_inodes(fs_info);
 
+	/*
+	 * After we parked the cleaner kthread, ordered extents may have
+	 * completed and created new delayed iputs. If one of the async reclaim
+	 * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+	 * can hang forever trying to stop it, because if a delayed iput is
+	 * added after it ran btrfs_run_delayed_iputs() and before it called
+	 * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+	 * no one else to run iputs.
+	 *
+	 * So wait for all ongoing ordered extents to complete and then run
+	 * delayed iputs. This works because once we reach this point no one
+	 * can either create new ordered extents nor create delayed iputs
+	 * through some other means.
+	 *
+	 * Also note that btrfs_wait_ordered_roots() is not safe here, because
+	 * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+	 * but the delayed iput for the respective inode is made only when doing
+	 * the final btrfs_put_ordered_extent() (which must happen at
+	 * btrfs_finish_ordered_io() when we are unmounting).
+	 */
+	btrfs_flush_workqueue(fs_info->endio_write_workers);
+	/* Ordered extents for free space inodes. */
+	btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+	btrfs_run_delayed_iputs(fs_info);
+
 	cancel_work_sync(&fs_info->async_reclaim_work);
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
 	cancel_work_sync(&fs_info->preempt_reclaim_work);
-- 
cgit v1.2.3


From 2dd7e7bc02829eded71be2342a93dc035f5223f9 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Fri, 9 Sep 2022 15:59:55 +0900
Subject: btrfs: zoned: wait for extent buffer IOs before finishing a zone

Before sending REQ_OP_ZONE_FINISH to a zone, we need to ensure that
ongoing IOs already finished. Or, we will see a "Zone Is Full" error for
the IOs, as the ZONE_FINISH command makes the zone full.

We ensure that with btrfs_wait_block_group_reservations() and
btrfs_wait_ordered_roots() for a data block group. And, for a metadata
block group, the comparison of alloc_offset vs meta_write_pointer mostly
ensures IOs for the allocated region already sent. However, there still
can be a little time frame where the IOs are sent but not yet completed.

Introduce wait_eb_writebacks() to ensure such IOs are completed for a
metadata block group. It walks the buffer_radix to find extent buffers in
the block group and calls wait_on_extent_buffer_writeback() on them.

Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
CC: stable@vger.kernel.org # 5.19+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zoned.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 62e7007a7e46..73c6929f7be6 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1918,10 +1918,44 @@ out_unlock:
 	return ret;
 }
 
+static void wait_eb_writebacks(struct btrfs_block_group *block_group)
+{
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	const u64 end = block_group->start + block_group->length;
+	struct radix_tree_iter iter;
+	struct extent_buffer *eb;
+	void __rcu **slot;
+
+	rcu_read_lock();
+	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
+				 block_group->start >> fs_info->sectorsize_bits) {
+		eb = radix_tree_deref_slot(slot);
+		if (!eb)
+			continue;
+		if (radix_tree_deref_retry(eb)) {
+			slot = radix_tree_iter_retry(&iter);
+			continue;
+		}
+
+		if (eb->start < block_group->start)
+			continue;
+		if (eb->start >= end)
+			break;
+
+		slot = radix_tree_iter_resume(slot, &iter);
+		rcu_read_unlock();
+		wait_on_extent_buffer_writeback(eb);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct map_lookup *map;
+	const bool is_metadata = (block_group->flags &
+			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
 	int ret = 0;
 	int i;
 
@@ -1932,8 +1966,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
 	}
 
 	/* Check if we have unwritten allocated space */
-	if ((block_group->flags &
-	     (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+	if (is_metadata &&
 	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
 		spin_unlock(&block_group->lock);
 		return -EAGAIN;
@@ -1958,6 +1991,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
 		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
 		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
 					 block_group->length);
+		/* Wait for extent buffers to be written. */
+		if (is_metadata)
+			wait_eb_writebacks(block_group);
 
 		spin_lock(&block_group->lock);
 
-- 
cgit v1.2.3