From b271fee9a41ca1474d30639fd6cc912c9901d0f8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 19 Dec 2023 01:02:28 +0900 Subject: btrfs: zoned: factor out prepare_allocation_zoned() Factor out prepare_allocation_zoned() for further extension. While at it, optimize the if-branch a bit. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396aba92c57..d260b970bec7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4298,6 +4298,24 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4308,19 +4326,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } -- cgit v1.2.3 From 02444f2ac26eae6385a65fcd66915084d15dffba Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 19 Dec 2023 01:02:29 +0900 Subject: btrfs: zoned: optimize hint byte for zoned allocator Writing sequentially to a huge file on btrfs on a SMR HDD revealed a decline of the performance (220 MiB/s to 30 MiB/s after 500 minutes). The performance goes down because of increased latency of the extent allocation, which is induced by a traversing of a lot of full block groups. So, this patch optimizes the ffe_ctl->hint_byte by choosing a block group with sufficient size from the active block group list, which does not contain full block groups. After applying the patch, the performance is maintained well. Fixes: 2eda57089ea3 ("btrfs: zoned: implement sequential extent allocation") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d260b970bec7..6d680031211a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4311,6 +4311,24 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotinically + * decreasing, and this is just a hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); } return 0; -- cgit v1.2.3 From 6ff09b6b8c2fb6b3edda4ffaa173153a40653067 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 21 Dec 2023 11:47:45 +0300 Subject: btrfs: fix kvcalloc() arguments order in btrfs_ioctl_send() When compiling with gcc version 14.0.0 20231220 (experimental) and W=1, I've noticed the following warning: fs/btrfs/send.c: In function 'btrfs_ioctl_send': fs/btrfs/send.c:8208:44: warning: 'kvcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 8208 | sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), | ^ Since 'n' and 'size' arguments of 'kvcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth to fix it. Signed-off-by: Dmitry Antipov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e36550618e5..2d7519a6ce72 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, + sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1, + sizeof(*sctx->clone_roots), GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; -- cgit v1.2.3 From f03e274a8b29d1d1c1bbd7f764766cb5ca537ab7 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 3 Jan 2024 13:31:27 +0300 Subject: btrfs: ref-verify: free ref cache before clearing mount opt As clearing REF_VERIFY mount option indicates there were some errors in a ref-verify process, a ref cache is not relevant anymore and should be freed. btrfs_free_ref_cache() requires REF_VERIFY option being set so call it just before clearing the mount option. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Reported-by: syzbot+be14ed7728594dc8bd42@syzkaller.appspotmail.com Fixes: fd708b81d972 ("Btrfs: add a extent ref verify tool") CC: stable@vger.kernel.org # 5.4+ Closes: https://lore.kernel.org/lkml/000000000000e5a65c05ee832054@google.com/ Reported-by: syzbot+c563a3c79927971f950f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/0000000000007fe09705fdc6086c@google.com/ Reviewed-by: Anand Jain Signed-off-by: Fedor Pchelkin Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 6486f0d7e993..8c4fc98ca9ce 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: - if (ret) + if (ret) { + btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } return ret; } @@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) } } if (ret) { - btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; -- cgit v1.2.3 From d967c914a633ee797255261808720f791b658f24 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 22 Dec 2023 01:46:16 +0900 Subject: btrfs: fix unbalanced unlock of mapping_tree_lock The error path of btrfs_get_chunk_map() releases fs_info->mapping_tree_lock. But, it is taken and released in btrfs_find_chunk_map(). So, there is no need to do so. Fixes: 7dc66abb5a47 ("btrfs: use a dedicated data structure for chunk maps") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2..d67785be2c77 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); @@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, -- cgit v1.2.3 From b18f3b60b35a8c01c9a2a0f0d6424c6d73971dc3 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 22 Dec 2023 13:56:34 +0900 Subject: btrfs: zoned: fix lock ordering in btrfs_zone_activate() The btrfs CI reported a lockdep warning as follows by running generic generic/129. WARNING: possible circular locking dependency detected 6.7.0-rc5+ #1 Not tainted ------------------------------------------------------ kworker/u5:5/793427 is trying to acquire lock: ffff88813256d028 (&cache->lock){+.+.}-{2:2}, at: btrfs_zone_finish_one_bg+0x5e/0x130 but task is already holding lock: ffff88810a23a318 (&fs_info->zone_active_bgs_lock){+.+.}-{2:2}, at: btrfs_zone_finish_one_bg+0x34/0x130 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&fs_info->zone_active_bgs_lock){+.+.}-{2:2}: ... -> #0 (&cache->lock){+.+.}-{2:2}: ... This is because we take fs_info->zone_active_bgs_lock after a block_group's lock in btrfs_zone_activate() while doing the opposite in other places. Fix the issue by expanding the fs_info->zone_active_bgs_lock's critical section and taking it before a block_group's lock. Fixes: a7e1ac7bdc5a ("btrfs: zoned: reserve zones for an active metadata/system block group") CC: stable@vger.kernel.org # 6.6 Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 12066afc235c..ac9bbe0c4ffe 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2072,6 +2072,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2084,7 +2085,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; @@ -2104,20 +2104,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) */ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } - spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2125,8 +2122,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* For the active block group list */ btrfs_get_block_group(block_group); - - spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -2134,6 +2129,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); return ret; } -- cgit v1.2.3 From 7081929ab2572920e94d70be3d332e5c9f97095a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 4 Jan 2024 11:48:46 -0800 Subject: btrfs: don't abort filesystem when attempting to snapshot deleted subvolume If the source file descriptor to the snapshot ioctl refers to a deleted subvolume, we get the following abort: BTRFS: Transaction aborted (error -2) WARNING: CPU: 0 PID: 833 at fs/btrfs/transaction.c:1875 create_pending_snapshot+0x1040/0x1190 [btrfs] Modules linked in: pata_acpi btrfs ata_piix libata scsi_mod virtio_net blake2b_generic xor net_failover virtio_rng failover scsi_common rng_core raid6_pq libcrc32c CPU: 0 PID: 833 Comm: t_snapshot_dele Not tainted 6.7.0-rc6 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-1.fc39 04/01/2014 RIP: 0010:create_pending_snapshot+0x1040/0x1190 [btrfs] RSP: 0018:ffffa09c01337af8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff9982053e7c78 RCX: 0000000000000027 RDX: ffff99827dc20848 RSI: 0000000000000001 RDI: ffff99827dc20840 RBP: ffffa09c01337c00 R08: 0000000000000000 R09: ffffa09c01337998 R10: 0000000000000003 R11: ffffffffb96da248 R12: fffffffffffffffe R13: ffff99820535bb28 R14: ffff99820b7bd000 R15: ffff99820381ea80 FS: 00007fe20aadabc0(0000) GS:ffff99827dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559a120b502f CR3: 00000000055b6000 CR4: 00000000000006f0 Call Trace: ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? __warn+0x81/0x130 ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? report_bug+0x171/0x1a0 ? handle_bug+0x3a/0x70 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? create_pending_snapshot+0x1040/0x1190 [btrfs] create_pending_snapshots+0x92/0xc0 [btrfs] btrfs_commit_transaction+0x66b/0xf40 [btrfs] btrfs_mksubvol+0x301/0x4d0 [btrfs] btrfs_mksnapshot+0x80/0xb0 [btrfs] __btrfs_ioctl_snap_create+0x1c2/0x1d0 [btrfs] btrfs_ioctl_snap_create_v2+0xc4/0x150 [btrfs] btrfs_ioctl+0x8a6/0x2650 [btrfs] ? kmem_cache_free+0x22/0x340 ? do_sys_openat2+0x97/0xe0 __x64_sys_ioctl+0x97/0xd0 do_syscall_64+0x46/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 RIP: 0033:0x7fe20abe83af RSP: 002b:00007ffe6eff1360 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fe20abe83af RDX: 00007ffe6eff23c0 RSI: 0000000050009417 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000000 R09: 00007fe20ad16cd0 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffe6eff13c0 R14: 00007fe20ad45000 R15: 0000559a120b6d58 ---[ end trace 0000000000000000 ]--- BTRFS: error (device vdc: state A) in create_pending_snapshot:1875: errno=-2 No such entry BTRFS info (device vdc: state EA): forced readonly BTRFS warning (device vdc: state EA): Skipping commit of aborted transaction. BTRFS: error (device vdc: state EA) in cleanup_transaction:2055: errno=-2 No such entry This happens because create_pending_snapshot() initializes the new root item as a copy of the source root item. This includes the refs field, which is 0 for a deleted subvolume. The call to btrfs_insert_root() therefore inserts a root with refs == 0. btrfs_get_new_fs_root() then finds the root and returns -ENOENT if refs == 0, which causes create_pending_snapshot() to abort. Fix it by checking the source root's refs before attempting the snapshot, but after locking subvol_sem to avoid racing with deletion. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Sweet Tea Dorminy Reviewed-by: Anand Jain Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4e50b62db2a8..fea5d37528b8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EOPNOTSUPP; } + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; -- cgit v1.2.3 From 3324d0547861b16cf436d54abba7052e0c8aa9de Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 4 Jan 2024 11:48:47 -0800 Subject: btrfs: avoid copying BTRFS_ROOT_SUBVOL_DEAD flag to snapshot of subvolume being deleted Sweet Tea spotted a race between subvolume deletion and snapshotting that can result in the root item for the snapshot having the BTRFS_ROOT_SUBVOL_DEAD flag set. The race is: Thread 1 | Thread 2 ----------------------------------------------|---------- btrfs_delete_subvolume | btrfs_set_root_flags(BTRFS_ROOT_SUBVOL_DEAD)| |btrfs_mksubvol | down_read(subvol_sem) | create_snapshot | ... | create_pending_snapshot | copy root item from source down_write(subvol_sem) | This flag is only checked in send and swap activate, which this would cause to fail mysteriously. create_snapshot() now checks the root refs to reject a deleted subvolume, so we can fix this by locking subvol_sem earlier so that the BTRFS_ROOT_SUBVOL_DEAD flag and the root refs are updated atomically. CC: stable@vger.kernel.org # 4.14+ Reported-by: Sweet Tea Dorminy Reviewed-by: Sweet Tea Dorminy Reviewed-by: Anand Jain Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b3e39610cc95..7bcc1c03437a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) u64 root_flags; int ret; + down_write(&fs_info->subvol_sem); + /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit @@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - down_write(&fs_info->subvol_sem); - ret = may_destroy_subvol(dest); if (ret) - goto out_up_write; + goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) - goto out_up_write; + goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -4563,15 +4565,17 @@ out_end_trans: inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); +out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - } else { + } +out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); -- cgit v1.2.3 From 173431b274a9a54fc10b273b46e67f46bcf62d2e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Jan 2024 08:58:26 +1030 Subject: btrfs: defrag: reject unknown flags of btrfs_ioctl_defrag_range_args Add extra sanity check for btrfs_ioctl_defrag_range_args::flags. This is not really to enhance fuzzing tests, but as a preparation for future expansion on btrfs_ioctl_defrag_range_args. In the future we're going to add new members, allowing more fine tuning for btrfs defrag. Without the -ENONOTSUPP error, there would be no way to detect if the kernel supports those new defrag features. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fea5d37528b8..5d42319b43f2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2602,6 +2602,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EFAULT; goto out; } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; + goto out; + } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; -- cgit v1.2.3 From 2c25716dcc25a0420c4ad49d6e6bf61e60a21434 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:44 +1030 Subject: btrfs: zlib: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 1 (zlib) Which has an inline compressed extent at file offset 0, and its decompressed size is 4K, allowing us to reflink that 4K range to another location (which will not be compressed). If we do such reflink on a subpage system, it would fail like this: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zlib_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should switch our output buffer. In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer, since the whole input/output buffer should never exceed one sector. Note: The above assumption is only not true if we're going to support multi-page sectorsize. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modifications: - Rename @start_byte to @dest_pgoff to properly express its meaning - Add an extra ASSERT() inside btrfs_decompress() to make sure the input/output size never exceeds one sector. - Use Z_FINISH flag to make sure the decompression happens in one go - Remove the loop needed to switch input/output buffers - Use correct destination offset inside the destination page - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And resulted a correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 23 +++++++++++----- fs/btrfs/compression.h | 2 +- fs/btrfs/zlib.c | 73 +++++++++++++------------------------------------- 3 files changed, 36 insertions(+), 62 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 193168214eeb..68345f73d429 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + const u8 *data_in, struct page *dest_page, + unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: /* @@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * start_byte tells us the offset into the compressed data we're interested in */ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + unsigned long dest_pgoff, size_t srclen, size_t destlen) { + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); struct list_head *workspace; + const u32 sectorsize = fs_info->sectorsize; int ret; + /* + * The full destination page range should not exceed the page size. + * And the @destlen should not exceed sectorsize, as this is only called for + * inline file extents, which should not exceed sectorsize. + */ + ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + workspace = get_workspace(type, 0); ret = compression_decompress(type, workspace, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 93cc92974dee..2b4dfb1b010c 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 36cf1f0e338e..8da66ea699e8 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -354,18 +354,13 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - unsigned long bytes_left; - unsigned long total_out = 0; - unsigned long pg_offset = 0; - - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes_left = destlen; + unsigned long to_copy; workspace->strm.next_in = data_in; workspace->strm.avail_in = srclen; @@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return -EIO; } - while (bytes_left > 0) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->strm.total_out; - - if (total_out == buf_start) { - ret = -EIO; - break; - } - - if (total_out <= start_byte) - goto next; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = min(bytes, bytes_left); + /* + * Everything (in/out buf) should be at most one sector, there should + * be no need to switch any input/output buffer. + */ + ret = zlib_inflate(&workspace->strm, Z_FINISH); + to_copy = min(workspace->strm.total_out, destlen); + if (ret != Z_STREAM_END) + goto out; - memcpy_to_page(dest_page, pg_offset, - workspace->buf + buf_offset, bytes); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); - pg_offset += bytes; - bytes_left -= bytes; -next: - workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = workspace->buf_size; - } - - if (ret != Z_STREAM_END && bytes_left != 0) +out: + if (unlikely(to_copy != destlen)) { + pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", + to_copy, destlen); ret = -EIO; - else + } else { ret = 0; + } zlib_inflateEnd(&workspace->strm); - /* - * this should only happen if zlib returned fewer bytes than we - * expected. btrfs_get_block is responsible for zeroing from the - * end of the inline extent (destlen) to the end of the page - */ - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); - } + if (unlikely(to_copy < destlen)) + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); return ret; } -- cgit v1.2.3 From 6a69631ec9b1b23784c012a855bbb23012a6dbeb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:45 +1030 Subject: btrfs: lzo: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 2 (lzo) Then trying to reflink that extent in an aarch64 system with 64K page size, the reflink would just fail: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zlib_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should error out, without any proper explanation (this is from the very beginning of btrfs). In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer nor error out, since the whole input/output buffer should never exceed one sector. Note: The above assumption is only not true if we're going to support multi-page sectorsize. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modifications: - Rename @start_byte to @dest_pgoff to properly express its meaning - Use @sectorsize other than PAGE_SIZE to properly initialize the output buffer size - Use correct destination offset inside the destination page - Use memcpy_to_page() to copy the contents to the destination page - Use memzero_page() to zero out the tailing part - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And results the correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 2 +- fs/btrfs/lzo.c | 34 +++++++++------------------------- 2 files changed, 10 insertions(+), 26 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2b4dfb1b010c..afd7e50d073d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1131d5a29d61..e43bc0fdc74e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { pr_warn("BTRFS: decompress failed!\n"); @@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. - */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } -- cgit v1.2.3 From 1e7f6def8b2370ecefb54b3c8f390ff894b0c51b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:46 +1030 Subject: btrfs: zstd: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 3 (zstd) Then trying to reflink that extent in an aarch64 system with 64K page size, the reflink would just fail: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zstd_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should error out, without any proper explanation (this is copied from other decompression code). In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer nor error out, since the whole input/output buffer should never exceed one sector, thus we should not need to do any buffer switch. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modification: - Rename @start_byte to @dest_pgoff to properly express its meaning - Use @sectorsize other than PAGE_SIZE to properly initialize the output buffer size - Use correct destination offset inside the destination page - Simplify the main loop Since the input/output buffer should never switch, we only need one zstd_decompress_stream() call. - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And results the correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 2 +- fs/btrfs/zstd.c | 75 +++++++++++++++----------------------------------- 2 files changed, 23 insertions(+), 54 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index afd7e50d073d..97fe3ebf11a2 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -169,7 +169,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0d66db8bc1d4..346c46d88d07 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -20,6 +20,7 @@ #include "misc.h" #include "compression.h" #include "ctree.h" +#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -618,80 +619,48 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - size_t ret2; - unsigned long total_out = 0; - unsigned long pg_offset = 0; + unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); - ret = -EIO; goto finish; } - destlen = min_t(size_t, destlen, PAGE_SIZE); - workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; - - ret2 = 1; - while (pg_offset < destlen - && workspace->in_buf.pos < workspace->in_buf.size) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - /* Check if the frame is over and we still need more input */ - if (ret2 == 0) { - pr_debug("BTRFS: zstd_decompress_stream ended early\n"); - ret = -EIO; - goto finish; - } - ret2 = zstd_decompress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); - ret = -EIO; - goto finish; - } - - buf_start = total_out; - total_out += workspace->out_buf.pos; - workspace->out_buf.pos = 0; - - if (total_out <= start_byte) - continue; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min_t(unsigned long, destlen - pg_offset, - workspace->out_buf.size - buf_offset); - - memcpy_to_page(dest_page, pg_offset, - workspace->out_buf.dst + buf_offset, bytes); - - pg_offset += bytes; + workspace->out_buf.size = sectorsize; + + /* + * Since both input and output buffers should not exceed one sector, + * one call should end the decompression. + */ + ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (zstd_is_error(ret)) { + pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", + zstd_get_error_code(ret)); + goto finish; } - ret = 0; + to_copy = workspace->out_buf.pos; + memcpy_to_page(dest_page, dest_pgoff + to_copy, workspace->out_buf.dst, to_copy); finish: - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); + /* Error or early end. */ + if (unlikely(to_copy < destlen)) { + ret = -EIO; + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); } return ret; } -- cgit v1.2.3 From f398e70dd69e6ceea71463a5380e6118f219197e Mon Sep 17 00:00:00 2001 From: Chung-Chiang Cheng Date: Fri, 12 Jan 2024 15:41:05 +0800 Subject: btrfs: tree-checker: fix inline ref size in error messages The error message should accurately reflect the size rather than the type. Fixes: f82d1c7ca8ae ("btrfs: tree-checker: Add EXTENT_ITEM and METADATA_ITEM check") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Chung-Chiang Cheng Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 50fdc69fdddf..6eccf8496486 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", - ptr, inline_type, end); + ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } -- cgit v1.2.3 From a208b3f132b48e1f94f620024e66fea635925877 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Jan 2024 20:30:26 +0100 Subject: btrfs: don't warn if discard range is not aligned to sector There's a warning in btrfs_issue_discard() when the range is not aligned to 512 bytes, originally added in 4d89d377bbb0 ("btrfs: btrfs_issue_discard ensure offset/length are aligned to sector boundaries"). We can't do sub-sector writes anyway so the adjustment is the only thing that we can do and the warning is unnecessary. CC: stable@vger.kernel.org # 4.19+ Reported-by: syzbot+4a4f1eba14eb5c3417d1@syzkaller.appspotmail.com Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6d680031211a..8e8cc1111277 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 bytes_left, end; u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. */ + if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; -- cgit v1.2.3 From 2018ef1d9ac3e95448b9206adc3425b0431c2411 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 10 Jan 2024 12:54:41 -0500 Subject: btrfs: use the original mount's mount options for the legacy reconfigure btrfs/330, which tests our old trick to allow mount -o ro,subvol=/x /dev/sda1 /foo mount -o rw,subvol=/y /dev/sda1 /bar fails on the block group tree. This is because we aren't preserving the mount options for what is essentially a remount, and thus we're ending up without the FREE_SPACE_TREE mount option, which triggers our free space tree delete codepath. This isn't possible with the block group tree and thus it falls over. Fix this by making sure we copy the existing mount options for the existing fs mount over in this case. Fixes: f044b318675f ("btrfs: handle the ro->rw transition for mounting different subvolumes") Reviewed-by: Neal Gompa Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3a677b808f0f..f192f8fe0ce6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1451,6 +1451,14 @@ static int btrfs_reconfigure(struct fs_context *fc) btrfs_info_to_ctx(fs_info, &old_ctx); + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; + sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); -- cgit v1.2.3 From 1e61b8c672ab2f59b282c8d48a29c14b52c0f5b4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 10 Jan 2024 17:14:21 -0500 Subject: btrfs: don't unconditionally call folio_start_writeback in subpage In the normal case we check if a page is under writeback and skip it before we attempt to begin writeback. The exception is subpage metadata writes, where we know we don't have an eb under writeback and we're doing it one eb at a time. Since b5612c368648 ("mm: return void from folio_start_writeback() and related functions") we now will BUG_ON() if we call folio_start_writeback() on a folio that's already under writeback. Previously folio_start_writeback() would bail if writeback was already started. Fix this in the subpage code by checking if we have writeback set and skipping it if we do. This fixes the panic we were seeing on subpage. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index d9a30b93d543..277dd6d312ee 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - folio_start_writeback(folio); + if (!folio_test_writeback(folio)) + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } -- cgit v1.2.3 From f546c4282673497a06ecb6190b50ae7f6c85b02f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 17 Jan 2024 11:02:25 +1030 Subject: btrfs: scrub: avoid use-after-free when chunk length is not 64K aligned [BUG] There is a bug report that, on a ext4-converted btrfs, scrub leads to various problems, including: - "unable to find chunk map" errors BTRFS info (device vdb): scrub: started on devid 1 BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 4096 BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 45056 This would lead to unrepariable errors. - Use-after-free KASAN reports: ================================================================== BUG: KASAN: slab-use-after-free in __blk_rq_map_sg+0x18f/0x7c0 Read of size 8 at addr ffff8881013c9040 by task btrfs/909 CPU: 0 PID: 909 Comm: btrfs Not tainted 6.7.0-x64v3-dbg #11 c50636e9419a8354555555245df535e380563b2b Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2023.11-2 12/24/2023 Call Trace: dump_stack_lvl+0x43/0x60 print_report+0xcf/0x640 kasan_report+0xa6/0xd0 __blk_rq_map_sg+0x18f/0x7c0 virtblk_prep_rq.isra.0+0x215/0x6a0 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff] virtio_queue_rqs+0xc4/0x310 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff] blk_mq_flush_plug_list.part.0+0x780/0x860 __blk_flush_plug+0x1ba/0x220 blk_finish_plug+0x3b/0x60 submit_initial_group_read+0x10a/0x290 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] flush_scrub_stripes+0x38e/0x430 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] scrub_stripe+0x82a/0xae0 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] scrub_chunk+0x178/0x200 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] scrub_enumerate_chunks+0x4bc/0xa30 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] btrfs_scrub_dev+0x398/0x810 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] btrfs_ioctl+0x4b9/0x3020 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] __x64_sys_ioctl+0xbd/0x100 do_syscall_64+0x5d/0xe0 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7f47e5e0952b - Crash, mostly due to above use-after-free [CAUSE] The converted fs has the following data chunk layout: item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 2214658048) itemoff 16025 itemsize 80 length 86016 owner 2 stripe_len 65536 type DATA|single For above logical bytenr 2214744064, it's at the chunk end (2214658048 + 86016 = 2214744064). This means btrfs_submit_bio() would split the bio, and trigger endio function for both of the two halves. However scrub_submit_initial_read() would only expect the endio function to be called once, not any more. This means the first endio function would already free the bbio::bio, leaving the bvec freed, thus the 2nd endio call would lead to use-after-free. [FIX] - Make sure scrub_read_endio() only updates bits in its range Since we may read less than 64K at the end of the chunk, we should not touch the bits beyond chunk boundary. - Make sure scrub_submit_initial_read() only to read the chunk range This is done by calculating the real number of sectors we need to read, and add sector-by-sector to the bio. Thankfully the scrub read repair path won't need extra fixes: - scrub_stripe_submit_repair_read() With above fixes, we won't update error bit for range beyond chunk, thus scrub_stripe_submit_repair_read() should never submit any read beyond the chunk. Reported-by: Rongrong Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure") Tested-by: Rongrong Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a01807cbd4d4..2d81b1a18a04 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1098,12 +1098,22 @@ out: static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + int num_sectors; + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); - bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); + bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1701,6 +1711,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1715,14 +1728,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); - /* Read the whole stripe. */ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; - for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + /* Read the whole range inside the chunk boundary. */ + for (unsigned int cur = 0; cur < nr_sectors; cur++) { + struct page *page = scrub_stripe_get_page(stripe, cur); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); int ret; - ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); /* We should have allocated enough bio vectors. */ - ASSERT(ret == PAGE_SIZE); + ASSERT(ret == fs_info->sectorsize); } atomic_inc(&stripe->pending_io); -- cgit v1.2.3 From 7f2d219e78e95a137a9c76fddac7ff8228260439 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 17 Jan 2024 11:02:26 +1030 Subject: btrfs: scrub: limit RST scrub to chunk boundary [BUG] If there is an extent beyond chunk boundary, currently RST scrub would error out. [CAUSE] In scrub_submit_extent_sector_read(), we completely rely on extent_sector_bitmap, which is populated using extent tree. The extent tree can be corrupted that there is an extent item beyond a chunk. In that case, RST scrub would fail and error out. [FIX] Despite the extent_sector_bitmap usage, also limit the read to chunk boundary. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2d81b1a18a04..0123d2728923 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1646,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1656,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct page *page = scrub_stripe_get_page(stripe, i); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + /* We're beyond the chunk boundary, no need to read anymore. */ + if (i >= nr_sectors) + break; + /* The current sector cannot be merged, submit the bio. */ if (bbio && ((i > 0 && -- cgit v1.2.3