summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-26 11:41:38 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-26 11:41:38 -0700
commitcc423f6337d0a5ff1906f3b3d465d28c0d1705f6 (patch)
treefafc40aa7dc3ecd9800239f647d4fe21ee5db6af
parente940efa936be65866db9ce20798b13fdc6b3891a (diff)
parent8a4a0b2a3eaf75ca8854f856ef29690c12b2f531 (diff)
downloadlinux-cc423f6337d0a5ff1906f3b3d465d28c0d1705f6.tar.gz
linux-cc423f6337d0a5ff1906f3b3d465d28c0d1705f6.tar.bz2
linux-cc423f6337d0a5ff1906f3b3d465d28c0d1705f6.zip
Merge tag 'for-6.5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "Mainly core changes, refactoring and optimizations. Performance is improved in some areas, overall there may be a cumulative improvement due to refactoring that removed lookups in the IO path or simplified IO submission tracking. Core: - submit IO synchronously for fast checksums (crc32c and xxhash), remove high priority worker kthread - read extent buffer in one go, simplify IO tracking, bio submission and locking - remove additional tracking of redirtied extent buffers, originally added for zoned mode but actually not needed - track ordered extent pointer in bio to avoid rbtree lookups during IO - scrub, use recovered data stripes as cache to avoid unnecessary read - in zoned mode, optimize logical to physical mappings of extents - remove PageError handling, not set by VFS nor writeback - cleanups, refactoring, better structure packing - lots of error handling improvements - more assertions, lockdep annotations - print assertion failure with the exact line where it happens - tracepoint updates - more debugging prints Performance: - speedup in fsync(), better tracking of inode logged status can avoid transaction commit - IO path structures track logical offsets in data structures and does not need to look it up User visible changes: - don't commit transaction for every created subvolume, this can reduce time when many subvolumes are created in a batch - print affected files when relocation fails - trigger orphan file cleanup during START_SYNC ioctl Notable fixes: - fix crash when disabling quota and relocation - fix crashes when removing roots from drity list - fix transacion abort during relocation when converting from newer profiles not covered by fallback - in zoned mode, stop reclaiming block groups if filesystem becomes read-only - fix rare race condition in tree mod log rewind that can miss some btree node slots - with enabled fsverity, drop up-to-date page bit in case the verification fails" * tag 'for-6.5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (194 commits) btrfs: fix race between quota disable and relocation btrfs: add comment to struct btrfs_fs_info::dirty_cowonly_roots btrfs: fix race when deleting free space root from the dirty cow roots list btrfs: fix race when deleting quota root from the dirty cow roots list btrfs: tracepoints: also show actual number of the outstanding extents btrfs: update i_version in update_dev_time btrfs: make btrfs_compressed_bioset static btrfs: add handling for RAID1C23/DUP to btrfs_reduce_alloc_profile btrfs: scrub: remove btrfs_fs_info::scrub_wr_completion_workers btrfs: scrub: remove scrub_ctx::csum_list member btrfs: do not BUG_ON after failure to migrate space during truncation btrfs: do not BUG_ON on failure to get dir index for new snapshot btrfs: send: do not BUG_ON() on unexpected symlink data extent btrfs: do not BUG_ON() when dropping inode items from log root btrfs: replace BUG_ON() at split_item() with proper error handling btrfs: do not BUG_ON() on tree mod log failures at btrfs_del_ptr() btrfs: do not BUG_ON() on tree mod log failures at insert_ptr() btrfs: do not BUG_ON() on tree mod log failure at insert_new_root() btrfs: do not BUG_ON() on tree mod log failures at push_nodes_for_insert() btrfs: abort transaction at update_ref_for_cow() when ref count is zero ...
-rw-r--r--fs/btrfs/async-thread.c44
-rw-r--r--fs/btrfs/async-thread.h3
-rw-r--r--fs/btrfs/bio.c122
-rw-r--r--fs/btrfs/bio.h29
-rw-r--r--fs/btrfs/block-group.c47
-rw-r--r--fs/btrfs/block-group.h9
-rw-r--r--fs/btrfs/block-rsv.c19
-rw-r--r--fs/btrfs/block-rsv.h2
-rw-r--r--fs/btrfs/btrfs_inode.h23
-rw-r--r--fs/btrfs/check-integrity.c21
-rw-r--r--fs/btrfs/compression.c50
-rw-r--r--fs/btrfs/compression.h7
-rw-r--r--fs/btrfs/ctree.c429
-rw-r--r--fs/btrfs/ctree.h6
-rw-r--r--fs/btrfs/defrag.c3
-rw-r--r--fs/btrfs/delayed-ref.c110
-rw-r--r--fs/btrfs/delayed-ref.h25
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/discard.c34
-rw-r--r--fs/btrfs/discard.h1
-rw-r--r--fs/btrfs/disk-io.c453
-rw-r--r--fs/btrfs/disk-io.h7
-rw-r--r--fs/btrfs/extent-io-tree.c37
-rw-r--r--fs/btrfs/extent-io-tree.h62
-rw-r--r--fs/btrfs/extent-tree.c186
-rw-r--r--fs/btrfs/extent-tree.h2
-rw-r--r--fs/btrfs/extent_io.c848
-rw-r--r--fs/btrfs/extent_io.h11
-rw-r--r--fs/btrfs/extent_map.c110
-rw-r--r--fs/btrfs/extent_map.h6
-rw-r--r--fs/btrfs/file-item.c90
-rw-r--r--fs/btrfs/file-item.h1
-rw-r--r--fs/btrfs/file.c12
-rw-r--r--fs/btrfs/free-space-cache.c122
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/free-space-tree.c3
-rw-r--r--fs/btrfs/fs.h5
-rw-r--r--fs/btrfs/inode-item.h16
-rw-r--r--fs/btrfs/inode.c592
-rw-r--r--fs/btrfs/ioctl.c23
-rw-r--r--fs/btrfs/locking.c5
-rw-r--r--fs/btrfs/lzo.c6
-rw-r--r--fs/btrfs/messages.c8
-rw-r--r--fs/btrfs/messages.h15
-rw-r--r--fs/btrfs/misc.h20
-rw-r--r--fs/btrfs/ordered-data.c364
-rw-r--r--fs/btrfs/ordered-data.h27
-rw-r--r--fs/btrfs/print-tree.c16
-rw-r--r--fs/btrfs/print-tree.h4
-rw-r--r--fs/btrfs/qgroup.c20
-rw-r--r--fs/btrfs/raid56.c49
-rw-r--r--fs/btrfs/raid56.h3
-rw-r--r--fs/btrfs/relocation.c61
-rw-r--r--fs/btrfs/relocation.h3
-rw-r--r--fs/btrfs/scrub.c125
-rw-r--r--fs/btrfs/send.c16
-rw-r--r--fs/btrfs/subpage.c97
-rw-r--r--fs/btrfs/subpage.h12
-rw-r--r--fs/btrfs/super.c1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c16
-rw-r--r--fs/btrfs/transaction.c16
-rw-r--r--fs/btrfs/transaction.h3
-rw-r--r--fs/btrfs/tree-checker.c152
-rw-r--r--fs/btrfs/tree-checker.h29
-rw-r--r--fs/btrfs/tree-log.c58
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/tree-mod-log.c257
-rw-r--r--fs/btrfs/volumes.c173
-rw-r--r--fs/btrfs/volumes.h57
-rw-r--r--fs/btrfs/zlib.c2
-rw-r--r--fs/btrfs/zoned.c159
-rw-r--r--fs/btrfs/zoned.h8
-rw-r--r--fs/btrfs/zstd.c2
-rw-r--r--include/trace/events/btrfs.h39
-rw-r--r--tools/objtool/check.c1
75 files changed, 2737 insertions, 2667 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index aac240430efe..ce083e99ef68 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
return atomic_read(&wq->pending) > wq->thresh * 2;
}
+static void btrfs_init_workqueue(struct btrfs_workqueue *wq,
+ struct btrfs_fs_info *fs_info)
+{
+ wq->fs_info = fs_info;
+ atomic_set(&wq->pending, 0);
+ INIT_LIST_HEAD(&wq->ordered_list);
+ spin_lock_init(&wq->list_lock);
+ spin_lock_init(&wq->thres_lock);
+}
+
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
const char *name, unsigned int flags,
int limit_active, int thresh)
@@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
if (!ret)
return NULL;
- ret->fs_info = fs_info;
+ btrfs_init_workqueue(ret, fs_info);
+
ret->limit_active = limit_active;
- atomic_set(&ret->pending, 0);
if (thresh == 0)
thresh = DFT_THRESHOLD;
/* For low threshold, disabling threshold is a better choice */
@@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
return NULL;
}
- INIT_LIST_HEAD(&ret->ordered_list);
- spin_lock_init(&ret->list_lock);
- spin_lock_init(&ret->thres_lock);
+ trace_btrfs_workqueue_alloc(ret, name);
+ return ret;
+}
+
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+ struct btrfs_fs_info *fs_info, const char *name,
+ unsigned int flags)
+{
+ struct btrfs_workqueue *ret;
+
+ ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return NULL;
+
+ btrfs_init_workqueue(ret, fs_info);
+
+ /* Ordered workqueues don't allow @max_active adjustments. */
+ ret->limit_active = 1;
+ ret->current_active = 1;
+ ret->thresh = NO_THRESHOLD;
+
+ ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name);
+ if (!ret->normal_wq) {
+ kfree(ret);
+ return NULL;
+ }
+
trace_btrfs_workqueue_alloc(ret, name);
return ret;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 6e2596ddae10..30f66c5e2e6e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
unsigned int flags,
int limit_active,
int thresh);
+struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
+ struct btrfs_fs_info *fs_info, const char *name,
+ unsigned int flags);
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
btrfs_func_t ordered_func, btrfs_func_t ordered_free);
void btrfs_queue_work(struct btrfs_workqueue *wq,
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index b3ad0f51e616..12b12443efaa 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -27,6 +27,17 @@ struct btrfs_failed_bio {
atomic_t repair_count;
};
+/* Is this a data path I/O that needs storage layer checksum and repair? */
+static inline bool is_data_bbio(struct btrfs_bio *bbio)
+{
+ return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
+}
+
+static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
+{
+ return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
+}
+
/*
* Initialize a btrfs_bio structure. This skips the embedded bio itself as it
* is already initialized by the block layer.
@@ -61,20 +72,6 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
return bbio;
}
-static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
-{
- struct btrfs_ordered_extent *ordered;
- int ret;
-
- ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
- if (WARN_ON_ONCE(!ordered))
- return BLK_STS_IOERR;
- ret = btrfs_extract_ordered_extent(bbio, ordered);
- btrfs_put_ordered_extent(ordered);
-
- return errno_to_blk_status(ret);
-}
-
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
struct btrfs_bio *orig_bbio,
u64 map_length, bool use_append)
@@ -95,13 +92,41 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
bbio->inode = orig_bbio->inode;
bbio->file_offset = orig_bbio->file_offset;
- if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
- orig_bbio->file_offset += map_length;
-
+ orig_bbio->file_offset += map_length;
+ if (bbio_has_ordered_extent(bbio)) {
+ refcount_inc(&orig_bbio->ordered->refs);
+ bbio->ordered = orig_bbio->ordered;
+ }
atomic_inc(&orig_bbio->pending_ios);
return bbio;
}
+/* Free a bio that was never submitted to the underlying device. */
+static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
+{
+ if (bbio_has_ordered_extent(bbio))
+ btrfs_put_ordered_extent(bbio->ordered);
+ bio_put(&bbio->bio);
+}
+
+static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
+{
+ if (bbio_has_ordered_extent(bbio)) {
+ struct btrfs_ordered_extent *ordered = bbio->ordered;
+
+ bbio->end_io(bbio);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ bbio->end_io(bbio);
+ }
+}
+
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+ bbio->bio.bi_status = status;
+ __btrfs_bio_end_io(bbio);
+}
+
static void btrfs_orig_write_end_io(struct bio *bio);
static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
@@ -130,12 +155,12 @@ static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
if (bbio->bio.bi_status)
btrfs_bbio_propagate_error(bbio, orig_bbio);
- bio_put(&bbio->bio);
+ btrfs_cleanup_bio(bbio);
bbio = orig_bbio;
}
if (atomic_dec_and_test(&bbio->pending_ios))
- bbio->end_io(bbio);
+ __btrfs_bio_end_io(bbio);
}
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
@@ -327,7 +352,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
/* Metadata reads are checked and repaired by the submitter. */
- if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
+ if (is_data_bbio(bbio))
btrfs_check_read_bio(bbio, bbio->bio.bi_private);
else
btrfs_orig_bbio_end_io(bbio);
@@ -348,7 +373,7 @@ static void btrfs_simple_end_io(struct bio *bio)
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
} else {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
btrfs_record_physical_zoned(bbio);
btrfs_orig_bbio_end_io(bbio);
}
@@ -361,8 +386,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_bio_counter_dec(bioc->fs_info);
bbio->mirror_num = bioc->mirror_num;
- if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
- !(bbio->bio.bi_opf & REQ_META))
+ if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
btrfs_check_read_bio(bbio, NULL);
else
btrfs_orig_bbio_end_io(bbio);
@@ -472,13 +496,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
struct btrfs_io_stripe *smap, int mirror_num)
{
- /* Do not leak our private flag into the block layer. */
- bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED;
-
if (!bioc) {
/* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
+ if (bio_op(bio) != REQ_OP_READ)
+ btrfs_bio(bio)->orig_physical = smap->physical;
bio->bi_private = smap->dev;
bio->bi_end_io = btrfs_simple_end_io;
btrfs_submit_dev_bio(smap->dev, bio);
@@ -574,27 +597,20 @@ static void run_one_async_free(struct btrfs_work *work)
static bool should_async_write(struct btrfs_bio *bbio)
{
- /*
- * If the I/O is not issued by fsync and friends, (->sync_writers != 0),
- * then try to defer the submission to a workqueue to parallelize the
- * checksum calculation.
- */
- if (atomic_read(&bbio->inode->sync_writers))
+ /* Submit synchronously if the checksum implementation is fast. */
+ if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
return false;
/*
- * Submit metadata writes synchronously if the checksum implementation
- * is fast, or we are on a zoned device that wants I/O to be submitted
- * in order.
+ * Try to defer the submission to a workqueue to parallelize the
+ * checksum calculation unless the I/O is issued synchronously.
*/
- if (bbio->bio.bi_opf & REQ_META) {
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ if (op_is_sync(bbio->bio.bi_opf))
+ return false;
- if (btrfs_is_zoned(fs_info))
- return false;
- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
- return false;
- }
+ /* Zoned devices require I/O to be submitted in order. */
+ if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
+ return false;
return true;
}
@@ -622,10 +638,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
- if (op_is_sync(bbio->bio.bi_opf))
- btrfs_queue_work(fs_info->hipri_workers, &async->work);
- else
- btrfs_queue_work(fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->workers, &async->work);
return true;
}
@@ -635,7 +648,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
struct btrfs_fs_info *fs_info = bbio->fs_info;
struct btrfs_bio *orig_bbio = bbio;
struct bio *bio = &bbio->bio;
- u64 logical = bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bio->bi_iter.bi_size;
u64 map_length = length;
bool use_append = btrfs_use_zone_append(bbio);
@@ -645,8 +658,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
int error;
btrfs_bio_counter_inc_blocked(fs_info);
- error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num, 1);
+ error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num, 1);
if (error) {
ret = errno_to_blk_status(error);
goto fail;
@@ -665,7 +678,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
* Save the iter for the end_io handler and preload the checksums for
* data reads.
*/
- if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
+ if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
if (ret)
@@ -676,9 +689,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
if (use_append) {
bio->bi_opf &= ~REQ_OP_WRITE;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
- ret = btrfs_bio_extract_ordered_extent(bbio);
- if (ret)
- goto fail_put_bio;
}
/*
@@ -695,6 +705,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
ret = btrfs_bio_csum(bbio);
if (ret)
goto fail_put_bio;
+ } else if (use_append) {
+ ret = btrfs_alloc_dummy_sum(bbio);
+ if (ret)
+ goto fail_put_bio;
}
}
@@ -704,7 +718,7 @@ done:
fail_put_bio:
if (map_length < length)
- bio_put(bio);
+ btrfs_cleanup_bio(bbio);
fail:
btrfs_bio_counter_dec(fs_info);
btrfs_bio_end_io(orig_bbio, ret);
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index a8eca3a65673..ca79decee060 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -39,8 +39,8 @@ struct btrfs_bio {
union {
/*
- * Data checksumming and original I/O information for internal
- * use in the btrfs_submit_bio machinery.
+ * For data reads: checksumming and original I/O information.
+ * (for internal use in the btrfs_submit_bio machinery only)
*/
struct {
u8 *csum;
@@ -48,7 +48,20 @@ struct btrfs_bio {
struct bvec_iter saved_iter;
};
- /* For metadata parentness verification. */
+ /*
+ * For data writes:
+ * - ordered extent covering the bio
+ * - pointer to the checksums for this bio
+ * - original physical address from the allocator
+ * (for zone append only)
+ */
+ struct {
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_sum *sums;
+ u64 orig_physical;
+ };
+
+ /* For metadata reads: parentness verification. */
struct btrfs_tree_parent_check parent_check;
};
@@ -84,15 +97,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
struct btrfs_fs_info *fs_info,
btrfs_bio_end_io_t end_io, void *private);
-
-static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
-{
- bbio->bio.bi_status = status;
- bbio->end_io(bbio);
-}
-
-/* Bio only refers to one ordered extent. */
-#define REQ_BTRFS_ONE_ORDERED REQ_DRV
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
/* Submit using blkcg_punt_bio_submit. */
#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e97af2e510c3..48ae509f2ac2 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -95,14 +95,21 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
}
allowed &= flags;
- if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ /* Select the highest-redundancy RAID level. */
+ if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
+ allowed = BTRFS_BLOCK_GROUP_RAID1C4;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
+ allowed = BTRFS_BLOCK_GROUP_RAID1C3;
else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
allowed = BTRFS_BLOCK_GROUP_RAID5;
else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
allowed = BTRFS_BLOCK_GROUP_RAID10;
else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_DUP)
+ allowed = BTRFS_BLOCK_GROUP_DUP;
else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
allowed = BTRFS_BLOCK_GROUP_RAID0;
@@ -1633,11 +1640,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ trace_btrfs_add_unused_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&bg->bg_list)) {
btrfs_get_block_group(bg);
- trace_btrfs_add_unused_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+ } else {
+ /* Pull out the block group from the reclaim_bgs list. */
+ list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
@@ -1791,8 +1801,15 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_unlock(&bg->lock);
- /* Get out fast, in case we're unmounting the filesystem */
- if (btrfs_fs_closing(fs_info)) {
+ /*
+ * Get out fast, in case we're read-only or unmounting the
+ * filesystem. It is OK to drop block groups from the list even
+ * for the read-only case. As we did sb_start_write(),
+ * "mount -o remount,ro" won't happen and read-only filesystem
+ * means it is forced read-only due to a fatal error. So, it
+ * never gets back to read-write to let us reclaim again.
+ */
+ if (btrfs_need_cleaner_sleep(fs_info)) {
up_write(&space_info->groups_sem);
goto next;
}
@@ -1823,11 +1840,27 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
next:
+ if (ret)
+ btrfs_mark_bg_to_reclaim(bg);
btrfs_put_block_group(bg);
+
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ /*
+ * Reclaiming all the block groups in the list can take really
+ * long. Prioritize cleaning up unused block groups.
+ */
+ btrfs_delete_unused_bgs(fs_info);
+ /*
+ * If we are interrupted by a balance, we can just bail out. The
+ * cleaner thread restart again if necessary.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
+ goto end;
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
+end:
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb);
}
@@ -3521,9 +3554,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
- set_extent_dirty(&trans->transaction->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- GFP_NOFS | __GFP_NOFAIL);
+ set_extent_bit(&trans->transaction->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ EXTENT_DIRTY, NULL);
}
spin_lock(&trans->transaction->dirty_bgs_lock);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index cc0e4b37db2d..f204addc3fe8 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -162,7 +162,14 @@ struct btrfs_block_group {
*/
struct list_head cluster_list;
- /* For delayed block group creation or deletion of empty block groups */
+ /*
+ * Used for several lists:
+ *
+ * 1) struct btrfs_fs_info::unused_bgs
+ * 2) struct btrfs_fs_info::reclaim_bgs
+ * 3) struct btrfs_transaction::deleted_bgs
+ * 4) struct btrfs_trans_handle::new_bgs
+ */
struct list_head bg_list;
/* For read-only block groups */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ac18c43fadad..6279d200cf83 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -541,3 +541,22 @@ try_reserve:
return ERR_PTR(ret);
}
+
+int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv)
+{
+ u64 needed_bytes;
+ int ret;
+
+ /* 1 for slack space, 1 for updating the inode */
+ needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
+ btrfs_calc_metadata_size(fs_info, 1);
+
+ spin_lock(&rsv->lock);
+ if (rsv->reserved < needed_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&rsv->lock);
+ return ret;
+}
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 6dc781709aca..b0bd12b8652f 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -82,6 +82,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
+int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv);
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u32 blocksize)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ec2ae4406c16..d47a927b3504 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -116,9 +116,6 @@ struct btrfs_inode {
unsigned long runtime_flags;
- /* Keep track of who's O_SYNC/fsyncing currently */
- atomic_t sync_writers;
-
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
@@ -335,7 +332,7 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
if (btrfs_is_free_space_inode(inode))
return;
trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
- mod);
+ mod, inode->outstanding_extents);
}
/*
@@ -407,30 +404,12 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
return true;
}
-/*
- * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
- * separate u32s. These two functions convert between the two representations.
- */
-static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
-{
- return (flags | ((u64)ro_flags << 32));
-}
-
-static inline void btrfs_inode_split_flags(u64 inode_item_flags,
- u32 *flags, u32 *ro_flags)
-{
- *flags = (u32)inode_item_flags;
- *ro_flags = (u32)(inode_item_flags >> 32);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
u32 pgoff, u8 *csum, const u8 * const csum_expected);
-int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
- struct btrfs_ordered_extent *ordered);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 82e49d985019..3caf339c4bb3 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1459,13 +1459,13 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_fs_info *fs_info = state->fs_info;
int ret;
u64 length;
- struct btrfs_io_context *multi = NULL;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_io_stripe smap, *map;
struct btrfs_device *device;
length = len;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- bytenr, &length, &multi, mirror_num);
-
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc,
+ NULL, &mirror_num, 0);
if (ret) {
block_ctx_out->start = 0;
block_ctx_out->dev_bytenr = 0;
@@ -1478,21 +1478,26 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
return ret;
}
- device = multi->stripes[0].dev;
+ if (bioc)
+ map = &bioc->stripes[0];
+ else
+ map = &smap;
+
+ device = map->dev;
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
!device->bdev || !device->name)
block_ctx_out->dev = NULL;
else
block_ctx_out->dev = btrfsic_dev_state_lookup(
device->bdev->bd_dev);
- block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+ block_ctx_out->dev_bytenr = map->physical;
block_ctx_out->start = bytenr;
block_ctx_out->len = len;
block_ctx_out->datav = NULL;
block_ctx_out->pagev = NULL;
block_ctx_out->mem_to_free = NULL;
- kfree(multi);
+ kfree(bioc);
if (NULL == block_ctx_out->dev) {
ret = -ENXIO;
pr_info("btrfsic: error, cannot lookup dev (#1)!\n");
@@ -1565,7 +1570,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = dev_bytenr >> 9;
+ bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT;
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 2d0493f0a184..8818ed5c390f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -37,7 +37,7 @@
#include "file-item.h"
#include "super.h"
-struct bio_set btrfs_compressed_bioset;
+static struct bio_set btrfs_compressed_bioset;
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -211,8 +211,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
for (i = 0; i < ret; i++) {
struct folio *folio = fbatch.folios[i];
- if (errno)
- folio_set_error(folio);
btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
cb->start, cb->len);
}
@@ -226,13 +224,8 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
struct compressed_bio *cb =
container_of(work, struct compressed_bio, write_end_work);
- /*
- * Ok, we're the last bio for this extent, step one is to call back
- * into the FS and do all the end_io operations.
- */
- btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL,
- cb->start, cb->start + cb->len - 1,
- cb->bbio.bio.bi_status == BLK_STS_OK);
+ btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
+ cb->bbio.bio.bi_status == BLK_STS_OK);
if (cb->writeback)
end_compressed_writeback(cb);
@@ -281,32 +274,31 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
-void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
- unsigned int len, u64 disk_start,
- unsigned int compressed_len,
- struct page **compressed_pages,
- unsigned int nr_pages,
- blk_opf_t write_flags,
- bool writeback)
+void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
+ struct page **compressed_pages,
+ unsigned int nr_pages,
+ blk_opf_t write_flags,
+ bool writeback)
{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct compressed_bio *cb;
- ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
- IS_ALIGNED(len, fs_info->sectorsize));
-
- write_flags |= REQ_BTRFS_ONE_ORDERED;
+ ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize));
- cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags,
+ cb = alloc_compressed_bio(inode, ordered->file_offset,
+ REQ_OP_WRITE | write_flags,
end_compressed_bio_write);
- cb->start = start;
- cb->len = len;
+ cb->start = ordered->file_offset;
+ cb->len = ordered->num_bytes;
cb->compressed_pages = compressed_pages;
- cb->compressed_len = compressed_len;
+ cb->compressed_len = ordered->disk_num_bytes;
cb->writeback = writeback;
INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
- cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
+ cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
+ cb->bbio.ordered = ordered;
btrfs_add_compressed_bio_pages(cb);
btrfs_submit_bio(&cb->bbio, 0);
@@ -421,7 +413,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
if (!em || cur < em->start ||
(cur + fs_info->sectorsize > extent_map_end(em)) ||
- (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) {
+ (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
@@ -472,7 +464,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
-void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num)
+void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -538,7 +530,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num)
if (memstall)
psi_memstall_leave(&pflags);
- btrfs_submit_bio(&cb->bbio, mirror_num);
+ btrfs_submit_bio(&cb->bbio, 0);
return;
out_free_compressed_pages:
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 19ab2abeddc0..03bb9d143fa7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -10,6 +10,7 @@
#include "bio.h"
struct btrfs_inode;
+struct btrfs_ordered_extent;
/*
* We want to make sure that amount of RAM required to uncompress an extent is
@@ -86,14 +87,12 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
-void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
- unsigned int len, u64 disk_start,
- unsigned int compressed_len,
+void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
struct page **compressed_pages,
unsigned int nr_pages,
blk_opf_t write_flags,
bool writeback);
-void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num);
+void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2ff2961b1183..a4cb4b642987 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -37,8 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
-static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
- int level, int slot);
static const struct btrfs_csums {
u16 size;
@@ -150,13 +148,19 @@ static inline void copy_leaf_items(const struct extent_buffer *dst,
nr_items * sizeof(struct btrfs_item));
}
+/* This exists for btrfs-progs usages. */
+u16 btrfs_csum_type_size(u16 type)
+{
+ return btrfs_csums[type].size;
+}
+
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
/*
* csum type is validated at mount time
*/
- return btrfs_csums[t].size;
+ return btrfs_csum_type_size(t);
}
const char *btrfs_super_csum_name(u16 csum_type)
@@ -417,9 +421,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
&refs, &flags);
if (ret)
return ret;
- if (refs == 0) {
- ret = -EROFS;
- btrfs_handle_fs_error(fs_info, ret, NULL);
+ if (unlikely(refs == 0)) {
+ btrfs_crit(fs_info,
+ "found 0 references for tree block at bytenr %llu level %d root %llu",
+ buf->start, btrfs_header_level(buf),
+ btrfs_root_id(root));
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
return ret;
}
} else {
@@ -464,10 +472,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return ret;
}
if (new_flags != 0) {
- int level = btrfs_header_level(buf);
-
- ret = btrfs_set_disk_extent_flags(trans, buf,
- new_flags, level);
+ ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
if (ret)
return ret;
}
@@ -583,9 +588,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
- atomic_inc(&cow->refs);
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ atomic_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
@@ -594,8 +604,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
- btrfs_tree_mod_log_insert_key(parent, parent_slot,
- BTRFS_MOD_LOG_KEY_REPLACE);
+ ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
+ BTRFS_MOD_LOG_KEY_REPLACE);
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
@@ -1028,8 +1044,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
child = btrfs_read_node_slot(mid, 0);
if (IS_ERR(child)) {
ret = PTR_ERR(child);
- btrfs_handle_fs_error(fs_info, ret, NULL);
- goto enospc;
+ goto out;
}
btrfs_tree_lock(child);
@@ -1038,11 +1053,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
- goto enospc;
+ goto out;
}
ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
@@ -1070,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (IS_ERR(left)) {
ret = PTR_ERR(left);
left = NULL;
- goto enospc;
+ goto out;
}
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
@@ -1079,7 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
- goto enospc;
+ goto out;
}
}
@@ -1088,7 +1108,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (IS_ERR(right)) {
ret = PTR_ERR(right);
right = NULL;
- goto enospc;
+ goto out;
}
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
@@ -1097,7 +1117,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
- goto enospc;
+ goto out;
}
}
@@ -1119,7 +1139,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
btrfs_clear_buffer_dirty(trans, right);
btrfs_tree_unlock(right);
- del_ptr(root, path, level + 1, pslot + 1);
+ ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
+ if (ret < 0) {
+ free_extent_buffer_stale(right);
+ right = NULL;
+ goto out;
+ }
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, btrfs_root_id(root), right,
0, 1);
@@ -1130,7 +1155,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -1145,15 +1173,19 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* otherwise we would have pulled some pointers from the
* right
*/
- if (!left) {
- ret = -EROFS;
- btrfs_handle_fs_error(fs_info, ret, NULL);
- goto enospc;
+ if (unlikely(!left)) {
+ btrfs_crit(fs_info,
+"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
+ parent->start, btrfs_header_level(parent),
+ mid->start, btrfs_root_id(root));
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
}
wret = balance_node_right(trans, mid, left);
if (wret < 0) {
ret = wret;
- goto enospc;
+ goto out;
}
if (wret == 1) {
wret = push_node_left(trans, left, mid, 1);
@@ -1165,7 +1197,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
btrfs_clear_buffer_dirty(trans, mid);
btrfs_tree_unlock(mid);
- del_ptr(root, path, level + 1, pslot);
+ ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
+ if (ret < 0) {
+ free_extent_buffer_stale(mid);
+ mid = NULL;
+ goto out;
+ }
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
@@ -1176,7 +1213,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
@@ -1202,7 +1242,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (orig_ptr !=
btrfs_node_blockptr(path->nodes[level], path->slots[level]))
BUG();
-enospc:
+out:
if (right) {
btrfs_tree_unlock(right);
free_extent_buffer(right);
@@ -1278,7 +1318,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -1333,7 +1378,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2379,6 +2429,87 @@ done:
}
/*
+ * Search the tree again to find a leaf with smaller keys.
+ * Returns 0 if it found something.
+ * Returns 1 if there are no smaller keys.
+ * Returns < 0 on error.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
+ */
+static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ struct btrfs_key orig_key;
+ struct btrfs_disk_key found_key;
+ int ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+ orig_key = key;
+
+ if (key.offset > 0) {
+ key.offset--;
+ } else if (key.type > 0) {
+ key.type--;
+ key.offset = (u64)-1;
+ } else if (key.objectid > 0) {
+ key.objectid--;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+ } else {
+ return 1;
+ }
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret <= 0)
+ return ret;
+
+ /*
+ * Previous key not found. Even if we were at slot 0 of the leaf we had
+ * before releasing the path and calling btrfs_search_slot(), we now may
+ * be in a slot pointing to the same original key - this can happen if
+ * after we released the path, one of more items were moved from a
+ * sibling leaf into the front of the leaf we had due to an insertion
+ * (see push_leaf_right()).
+ * If we hit this case and our slot is > 0 and just decrement the slot
+ * so that the caller does not process the same key again, which may or
+ * may not break the caller, depending on its logic.
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
+ ret = comp_keys(&found_key, &orig_key);
+ if (ret == 0) {
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ return 0;
+ }
+ /*
+ * At slot 0, same key as before, it means orig_key is
+ * the lowest, leftmost, key in the tree. We're done.
+ */
+ return 1;
+ }
+ }
+
+ btrfs_item_key(path->nodes[0], &found_key, 0);
+ ret = comp_keys(&found_key, &key);
+ /*
+ * We might have had an item with the previous key in the tree right
+ * before we released our path. And after we released our path, that
+ * item might have been pushed to the first slot (0) of the leaf we
+ * were holding due to a tree balance. Alternatively, an item with the
+ * previous key can exist as the only element of a leaf (big fat item).
+ * Therefore account for these 2 cases, so that our callers (like
+ * btrfs_previous_item) don't miss an existing item with a key matching
+ * the previous key we computed above.
+ */
+ if (ret <= 0)
+ return 0;
+ return 1;
+}
+
+/*
* helper to use instead of search slot if no exact match is needed but
* instead the next or previous item should be returned.
* When find_higher is true, the next higher item is returned, the next lower
@@ -2552,6 +2683,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+ btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
slot, btrfs_disk_key_objectid(&disk_key),
@@ -2559,13 +2691,13 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
btrfs_disk_key_offset(&disk_key),
new_key->objectid, new_key->type,
new_key->offset);
- btrfs_print_leaf(eb);
BUG();
}
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+ btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
slot, btrfs_disk_key_objectid(&disk_key),
@@ -2573,7 +2705,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
btrfs_disk_key_offset(&disk_key),
new_key->objectid, new_key->type,
new_key->offset);
- btrfs_print_leaf(eb);
BUG();
}
}
@@ -2626,7 +2757,7 @@ static bool check_sibling_keys(struct extent_buffer *left,
btrfs_item_key_to_cpu(right, &right_first, 0);
}
- if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
btrfs_crit(left->fs_info, "left extent buffer:");
btrfs_print_tree(left, false);
btrfs_crit(left->fs_info, "right extent buffer:");
@@ -2703,8 +2834,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
if (push_items < src_nritems) {
/*
- * Don't call btrfs_tree_mod_log_insert_move() here, key removal
- * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
+ * btrfs_tree_mod_log_eb_copy handles logging the move, so we
+ * don't need to do an explicit tree mod log operation for it.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
btrfs_node_key_ptr_offset(src, push_items),
@@ -2765,8 +2896,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
- ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
- BUG_ON(ret < 0);
+
+ /*
+ * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
+ * need to do an explicit tree mod log operation for it.
+ */
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
btrfs_node_key_ptr_offset(dst, 0),
(dst_nritems) *
@@ -2840,7 +2974,12 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
old = root->node;
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
+ btrfs_tree_unlock(c);
+ free_extent_buffer(c);
+ return ret;
+ }
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -2861,10 +3000,10 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
* slot and level indicate where you want the key to go, and
* blocknr is the block the key points to.
*/
-static void insert_ptr(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_disk_key *key, u64 bytenr,
- int slot, int level)
+static int insert_ptr(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_disk_key *key, u64 bytenr,
+ int slot, int level)
{
struct extent_buffer *lower;
int nritems;
@@ -2880,7 +3019,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
slot, nritems - slot);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(lower, slot + 1),
@@ -2890,7 +3032,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_key(lower, slot,
BTRFS_MOD_LOG_KEY_ADD);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
btrfs_set_node_key(lower, key, slot);
btrfs_set_node_blockptr(lower, slot, bytenr);
@@ -2898,6 +3043,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
btrfs_mark_buffer_dirty(lower);
+
+ return 0;
}
/*
@@ -2962,6 +3109,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2975,8 +3124,13 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
- insert_ptr(trans, path, &disk_key, split->start,
- path->slots[level + 1] + 1, level + 1);
+ ret = insert_ptr(trans, path, &disk_key, split->start,
+ path->slots[level + 1] + 1, level + 1);
+ if (ret < 0) {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
+ return ret;
+ }
if (path->slots[level] >= mid) {
path->slots[level] -= mid;
@@ -2996,7 +3150,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
* and nr indicate which items in the leaf to check. This totals up the
* space used both by the item structs and the item data
*/
-static int leaf_space_used(struct extent_buffer *l, int start, int nr)
+static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
{
int data_len;
int nritems = btrfs_header_nritems(l);
@@ -3016,7 +3170,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
* the start of the leaf data. IOW, how much room
* the leaf has left for both items and data
*/
-noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
+int btrfs_leaf_free_space(const struct extent_buffer *leaf)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
int nritems = btrfs_header_nritems(leaf);
@@ -3453,16 +3607,17 @@ out:
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
*/
-static noinline void copy_for_split(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct extent_buffer *l,
- struct extent_buffer *right,
- int slot, int mid, int nritems)
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int data_copy_size;
int rt_data_off;
int i;
+ int ret;
struct btrfs_disk_key disk_key;
struct btrfs_map_token token;
@@ -3487,7 +3642,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(l, mid);
btrfs_item_key(right, &disk_key, 0);
- insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
+ ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
+ if (ret < 0)
+ return ret;
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
@@ -3505,6 +3662,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
}
BUG_ON(path->slots[0] < 0);
+
+ return 0;
}
/*
@@ -3703,8 +3862,13 @@ again:
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
- insert_ptr(trans, path, &disk_key,
- right->start, path->slots[1] + 1, 1);
+ ret = insert_ptr(trans, path, &disk_key,
+ right->start, path->slots[1] + 1, 1);
+ if (ret < 0) {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3712,8 +3876,13 @@ again:
path->slots[1] += 1;
} else {
btrfs_set_header_nritems(right, 0);
- insert_ptr(trans, path, &disk_key,
- right->start, path->slots[1], 1);
+ ret = insert_ptr(trans, path, &disk_key,
+ right->start, path->slots[1], 1);
+ if (ret < 0) {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3729,7 +3898,12 @@ again:
return ret;
}
- copy_for_split(trans, path, l, right, slot, mid, nritems);
+ ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
+ if (ret < 0) {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (split == 2) {
BUG_ON(num_doubles != 0);
@@ -3826,7 +4000,12 @@ static noinline int split_item(struct btrfs_path *path,
struct btrfs_disk_key disk_key;
leaf = path->nodes[0];
- BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
+ /*
+ * Shouldn't happen because the caller must have previously called
+ * setup_leaf_for_split() to make room for the new item in the leaf.
+ */
+ if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
+ return -ENOSPC;
orig_slot = path->slots[0];
orig_offset = btrfs_item_offset(leaf, path->slots[0]);
@@ -4273,9 +4452,11 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
*
* the tree should have been previously balanced so the deletion does not
* empty a node.
+ *
+ * This is exported for use inside btrfs-progs, don't un-export it.
*/
-static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
- int level, int slot)
+int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
@@ -4286,7 +4467,10 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
if (level) {
ret = btrfs_tree_mod_log_insert_move(parent, slot,
slot + 1, nritems - slot - 1);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(parent, slot),
@@ -4296,7 +4480,10 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
} else if (level) {
ret = btrfs_tree_mod_log_insert_key(parent, slot,
BTRFS_MOD_LOG_KEY_REMOVE);
- BUG_ON(ret < 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
nritems--;
@@ -4312,6 +4499,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
fixup_low_keys(path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
+ return 0;
}
/*
@@ -4324,13 +4512,17 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
* The path must have already been setup for deleting the leaf, including
* all the proper balancing. path->nodes[1] must be locked.
*/
-static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *leaf)
+static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf)
{
+ int ret;
+
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- del_ptr(root, path, 1, path->slots[1]);
+ ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
+ if (ret < 0)
+ return ret;
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -4343,6 +4535,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
+ return 0;
}
/*
* delete the item at the leaf level in path. If that empties
@@ -4392,7 +4585,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_level(leaf, 0);
} else {
btrfs_clear_buffer_dirty(trans, leaf);
- btrfs_del_leaf(trans, root, path, leaf);
+ ret = btrfs_del_leaf(trans, root, path, leaf);
+ if (ret < 0)
+ return ret;
}
} else {
int used = leaf_space_used(leaf, 0, nritems);
@@ -4416,7 +4611,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
- * for possible call to del_ptr below
+ * for possible call to btrfs_del_ptr below
*/
slot = path->slots[1];
atomic_inc(&leaf->refs);
@@ -4453,7 +4648,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (btrfs_header_nritems(leaf) == 0) {
path->slots[1] = slot;
- btrfs_del_leaf(trans, root, path, leaf);
+ ret = btrfs_del_leaf(trans, root, path, leaf);
+ if (ret < 0)
+ return ret;
free_extent_buffer(leaf);
ret = 0;
} else {
@@ -4474,86 +4671,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
- * search the tree again to find a leaf with lesser keys
- * returns 0 if it found something or 1 if there are no lesser leaves.
- * returns < 0 on io errors.
- *
- * This may release the path, and so you may lose any locks held at the
- * time you call it.
- */
-int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
-{
- struct btrfs_key key;
- struct btrfs_key orig_key;
- struct btrfs_disk_key found_key;
- int ret;
-
- btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
- orig_key = key;
-
- if (key.offset > 0) {
- key.offset--;
- } else if (key.type > 0) {
- key.type--;
- key.offset = (u64)-1;
- } else if (key.objectid > 0) {
- key.objectid--;
- key.type = (u8)-1;
- key.offset = (u64)-1;
- } else {
- return 1;
- }
-
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret <= 0)
- return ret;
-
- /*
- * Previous key not found. Even if we were at slot 0 of the leaf we had
- * before releasing the path and calling btrfs_search_slot(), we now may
- * be in a slot pointing to the same original key - this can happen if
- * after we released the path, one of more items were moved from a
- * sibling leaf into the front of the leaf we had due to an insertion
- * (see push_leaf_right()).
- * If we hit this case and our slot is > 0 and just decrement the slot
- * so that the caller does not process the same key again, which may or
- * may not break the caller, depending on its logic.
- */
- if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
- btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
- ret = comp_keys(&found_key, &orig_key);
- if (ret == 0) {
- if (path->slots[0] > 0) {
- path->slots[0]--;
- return 0;
- }
- /*
- * At slot 0, same key as before, it means orig_key is
- * the lowest, leftmost, key in the tree. We're done.
- */
- return 1;
- }
- }
-
- btrfs_item_key(path->nodes[0], &found_key, 0);
- ret = comp_keys(&found_key, &key);
- /*
- * We might have had an item with the previous key in the tree right
- * before we released our path. And after we released our path, that
- * item might have been pushed to the first slot (0) of the leaf we
- * were holding due to a tree balance. Alternatively, an item with the
- * previous key can exist as the only element of a leaf (big fat item).
- * Therefore account for these 2 cases, so that our callers (like
- * btrfs_previous_item) don't miss an existing item with a key matching
- * the previous key we computed above.
- */
- if (ret <= 0)
- return 0;
- return 1;
-}
-
-/*
* A helper function to walk down the tree starting at min_key, and looking
* for nodes or leaves that are have a minimum transaction id.
* This is used by the btree defrag code, and tree logging
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4c1986cd5bed..f2d2b313bde5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -541,6 +541,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid);
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf);
+int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_path *path, u32 data_size);
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
@@ -633,7 +635,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
return btrfs_insert_empty_items(trans, root, path, &batch);
}
-int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
@@ -686,7 +687,7 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
return btrfs_next_old_item(root, p, 0);
}
-int btrfs_leaf_free_space(struct extent_buffer *leaf);
+int btrfs_leaf_free_space(const struct extent_buffer *leaf);
static inline int is_fstree(u64 rootid)
{
@@ -702,6 +703,7 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
}
+u16 btrfs_csum_type_size(u16 type);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 8065341d831a..f2ff4cbe8656 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -1040,7 +1040,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
clear_extent_bit(&inode->io_tree, start, start + len - 1,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, cached_state);
- set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
+ set_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
/* Update the page status */
for (i = start_index - first_index; i <= last_index - first_index; i++) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 0b32432d7d56..6a13cf00218b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -407,7 +407,6 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
- ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
}
@@ -507,6 +506,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
{
struct btrfs_delayed_ref_head *head;
+ lockdep_assert_held(&delayed_refs->lock);
again:
head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
true);
@@ -531,7 +531,7 @@ again:
href_node);
}
- head->processing = 1;
+ head->processing = true;
WARN_ON(delayed_refs->num_heads_ready == 0);
delayed_refs->num_heads_ready--;
delayed_refs->run_delayed_start = head->bytenr +
@@ -549,31 +549,35 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
RB_CLEAR_NODE(&head->href_node);
atomic_dec(&delayed_refs->num_entries);
delayed_refs->num_heads--;
- if (head->processing == 0)
+ if (!head->processing)
delayed_refs->num_heads_ready--;
}
/*
* Helper to insert the ref_node to the tail or merge with tail.
*
- * Return 0 for insert.
- * Return >0 for merge.
+ * Return false if the ref was inserted.
+ * Return true if the ref was merged into an existing one (and therefore can be
+ * freed by the caller).
*/
-static int insert_delayed_ref(struct btrfs_delayed_ref_root *root,
- struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *ref)
+static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *ref)
{
struct btrfs_delayed_ref_node *exist;
int mod;
- int ret = 0;
spin_lock(&href->lock);
exist = tree_insert(&href->ref_tree, ref);
- if (!exist)
- goto inserted;
+ if (!exist) {
+ if (ref->action == BTRFS_ADD_DELAYED_REF)
+ list_add_tail(&ref->add_list, &href->ref_add_list);
+ atomic_inc(&root->num_entries);
+ spin_unlock(&href->lock);
+ return false;
+ }
/* Now we are sure we can merge */
- ret = 1;
if (exist->action == ref->action) {
mod = ref->ref_mod;
} else {
@@ -600,13 +604,7 @@ static int insert_delayed_ref(struct btrfs_delayed_ref_root *root,
if (exist->ref_mod == 0)
drop_delayed_ref(root, href, exist);
spin_unlock(&href->lock);
- return ret;
-inserted:
- if (ref->action == BTRFS_ADD_DELAYED_REF)
- list_add_tail(&ref->add_list, &href->ref_add_list);
- atomic_inc(&root->num_entries);
- spin_unlock(&href->lock);
- return ret;
+ return true;
}
/*
@@ -699,34 +697,38 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
bool is_system)
{
int count_mod = 1;
- int must_insert_reserved = 0;
+ bool must_insert_reserved = false;
/* If reserved is provided, it must be a data extent. */
BUG_ON(!is_data && reserved);
- /*
- * The head node stores the sum of all the mods, so dropping a ref
- * should drop the sum in the head node by one.
- */
- if (action == BTRFS_UPDATE_DELAYED_HEAD)
+ switch (action) {
+ case BTRFS_UPDATE_DELAYED_HEAD:
count_mod = 0;
- else if (action == BTRFS_DROP_DELAYED_REF)
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ /*
+ * The head node stores the sum of all the mods, so dropping a ref
+ * should drop the sum in the head node by one.
+ */
count_mod = -1;
-
- /*
- * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
- * accounting when the extent is finally added, or if a later
- * modification deletes the delayed ref without ever inserting the
- * extent into the extent allocation tree. ref->must_insert_reserved
- * is the flag used to record that accounting mods are required.
- *
- * Once we record must_insert_reserved, switch the action to
- * BTRFS_ADD_DELAYED_REF because other special casing is not required.
- */
- if (action == BTRFS_ADD_DELAYED_EXTENT)
- must_insert_reserved = 1;
- else
- must_insert_reserved = 0;
+ break;
+ case BTRFS_ADD_DELAYED_EXTENT:
+ /*
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update the
+ * reserved accounting when the extent is finally added, or if a
+ * later modification deletes the delayed ref without ever
+ * inserting the extent into the extent allocation tree.
+ * ref->must_insert_reserved is the flag used to record that
+ * accounting mods are required.
+ *
+ * Once we record must_insert_reserved, switch the action to
+ * BTRFS_ADD_DELAYED_REF because other special casing is not
+ * required.
+ */
+ must_insert_reserved = true;
+ break;
+ }
refcount_set(&head_ref->refs, 1);
head_ref->bytenr = bytenr;
@@ -738,7 +740,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
- head_ref->processing = 0;
+ head_ref->processing = false;
head_ref->total_ref_mod = count_mod;
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
@@ -763,11 +765,11 @@ static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
- int action, int *qrecord_inserted_ret)
+ int action, bool *qrecord_inserted_ret)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
- int qrecord_inserted = 0;
+ bool qrecord_inserted = false;
delayed_refs = &trans->transaction->delayed_refs;
@@ -777,7 +779,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
delayed_refs, qrecord))
kfree(qrecord);
else
- qrecord_inserted = 1;
+ qrecord_inserted = true;
}
trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
@@ -853,8 +855,6 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
ref->num_bytes = num_bytes;
ref->ref_mod = 1;
ref->action = action;
- ref->is_head = 0;
- ref->in_tree = 1;
ref->seq = seq;
ref->type = ref_type;
RB_CLEAR_NODE(&ref->ref_node);
@@ -875,11 +875,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
- int qrecord_inserted;
+ bool qrecord_inserted;
bool is_system;
+ bool merged;
int action = generic_ref->action;
int level = generic_ref->tree_ref.level;
- int ret;
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
@@ -935,7 +935,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -947,7 +947,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
action == BTRFS_ADD_DELAYED_EXTENT ?
BTRFS_ADD_DELAYED_REF : action);
- if (ret > 0)
+ if (merged)
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
if (qrecord_inserted)
@@ -968,9 +968,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
- int qrecord_inserted;
+ bool qrecord_inserted;
int action = generic_ref->action;
- int ret;
+ bool merged;
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
@@ -1027,7 +1027,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1039,7 +1039,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
action == BTRFS_ADD_DELAYED_EXTENT ?
BTRFS_ADD_DELAYED_REF : action);
- if (ret > 0)
+ if (merged)
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index b54261fe509b..b8e14b0ba5f1 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -48,9 +48,6 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
- /* is this node still in the rbtree? */
- unsigned int is_head:1;
- unsigned int in_tree:1;
};
struct btrfs_delayed_extent_op {
@@ -70,20 +67,26 @@ struct btrfs_delayed_extent_op {
struct btrfs_delayed_ref_head {
u64 bytenr;
u64 num_bytes;
- refcount_t refs;
+ /*
+ * For insertion into struct btrfs_delayed_ref_root::href_root.
+ * Keep it in the same cache line as 'bytenr' for more efficient
+ * searches in the rbtree.
+ */
+ struct rb_node href_node;
/*
* the mutex is held while running the refs, and it is also
* held when checking the sum of reference modifications.
*/
struct mutex mutex;
+ refcount_t refs;
+
+ /* Protects 'ref_tree' and 'ref_add_list'. */
spinlock_t lock;
struct rb_root_cached ref_tree;
/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
struct list_head ref_add_list;
- struct rb_node href_node;
-
struct btrfs_delayed_extent_op *extent_op;
/*
@@ -113,10 +116,10 @@ struct btrfs_delayed_ref_head {
* we need to update the in ram accounting to properly reflect
* the free has happened.
*/
- unsigned int must_insert_reserved:1;
- unsigned int is_data:1;
- unsigned int is_system:1;
- unsigned int processing:1;
+ bool must_insert_reserved;
+ bool is_data;
+ bool is_system;
+ bool processing;
};
struct btrfs_delayed_tree_ref {
@@ -337,7 +340,7 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
WARN_ON(refcount_read(&ref->refs) == 0);
if (refcount_dec_and_test(&ref->refs)) {
- WARN_ON(ref->in_tree);
+ WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
switch (ref->type) {
case BTRFS_TREE_BLOCK_REF_KEY:
case BTRFS_SHARED_BLOCK_REF_KEY:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 78696d331639..5e86bea0a950 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -41,7 +41,7 @@
* All new writes will be written to both target and source devices, so even
* if replace gets canceled, sources device still contains up-to-date data.
*
- * Location: handle_ops_on_dev_replace() from __btrfs_map_block()
+ * Location: handle_ops_on_dev_replace() from btrfs_map_block()
* Start: btrfs_dev_replace_start()
* End: btrfs_dev_replace_finishing()
* Content: Latest data/metadata
@@ -795,8 +795,8 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
while (!find_first_extent_bit(&srcdev->alloc_state, start,
&found_start, &found_end,
CHUNK_ALLOCATED, &cached_state)) {
- ret = set_extent_bits(&tgtdev->alloc_state, found_start,
- found_end, CHUNK_ALLOCATED);
+ ret = set_extent_bit(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED, NULL);
if (ret)
break;
start = found_end + 1;
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index a6d77fe41e1a..944a7340f6a4 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -73,6 +73,23 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
return &discard_ctl->discard_list[block_group->discard_index];
}
+/*
+ * Determine if async discard should be running.
+ *
+ * @discard_ctl: discard control
+ *
+ * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
+ */
+static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
+{
+ struct btrfs_fs_info *fs_info = container_of(discard_ctl,
+ struct btrfs_fs_info,
+ discard_ctl);
+
+ return (!(fs_info->sb->s_flags & SB_RDONLY) &&
+ test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
+}
+
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
@@ -545,23 +562,6 @@ static void btrfs_discard_workfn(struct work_struct *work)
}
/*
- * Determine if async discard should be running.
- *
- * @discard_ctl: discard control
- *
- * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
- */
-bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
-{
- struct btrfs_fs_info *fs_info = container_of(discard_ctl,
- struct btrfs_fs_info,
- discard_ctl);
-
- return (!(fs_info->sb->s_flags & SB_RDONLY) &&
- test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
-}
-
-/*
* Recalculate the base delay.
*
* @discard_ctl: discard control
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index 57b9202f427f..dddb0f9101ba 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -24,7 +24,6 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group);
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
bool override);
-bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl);
/* Update operations */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dabc79c1af1b..7513388b0567 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -60,15 +60,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
-static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info);
-static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
-static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *dirty_pages,
- int mark);
-static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
@@ -110,35 +101,27 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
* detect blocks that either didn't get written at all or got written
* in the wrong place.
*/
-static int verify_parent_transid(struct extent_io_tree *io_tree,
- struct extent_buffer *eb, u64 parent_transid,
- int atomic)
+int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
{
- struct extent_state *cached_state = NULL;
- int ret;
+ if (!extent_buffer_uptodate(eb))
+ return 0;
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
- return 0;
+ return 1;
if (atomic)
return -EAGAIN;
- lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
- if (extent_buffer_uptodate(eb) &&
- btrfs_header_generation(eb) == parent_transid) {
- ret = 0;
- goto out;
- }
- btrfs_err_rl(eb->fs_info,
+ if (!extent_buffer_uptodate(eb) ||
+ btrfs_header_generation(eb) != parent_transid) {
+ btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
eb->start, eb->read_mirror,
parent_transid, btrfs_header_generation(eb));
- ret = 1;
- clear_extent_buffer_uptodate(eb);
-out:
- unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
- return ret;
+ clear_extent_buffer_uptodate(eb);
+ return 0;
+ }
+ return 1;
}
static bool btrfs_supported_super_csum(u16 csum_type)
@@ -180,64 +163,6 @@ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_verify_level_key(struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int found_level;
- struct btrfs_key found_key;
- int ret;
-
- found_level = btrfs_header_level(eb);
- if (found_level != level) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: tree level check failed\n");
- btrfs_err(fs_info,
-"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
- eb->start, level, found_level);
- return -EIO;
- }
-
- if (!first_key)
- return 0;
-
- /*
- * For live tree block (new tree blocks in current transaction),
- * we need proper lock context to avoid race, which is impossible here.
- * So we only checks tree blocks which is read from disk, whose
- * generation <= fs_info->last_trans_committed.
- */
- if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
- return 0;
-
- /* We have @first_key, so this @eb must have at least one item */
- if (btrfs_header_nritems(eb) == 0) {
- btrfs_err(fs_info,
- "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return -EUCLEAN;
- }
-
- if (found_level)
- btrfs_node_key_to_cpu(eb, &found_key, 0);
- else
- btrfs_item_key_to_cpu(eb, &found_key, 0);
- ret = btrfs_comp_cpu_keys(first_key, &found_key);
-
- if (ret) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: tree first key check failed\n");
- btrfs_err(fs_info,
-"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
- eb->start, parent_transid, first_key->objectid,
- first_key->type, first_key->offset,
- found_key.objectid, found_key.type,
- found_key.offset);
- }
- return ret;
-}
-
static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
@@ -312,12 +237,34 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
return ret;
}
-static int csum_one_extent_buffer(struct extent_buffer *eb)
+/*
+ * Checksum a dirty tree block before IO.
+ */
+blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
{
+ struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
+ u64 found_start = btrfs_header_bytenr(eb);
u8 result[BTRFS_CSUM_SIZE];
int ret;
+ /* Btree blocks are always contiguous on disk. */
+ if (WARN_ON_ONCE(bbio->file_offset != eb->start))
+ return BLK_STS_IOERR;
+ if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
+ return BLK_STS_IOERR;
+
+ if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+ WARN_ON_ONCE(found_start != 0);
+ return BLK_STS_OK;
+ }
+
+ if (WARN_ON_ONCE(found_start != eb->start))
+ return BLK_STS_IOERR;
+ if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
+ eb->len)))
+ return BLK_STS_IOERR;
+
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE) == 0);
@@ -326,7 +273,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb)
if (btrfs_header_level(eb))
ret = btrfs_check_node(eb);
else
- ret = btrfs_check_leaf_full(eb);
+ ret = btrfs_check_leaf(eb);
if (ret < 0)
goto error;
@@ -344,8 +291,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb)
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
-
- return 0;
+ return BLK_STS_OK;
error:
btrfs_print_tree(eb, 0);
@@ -359,103 +305,10 @@ error:
*/
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
- return ret;
-}
-
-/* Checksum all dirty extent buffers in one bio_vec */
-static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
- struct bio_vec *bvec)
-{
- struct page *page = bvec->bv_page;
- u64 bvec_start = page_offset(page) + bvec->bv_offset;
- u64 cur;
- int ret = 0;
-
- for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
- cur += fs_info->nodesize) {
- struct extent_buffer *eb;
- bool uptodate;
-
- eb = find_extent_buffer(fs_info, cur);
- uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
- fs_info->nodesize);
-
- /* A dirty eb shouldn't disappear from buffer_radix */
- if (WARN_ON(!eb))
- return -EUCLEAN;
-
- if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
- free_extent_buffer(eb);
- return -EUCLEAN;
- }
- if (WARN_ON(!uptodate)) {
- free_extent_buffer(eb);
- return -EUCLEAN;
- }
-
- ret = csum_one_extent_buffer(eb);
- free_extent_buffer(eb);
- if (ret < 0)
- return ret;
- }
- return ret;
-}
-
-/*
- * Checksum a dirty tree block before IO. This has extra checks to make sure
- * we only fill in the checksum field in the first page of a multi-page block.
- * For subpage extent buffers we need bvec to also read the offset in the page.
- */
-static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
-{
- struct page *page = bvec->bv_page;
- u64 start = page_offset(page);
- u64 found_start;
- struct extent_buffer *eb;
-
- if (fs_info->nodesize < PAGE_SIZE)
- return csum_dirty_subpage_buffers(fs_info, bvec);
-
- eb = (struct extent_buffer *)page->private;
- if (page != eb->pages[0])
- return 0;
-
- found_start = btrfs_header_bytenr(eb);
-
- if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
- WARN_ON(found_start != 0);
- return 0;
- }
-
- /*
- * Please do not consolidate these warnings into a single if.
- * It is useful to know what went wrong.
- */
- if (WARN_ON(found_start != start))
- return -EUCLEAN;
- if (WARN_ON(!PageUptodate(page)))
- return -EUCLEAN;
-
- return csum_one_extent_buffer(eb);
-}
-
-blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
-{
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
- struct bvec_iter iter;
- struct bio_vec bv;
- int ret = 0;
-
- bio_for_each_segment(bv, &bbio->bio, iter) {
- ret = csum_dirty_buffer(fs_info, &bv);
- if (ret)
- break;
- }
-
return errno_to_blk_status(ret);
}
-static int check_tree_block_fsid(struct extent_buffer *eb)
+static bool check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
@@ -475,18 +328,18 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
metadata_uuid = fs_devices->fsid;
if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
- return 0;
+ return false;
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
- return 0;
+ return false;
- return 1;
+ return true;
}
/* Do basic extent buffer checks at read time */
-static int validate_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check)
+int btrfs_validate_extent_buffer(struct extent_buffer *eb,
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
@@ -583,7 +436,7 @@ static int validate_extent_buffer(struct extent_buffer *eb,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && btrfs_check_leaf_full(eb)) {
+ if (found_level == 0 && btrfs_check_leaf(eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
@@ -591,9 +444,7 @@ static int validate_extent_buffer(struct extent_buffer *eb,
if (found_level > 0 && btrfs_check_node(eb))
ret = -EIO;
- if (!ret)
- set_extent_buffer_uptodate(eb);
- else
+ if (ret)
btrfs_err(fs_info,
"read time tree block corruption detected on logical %llu mirror %u",
eb->start, eb->read_mirror);
@@ -601,105 +452,6 @@ out:
return ret;
}
-static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
- int mirror, struct btrfs_tree_parent_check *check)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
- struct extent_buffer *eb;
- bool reads_done;
- int ret = 0;
-
- ASSERT(check);
-
- /*
- * We don't allow bio merge for subpage metadata read, so we should
- * only get one eb for each endio hook.
- */
- ASSERT(end == start + fs_info->nodesize - 1);
- ASSERT(PagePrivate(page));
-
- eb = find_extent_buffer(fs_info, start);
- /*
- * When we are reading one tree block, eb must have been inserted into
- * the radix tree. If not, something is wrong.
- */
- ASSERT(eb);
-
- reads_done = atomic_dec_and_test(&eb->io_pages);
- /* Subpage read must finish in page read */
- ASSERT(reads_done);
-
- eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
- ret = -EIO;
- goto err;
- }
- ret = validate_extent_buffer(eb, check);
- if (ret < 0)
- goto err;
-
- set_extent_buffer_uptodate(eb);
-
- free_extent_buffer(eb);
- return ret;
-err:
- /*
- * end_bio_extent_readpage decrements io_pages in case of error,
- * make sure it has something to decrement.
- */
- atomic_inc(&eb->io_pages);
- clear_extent_buffer_uptodate(eb);
- free_extent_buffer(eb);
- return ret;
-}
-
-int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
- struct page *page, u64 start, u64 end,
- int mirror)
-{
- struct extent_buffer *eb;
- int ret = 0;
- int reads_done;
-
- ASSERT(page->private);
-
- if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return validate_subpage_buffer(page, start, end, mirror,
- &bbio->parent_check);
-
- eb = (struct extent_buffer *)page->private;
-
- /*
- * The pending IO might have been the only thing that kept this buffer
- * in memory. Make sure we have a ref for all this other checks
- */
- atomic_inc(&eb->refs);
-
- reads_done = atomic_dec_and_test(&eb->io_pages);
- if (!reads_done)
- goto err;
-
- eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
- ret = -EIO;
- goto err;
- }
- ret = validate_extent_buffer(eb, &bbio->parent_check);
-err:
- if (ret) {
- /*
- * our io error hook is going to dec the io pages
- * again, we have to make sure it has something
- * to decrement
- */
- atomic_inc(&eb->io_pages);
- clear_extent_buffer_uptodate(eb);
- }
- free_extent_buffer(eb);
-
- return ret;
-}
-
#ifdef CONFIG_MIGRATION
static int btree_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
@@ -1396,8 +1148,7 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)root_id);
- if (root)
- root = btrfs_grab_root(root);
+ root = btrfs_grab_root(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
@@ -1411,31 +1162,28 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
.offset = 0,
};
- if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+ switch (objectid) {
+ case BTRFS_ROOT_TREE_OBJECTID:
return btrfs_grab_root(fs_info->tree_root);
- if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ case BTRFS_EXTENT_TREE_OBJECTID:
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
- if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ case BTRFS_CHUNK_TREE_OBJECTID:
return btrfs_grab_root(fs_info->chunk_root);
- if (objectid == BTRFS_DEV_TREE_OBJECTID)
+ case BTRFS_DEV_TREE_OBJECTID:
return btrfs_grab_root(fs_info->dev_root);
- if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ case BTRFS_CSUM_TREE_OBJECTID:
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ case BTRFS_QUOTA_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->quota_root);
+ case BTRFS_UUID_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->uuid_root);
+ case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->block_group_root);
+ case BTRFS_FREE_SPACE_TREE_OBJECTID:
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
- if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->quota_root) ?
- fs_info->quota_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_UUID_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->uuid_root) ?
- fs_info->uuid_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->block_group_root) ?
- fs_info->block_group_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
- struct btrfs_root *root = btrfs_global_root(fs_info, &key);
-
- return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
- }
- return NULL;
+ default:
+ return NULL;
+ }
}
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
@@ -1991,7 +1739,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
- btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
if (fs_info->endio_workers)
destroy_workqueue(fs_info->endio_workers);
@@ -2183,12 +1930,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+ unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
fs_info->workers =
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
- fs_info->hipri_workers =
- btrfs_alloc_workqueue(fs_info, "worker-high",
- flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
btrfs_alloc_workqueue(fs_info, "delalloc",
@@ -2202,7 +1947,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
fs_info->fixup_workers =
- btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
+ btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
fs_info->endio_workers =
alloc_workqueue("btrfs-endio", flags, max_active);
@@ -2221,11 +1966,12 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
max_active, 0);
fs_info->qgroup_rescan_workers =
- btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
+ btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
+ ordered_flags);
fs_info->discard_ctl.discard_workers =
- alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
+ alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);
- if (!(fs_info->workers && fs_info->hipri_workers &&
+ if (!(fs_info->workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->compressed_write_workers &&
@@ -2265,6 +2011,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
break;
+ case BTRFS_CSUM_TYPE_XXHASH:
+ set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
+ break;
default:
break;
}
@@ -2642,6 +2391,14 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
+ BTRFS_FSID_SIZE) != 0) {
+ btrfs_err(fs_info,
+ "dev_item UUID does not match metadata fsid: %pU != %pU",
+ fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
/*
* Artificial requirement for block-group-tree to force newer features
* (free-space-tree, no-holes) so the test matrix is smaller.
@@ -2654,14 +2411,6 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
- BTRFS_FSID_SIZE) != 0) {
- btrfs_err(fs_info,
- "dev_item UUID does not match metadata fsid: %pU != %pU",
- fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
- ret = -EINVAL;
- }
-
/*
* Hint to catch really bogus numbers, bitflips or so, more exact checks are
* done later
@@ -4662,28 +4411,10 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_close_devices(fs_info->fs_devices);
}
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
- int atomic)
-{
- int ret;
- struct inode *btree_inode = buf->pages[0]->mapping->host;
-
- ret = extent_buffer_uptodate(buf);
- if (!ret)
- return ret;
-
- ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
- parent_transid, atomic);
- if (ret == -EAGAIN)
- return ret;
- return !ret;
-}
-
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
- int was_dirty;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
@@ -4698,19 +4429,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
buf->start, transid, fs_info->generation);
- was_dirty = set_extent_buffer_dirty(buf);
- if (!was_dirty)
- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
- buf->len,
- fs_info->dirty_metadata_batch);
+ set_extent_buffer_dirty(buf);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
/*
- * Since btrfs_mark_buffer_dirty() can be called with item pointer set
- * but item data not updated.
- * So here we should only check item pointers, not item data.
+ * btrfs_check_leaf() won't check item data if we don't have WRITTEN
+ * set, so this will only validate the basic structure of the items.
*/
- if (btrfs_header_level(buf) == 0 &&
- btrfs_check_leaf_relaxed(buf)) {
+ if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
@@ -4840,13 +4565,12 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
-static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info)
+static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
- int ret = 0;
delayed_refs = &trans->delayed_refs;
@@ -4854,7 +4578,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (atomic_read(&delayed_refs->num_entries) == 0) {
spin_unlock(&delayed_refs->lock);
btrfs_debug(fs_info, "delayed_refs has NO entry");
- return ret;
+ return;
}
while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
@@ -4871,7 +4595,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
- ref->in_tree = 0;
rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
@@ -4916,8 +4639,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
-
- return ret;
}
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
@@ -5142,8 +4863,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
- btrfs_free_redirty_list(cur_trans);
-
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4d5772330110..b03767f4d7ed 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -31,8 +31,6 @@ struct btrfs_tree_parent_check;
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
-int btrfs_verify_level_key(struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
struct btrfs_tree_parent_check *check);
struct extent_buffer *btrfs_find_create_tree_block(
@@ -84,9 +82,8 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
- struct page *page, u64 start, u64 end,
- int mirror);
+int btrfs_validate_extent_buffer(struct extent_buffer *eb,
+ struct btrfs_tree_parent_check *check);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 29a225836e28..a2315a4b8b75 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -533,6 +533,16 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
}
/*
+ * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly,
+ * unset the EXTENT_NOWAIT bit.
+ */
+static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
+{
+ *mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS);
+ *bits &= EXTENT_NOWAIT - 1;
+}
+
+/*
* Clear some bits on a range in the tree. This may require splitting or
* inserting elements in the tree, so the gfp mask is used to indicate which
* allocations or sleeping are allowed.
@@ -546,7 +556,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
*/
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -556,7 +566,9 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int clear = 0;
int wake;
int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+ gfp_t mask;
+ set_gfp_mask_from_bits(&bits, &mask);
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
@@ -953,7 +965,8 @@ out:
/*
* Set some bits on a range in the tree. This may require allocations or
- * sleeping, so the gfp mask is used to indicate what is allowed.
+ * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for
+ * GFP_NOWAIT.
*
* If any of the exclusive bits are set, this will fail with -EEXIST if some
* part of the range already has the desired bits set. The extent_state of the
@@ -968,7 +981,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, u64 *failed_start,
struct extent_state **failed_state,
struct extent_state **cached_state,
- struct extent_changeset *changeset, gfp_t mask)
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -978,7 +991,9 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
u32 exclusive_bits = (bits & EXTENT_LOCKED);
+ gfp_t mask;
+ set_gfp_mask_from_bits(&bits, &mask);
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
@@ -1188,10 +1203,10 @@ out:
}
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state, gfp_t mask)
+ u32 bits, struct extent_state **cached_state)
{
return __set_extent_bit(tree, start, end, bits, NULL, NULL,
- cached_state, NULL, mask);
+ cached_state, NULL);
}
/*
@@ -1687,8 +1702,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCKED));
- return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL,
- changeset, GFP_NOFS);
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1700,8 +1714,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCKED));
- return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
- changeset);
+ return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1711,7 +1724,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
- NULL, cached, NULL, GFP_NOFS);
+ NULL, cached, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -1733,7 +1746,7 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
- &failed_state, cached_state, NULL, GFP_NOFS);
+ &failed_state, cached_state, NULL);
while (err == -EEXIST) {
if (failed_start != start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -1743,7 +1756,7 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
&failed_state);
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
&failed_start, &failed_state,
- cached_state, NULL, GFP_NOFS);
+ cached_state, NULL);
}
return err;
}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 21766e49ec02..fbd3b275ab1c 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -43,6 +43,15 @@ enum {
* want the extent states to go away.
*/
ENUM_BIT(EXTENT_CLEAR_ALL_BITS),
+
+ /*
+ * This must be last.
+ *
+ * Bit not representing a state but a request for NOWAIT semantics,
+ * e.g. when allocating memory, and must be masked out from the other
+ * bits.
+ */
+ ENUM_BIT(EXTENT_NOWAIT)
};
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
@@ -127,22 +136,20 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached, gfp_t mask,
+ u32 bits, struct extent_state **cached,
struct extent_changeset *changeset);
static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits,
struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, bits, cached,
- GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, bits, cached, NULL);
}
static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
- GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
@@ -154,31 +161,13 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state, gfp_t mask);
-
-static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits)
-{
- return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
-}
-
-static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits)
-{
- return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
-}
+ u32 bits, struct extent_state **cached_state);
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
- cached_state, GFP_NOFS, NULL);
-}
-
-static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
+ cached_state, NULL);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -193,29 +182,6 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, u32 clear_bits,
struct extent_state **cached_state);
-static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, u32 extra_bits,
- struct extent_state **cached_state)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | extra_bits,
- cached_state, GFP_NOFS);
-}
-
-static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DEFRAG,
- cached_state, GFP_NOFS);
-}
-
-static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
- u64 end)
-{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
-}
-
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5cd289de4e92..911908ea5f6f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -73,8 +73,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
- set_extent_bits(&fs_info->excluded_extents, start, end,
- EXTENT_UPTODATE);
+ set_extent_bit(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE, NULL);
return 0;
}
@@ -402,7 +402,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
}
- btrfs_print_leaf((struct extent_buffer *)eb);
+ btrfs_print_leaf(eb);
btrfs_err(eb->fs_info,
"eb %llu iref 0x%lx invalid extent inline ref type %d",
eb->start, (unsigned long)iref, type);
@@ -1164,15 +1164,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
* should not happen at all.
*/
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_print_leaf(path->nodes[0]);
btrfs_crit(trans->fs_info,
-"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
- bytenr, num_bytes, root_objectid);
- if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
- WARN_ON(1);
- btrfs_crit(trans->fs_info,
- "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
- btrfs_print_leaf(path->nodes[0]);
- }
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
+ bytenr, num_bytes, root_objectid, path->slots[0]);
return -EUCLEAN;
}
update_inline_extent_backref(path, iref, refs_to_add, extent_op);
@@ -1208,11 +1203,11 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
{
int j, ret = 0;
u64 bytes_left, end;
- u64 aligned_start = ALIGN(start, 1 << 9);
+ u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
if (WARN_ON(start != aligned_start)) {
len -= aligned_start - start;
- len = round_down(len, 1 << 9);
+ len = round_down(len, 1 << SECTOR_SHIFT);
start = aligned_start;
}
@@ -1250,7 +1245,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
}
if (size) {
- ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+ ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
+ size >> SECTOR_SHIFT,
GFP_NOFS);
if (!ret)
*discarded_bytes += size;
@@ -1267,7 +1263,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
}
if (bytes_left) {
- ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+ ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
+ bytes_left >> SECTOR_SHIFT,
GFP_NOFS);
if (!ret)
*discarded_bytes += bytes_left;
@@ -1500,7 +1497,7 @@ out:
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
- int insert_reserved)
+ bool insert_reserved)
{
int ret = 0;
struct btrfs_delayed_data_ref *ref;
@@ -1650,7 +1647,7 @@ out:
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
- int insert_reserved)
+ bool insert_reserved)
{
int ret = 0;
struct btrfs_delayed_tree_ref *ref;
@@ -1690,7 +1687,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
- int insert_reserved)
+ bool insert_reserved)
{
int ret = 0;
@@ -1748,7 +1745,7 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref
struct btrfs_delayed_ref_head *head)
{
spin_lock(&delayed_refs->lock);
- head->processing = 0;
+ head->processing = false;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
btrfs_delayed_ref_unlock(head);
@@ -1900,7 +1897,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_extent_op *extent_op;
struct btrfs_delayed_ref_node *ref;
- int must_insert_reserved = 0;
+ bool must_insert_reserved;
int ret;
delayed_refs = &trans->transaction->delayed_refs;
@@ -1916,7 +1913,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
return -EAGAIN;
}
- ref->in_tree = 0;
rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
@@ -1943,7 +1939,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
- locked_ref->must_insert_reserved = 0;
+ locked_ref->must_insert_reserved = false;
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
@@ -2155,10 +2151,10 @@ out:
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb, u64 flags,
- int level)
+ struct extent_buffer *eb, u64 flags)
{
struct btrfs_delayed_extent_op *extent_op;
+ int level = btrfs_header_level(eb);
int ret;
extent_op = btrfs_alloc_delayed_extent_op();
@@ -2510,8 +2506,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
- bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
return 0;
}
@@ -2838,6 +2834,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
return ret;
}
+#define abort_and_dump(trans, path, fmt, args...) \
+({ \
+ btrfs_abort_transaction(trans, -EUCLEAN); \
+ btrfs_print_leaf(path->nodes[0]); \
+ btrfs_crit(trans->fs_info, fmt, ##args); \
+})
+
/*
* Drop one or more refs of @node.
*
@@ -2978,10 +2981,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!found_extent) {
if (iref) {
- btrfs_crit(info,
-"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
- btrfs_abort_transaction(trans, -EUCLEAN);
- goto err_dump;
+ abort_and_dump(trans, path,
+"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
+ path->slots[0]);
+ ret = -EUCLEAN;
+ goto out;
}
/* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, extent_root, path,
@@ -3029,11 +3033,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
if (ret) {
- btrfs_err(info,
- "umm, got %d back from search, was looking for %llu",
- ret, bytenr);
if (ret > 0)
btrfs_print_leaf(path->nodes[0]);
+ btrfs_err(info,
+ "umm, got %d back from search, was looking for %llu, slot %d",
+ ret, bytenr, path->slots[0]);
}
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -3042,12 +3046,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
extent_slot = path->slots[0];
}
} else if (WARN_ON(ret == -ENOENT)) {
- btrfs_print_leaf(path->nodes[0]);
- btrfs_err(info,
- "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
- bytenr, parent, root_objectid, owner_objectid,
- owner_offset);
- btrfs_abort_transaction(trans, ret);
+ abort_and_dump(trans, path,
+"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
+ bytenr, parent, root_objectid, owner_objectid,
+ owner_offset, path->slots[0]);
goto out;
} else {
btrfs_abort_transaction(trans, ret);
@@ -3067,14 +3069,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
+
if (item_size < sizeof(*ei) + sizeof(*bi)) {
- btrfs_crit(info,
-"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
- key.objectid, key.type, key.offset,
- owner_objectid, item_size,
- sizeof(*ei) + sizeof(*bi));
- btrfs_abort_transaction(trans, -EUCLEAN);
- goto err_dump;
+ abort_and_dump(trans, path,
+"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
+ key.objectid, key.type, key.offset,
+ path->slots[0], owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ ret = -EUCLEAN;
+ goto out;
}
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
@@ -3082,11 +3085,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
- btrfs_crit(info,
- "trying to drop %d refs but we only have %llu for bytenr %llu",
- refs_to_drop, refs, bytenr);
- btrfs_abort_transaction(trans, -EUCLEAN);
- goto err_dump;
+ abort_and_dump(trans, path,
+ "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
+ refs_to_drop, refs, bytenr, path->slots[0]);
+ ret = -EUCLEAN;
+ goto out;
}
refs -= refs_to_drop;
@@ -3099,10 +3102,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
*/
if (iref) {
if (!found_extent) {
- btrfs_crit(info,
-"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
- btrfs_abort_transaction(trans, -EUCLEAN);
- goto err_dump;
+ abort_and_dump(trans, path,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
+ path->slots[0]);
+ ret = -EUCLEAN;
+ goto out;
}
} else {
btrfs_set_extent_refs(leaf, ei, refs);
@@ -3121,21 +3125,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (found_extent) {
if (is_data && refs_to_drop !=
extent_data_ref_count(path, iref)) {
- btrfs_crit(info,
- "invalid refs_to_drop, current refs %u refs_to_drop %u",
- extent_data_ref_count(path, iref),
- refs_to_drop);
- btrfs_abort_transaction(trans, -EUCLEAN);
- goto err_dump;
+ abort_and_dump(trans, path,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop, path->slots[0]);
+ ret = -EUCLEAN;
+ goto out;
}
if (iref) {
if (path->slots[0] != extent_slot) {
- btrfs_crit(info,
-"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
- key.objectid, key.type,
- key.offset);
- btrfs_abort_transaction(trans, -EUCLEAN);
- goto err_dump;
+ abort_and_dump(trans, path,
+"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset, path->slots[0]);
+ ret = -EUCLEAN;
+ goto out;
}
} else {
/*
@@ -3145,10 +3149,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
*/
if (path->slots[0] != extent_slot + 1) {
- btrfs_crit(info,
- "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
- btrfs_abort_transaction(trans, -EUCLEAN);
- goto err_dump;
+ abort_and_dump(trans, path,
+ "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
+ path->slots[0]);
+ ret = -EUCLEAN;
+ goto out;
}
path->slots[0] = extent_slot;
num_to_del = 2;
@@ -3170,19 +3175,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
return ret;
-err_dump:
- /*
- * Leaf dump can take up a lot of log buffer, so we only do full leaf
- * dump for debug build.
- */
- if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
- btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
- path->slots[0], extent_slot);
- btrfs_print_leaf(path->nodes[0]);
- }
-
- btrfs_free_path(path);
- return -EUCLEAN;
}
/*
@@ -3219,7 +3211,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out;
btrfs_delete_ref_head(delayed_refs, head);
- head->processing = 0;
+ head->processing = false;
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -4804,7 +4796,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
!test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
lockdep_owner = BTRFS_FS_TREE_OBJECTID;
- /* btrfs_clean_tree_block() accesses generation field. */
+ /* btrfs_clear_buffer_dirty() accesses generation field. */
btrfs_set_header_generation(buf, trans->transid);
/*
@@ -4836,15 +4828,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* EXTENT bit to differentiate dirty pages.
*/
if (buf->log_index == 0)
- set_extent_dirty(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1, GFP_NOFS);
+ set_extent_bit(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_DIRTY, NULL);
else
- set_extent_new(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1);
+ set_extent_bit(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_NEW, NULL);
} else {
buf->log_index = -1;
- set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
- buf->start + buf->len - 1, GFP_NOFS);
+ set_extent_bit(&trans->transaction->dirty_pages, buf->start,
+ buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
}
/* this returns a buffer locked for blocking */
return buf;
@@ -5102,8 +5096,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_set_disk_extent_flags(trans, eb, flag,
- btrfs_header_level(eb));
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag);
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -5985,9 +5978,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
ret = btrfs_issue_discard(device->bdev, start, len,
&bytes);
if (!ret)
- set_extent_bits(&device->alloc_state, start,
- start + bytes - 1,
- CHUNK_TRIMMED);
+ set_extent_bit(&device->alloc_state, start,
+ start + bytes - 1, CHUNK_TRIMMED, NULL);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 0c958fc1b3b8..429d5c570061 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -141,7 +141,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb, u64 flags, int level);
+ struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a1adadd5d25d..a91d5ad27984 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -98,33 +98,16 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
*/
struct btrfs_bio_ctrl {
struct btrfs_bio *bbio;
- int mirror_num;
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
blk_opf_t opf;
btrfs_bio_end_io_t end_io_func;
struct writeback_control *wbc;
-
- /*
- * This is for metadata read, to provide the extra needed verification
- * info. This has to be provided for submit_one_bio(), as
- * submit_one_bio() can submit a bio if it ends at stripe boundary. If
- * no such parent_check is provided, the metadata can hit false alert at
- * endio time.
- */
- struct btrfs_tree_parent_check *parent_check;
-
- /*
- * Tell writepage not to lock the state bits for this range, it still
- * does the unlocking.
- */
- bool extent_locked;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_bio *bbio = bio_ctrl->bbio;
- int mirror_num = bio_ctrl->mirror_num;
if (!bbio)
return;
@@ -132,25 +115,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* Caller should ensure the bio has at least some range added */
ASSERT(bbio->bio.bi_iter.bi_size);
- if (!is_data_inode(&bbio->inode->vfs_inode)) {
- if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) {
- /*
- * For metadata read, we should have the parent_check,
- * and copy it to bbio for metadata verification.
- */
- ASSERT(bio_ctrl->parent_check);
- memcpy(&bbio->parent_check,
- bio_ctrl->parent_check,
- sizeof(struct btrfs_tree_parent_check));
- }
- bbio->bio.bi_opf |= REQ_META;
- }
-
if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
- btrfs_submit_compressed_read(bbio, mirror_num);
+ btrfs_submit_compressed_read(bbio);
else
- btrfs_submit_bio(bbio, mirror_num);
+ btrfs_submit_bio(bbio, 0);
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
@@ -248,8 +217,6 @@ static int process_one_page(struct btrfs_fs_info *fs_info,
if (page_ops & PAGE_SET_ORDERED)
btrfs_page_clamp_set_ordered(fs_info, page, start, len);
- if (page_ops & PAGE_SET_ERROR)
- btrfs_page_clamp_set_error(fs_info, page, start, len);
if (page_ops & PAGE_START_WRITEBACK) {
btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
btrfs_page_clamp_set_writeback(fs_info, page, start, len);
@@ -295,9 +262,6 @@ static int __process_pages_contig(struct address_space *mapping,
ASSERT(processed_end && *processed_end == start);
}
- if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index)
- mapping_set_error(mapping, -EIO);
-
folio_batch_init(&fbatch);
while (index <= end_index) {
int found_folios;
@@ -506,6 +470,15 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
start, end, page_ops, NULL);
}
+static bool btrfs_verify_page(struct page *page, u64 start)
+{
+ if (!fsverity_active(page->mapping->host) ||
+ PageUptodate(page) ||
+ start >= i_size_read(page->mapping->host))
+ return true;
+ return fsverity_verify_page(page);
+}
+
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
@@ -513,20 +486,10 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
ASSERT(page_offset(page) <= start &&
start + len <= page_offset(page) + PAGE_SIZE);
- if (uptodate) {
- if (fsverity_active(page->mapping->host) &&
- !PageError(page) &&
- !PageUptodate(page) &&
- start < i_size_read(page->mapping->host) &&
- !fsverity_verify_page(page)) {
- btrfs_page_set_error(fs_info, page, start, len);
- } else {
- btrfs_page_set_uptodate(fs_info, page, start, len);
- }
- } else {
+ if (uptodate && btrfs_verify_page(page, start))
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ else
btrfs_page_clear_uptodate(fs_info, page, start, len);
- btrfs_page_set_error(fs_info, page, start, len);
- }
if (!btrfs_is_subpage(fs_info, page))
unlock_page(page);
@@ -554,7 +517,6 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
len = end + 1 - start;
btrfs_page_clear_uptodate(fs_info, page, start, len);
- btrfs_page_set_error(fs_info, page, start, len);
ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
@@ -574,8 +536,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
- u64 start;
- u64 end;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -584,6 +544,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
+ u64 start = page_offset(page) + bvec->bv_offset;
+ u32 len = bvec->bv_len;
/* Our read/write should always be sector aligned. */
if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
@@ -595,12 +557,12 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)
"incomplete page write with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
-
- end_extent_writepage(page, error, start, end);
-
- btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
+ btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
+ if (error) {
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ mapping_set_error(page->mapping, error);
+ }
+ btrfs_page_clear_writeback(fs_info, page, start, len);
}
bio_put(bio);
@@ -686,35 +648,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
}
/*
- * Find extent buffer for a givne bytenr.
- *
- * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
- * in endio context.
- */
-static struct extent_buffer *find_extent_buffer_readpage(
- struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
-{
- struct extent_buffer *eb;
-
- /*
- * For regular sectorsize, we can use page->private to grab extent
- * buffer
- */
- if (fs_info->nodesize >= PAGE_SIZE) {
- ASSERT(PagePrivate(page) && page->private);
- return (struct extent_buffer *)page->private;
- }
-
- /* For subpage case, we need to lookup buffer radix tree */
- rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- bytenr >> fs_info->sectorsize_bits);
- rcu_read_unlock();
- ASSERT(eb);
- return eb;
-}
-
-/*
* after a readpage IO is done, we need to:
* clear the uptodate bits on error
* set the uptodate bits if things worked
@@ -735,7 +668,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
* larger than UINT_MAX, u32 here is enough.
*/
u32 bio_offset = 0;
- int mirror;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -775,11 +707,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
end = start + bvec->bv_len - 1;
len = bvec->bv_len;
- mirror = bbio->mirror_num;
- if (uptodate && !is_data_inode(inode) &&
- btrfs_validate_metadata_buffer(bbio, page, start, end, mirror))
- uptodate = false;
-
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
@@ -800,19 +727,12 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
zero_user_segment(page, zero_start,
offset_in_page(end) + 1);
}
- } else if (!is_data_inode(inode)) {
- struct extent_buffer *eb;
-
- eb = find_extent_buffer_readpage(fs_info, page, start);
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- eb->read_mirror = mirror;
- atomic_dec(&eb->io_pages);
}
/* Update page status and unlock. */
end_page_read(page, uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, PageUptodate(page));
+ start, end, uptodate);
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
@@ -906,13 +826,8 @@ static void alloc_new_bio(struct btrfs_inode *inode,
bio_ctrl->bbio = bbio;
bio_ctrl->len_to_oe_boundary = U32_MAX;
- /*
- * Limit the extent to the ordered boundary for Zone Append.
- * Compressed bios aren't submitted directly, so it doesn't apply to
- * them.
- */
- if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE &&
- btrfs_use_zone_append(bbio)) {
+ /* Limit data write bios to the ordered boundary. */
+ if (bio_ctrl->wbc) {
struct btrfs_ordered_extent *ordered;
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
@@ -920,11 +835,9 @@ static void alloc_new_bio(struct btrfs_inode *inode,
bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
ordered->file_offset +
ordered->disk_num_bytes - file_offset);
- btrfs_put_ordered_extent(ordered);
+ bbio->ordered = ordered;
}
- }
- if (bio_ctrl->wbc) {
/*
* Pick the last added device to support cgroup writeback. For
* multi-device file systems this means blk-cgroup policies have
@@ -1125,7 +1038,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_extent(tree, start, end, NULL);
- btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
unlock_page(page);
return ret;
}
@@ -1329,11 +1241,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
}
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
delalloc_end, &page_started, &nr_written, wbc);
- if (ret) {
- btrfs_page_set_error(inode->root->fs_info, page,
- page_offset(page), PAGE_SIZE);
+ if (ret)
return ret;
- }
+
/*
* delalloc_end is already one less than the total length, so
* we don't subtract one from PAGE_SIZE
@@ -1438,7 +1348,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct extent_map *em;
int ret = 0;
int nr = 0;
- bool compressed;
ret = btrfs_writepage_cow_fixup(page);
if (ret) {
@@ -1448,12 +1357,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
return 1;
}
- /*
- * we don't want to touch the inode after unlocking the page,
- * so we update the mapping writeback index now
- */
- bio_ctrl->wbc->nr_to_write--;
-
bio_ctrl->end_io_func = end_bio_extent_writepage;
while (cur <= end) {
u64 disk_bytenr;
@@ -1486,7 +1389,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR(em)) {
- btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
goto out_error;
}
@@ -1497,10 +1399,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
ASSERT(cur < end);
ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
+
block_start = em->block_start;
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
disk_bytenr = em->block_start + extent_offset;
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(block_start != EXTENT_MAP_HOLE);
+ ASSERT(block_start != EXTENT_MAP_INLINE);
+
/*
* Note that em_end from extent_map_end() and dirty_range_end from
* find_next_dirty_byte() are all exclusive
@@ -1509,22 +1415,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
free_extent_map(em);
em = NULL;
- /*
- * compressed and inline extents are written through other
- * paths in the FS
- */
- if (compressed || block_start == EXTENT_MAP_HOLE ||
- block_start == EXTENT_MAP_INLINE) {
- if (compressed)
- nr++;
- else
- btrfs_writepage_endio_finish_ordered(inode,
- page, cur, cur + iosize - 1, true);
- btrfs_page_clear_dirty(fs_info, page, cur, iosize);
- cur += iosize;
- continue;
- }
-
btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
btrfs_err(inode->root->fs_info,
@@ -1572,7 +1462,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl
{
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u64 page_start = page_offset(page);
const u64 page_end = page_start + PAGE_SIZE - 1;
int ret;
@@ -1585,9 +1474,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl
WARN_ON(!PageLocked(page));
- btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
- page_offset(page), PAGE_SIZE);
-
pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
@@ -1600,77 +1486,30 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl
memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
ret = set_page_extent_mapped(page);
- if (ret < 0) {
- SetPageError(page);
+ if (ret < 0)
goto done;
- }
- if (!bio_ctrl->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
- if (ret == 1)
- return 0;
- if (ret)
- goto done;
- }
+ ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
+ if (ret == 1)
+ return 0;
+ if (ret)
+ goto done;
ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
if (ret == 1)
return 0;
+ bio_ctrl->wbc->nr_to_write--;
+
done:
if (nr == 0) {
/* make sure the mapping tag for page dirty gets cleared */
set_page_writeback(page);
end_page_writeback(page);
}
- /*
- * Here we used to have a check for PageError() and then set @ret and
- * call end_extent_writepage().
- *
- * But in fact setting @ret here will cause different error paths
- * between subpage and regular sectorsize.
- *
- * For regular page size, we never submit current page, but only add
- * current page to current bio.
- * The bio submission can only happen in next page.
- * Thus if we hit the PageError() branch, @ret is already set to
- * non-zero value and will not get updated for regular sectorsize.
- *
- * But for subpage case, it's possible we submit part of current page,
- * thus can get PageError() set by submitted bio of the same page,
- * while our @ret is still 0.
- *
- * So here we unify the behavior and don't set @ret.
- * Error can still be properly passed to higher layer as page will
- * be set error, here we just don't handle the IO failure.
- *
- * NOTE: This is just a hotfix for subpage.
- * The root fix will be properly ending ordered extent when we hit
- * an error during writeback.
- *
- * But that needs a bigger refactoring, as we not only need to grab the
- * submitted OE, but also need to know exactly at which bytenr we hit
- * the error.
- * Currently the full page based __extent_writepage_io() is not
- * capable of that.
- */
- if (PageError(page))
+ if (ret)
end_extent_writepage(page, ret, page_start, page_end);
- if (bio_ctrl->extent_locked) {
- struct writeback_control *wbc = bio_ctrl->wbc;
-
- /*
- * If bio_ctrl->extent_locked, it's from extent_write_locked_range(),
- * the page can either be locked by lock_page() or
- * process_one_page().
- * Let btrfs_page_unlock_writer() handle both cases.
- */
- ASSERT(wbc);
- btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
- wbc->range_end + 1 - wbc->range_start);
- } else {
- unlock_page(page);
- }
+ unlock_page(page);
ASSERT(ret <= 0);
return ret;
}
@@ -1681,52 +1520,26 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
TASK_UNINTERRUPTIBLE);
}
-static void end_extent_buffer_writeback(struct extent_buffer *eb)
-{
- clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
-}
-
/*
* Lock extent buffer status and pages for writeback.
*
- * May try to flush write bio if we can't get the lock.
- *
- * Return 0 if the extent buffer doesn't need to be submitted.
- * (E.g. the extent buffer is not dirty)
- * Return >0 is the extent buffer is submitted to bio.
- * Return <0 if something went wrong, no page is locked.
+ * Return %false if the extent buffer doesn't need to be submitted (e.g. the
+ * extent buffer is not dirty)
+ * Return %true is the extent buffer is submitted to bio.
*/
-static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
- struct btrfs_bio_ctrl *bio_ctrl)
+static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages;
- int flush = 0;
- int ret = 0;
+ bool ret = false;
- if (!btrfs_try_tree_write_lock(eb)) {
- submit_write_bio(bio_ctrl, 0);
- flush = 1;
- btrfs_tree_lock(eb);
- }
-
- if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+ btrfs_tree_lock(eb);
+ while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
btrfs_tree_unlock(eb);
- if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL)
- return 0;
- if (!flush) {
- submit_write_bio(bio_ctrl, 0);
- flush = 1;
- }
- while (1) {
- wait_on_extent_buffer_writeback(eb);
- btrfs_tree_lock(eb);
- if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
- break;
- btrfs_tree_unlock(eb);
- }
+ if (wbc->sync_mode != WB_SYNC_ALL)
+ return false;
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_lock(eb);
}
/*
@@ -1742,45 +1555,19 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-eb->len,
fs_info->dirty_metadata_batch);
- ret = 1;
+ ret = true;
} else {
spin_unlock(&eb->refs_lock);
}
-
btrfs_tree_unlock(eb);
-
- /*
- * Either we don't need to submit any tree block, or we're submitting
- * subpage eb.
- * Subpage metadata doesn't use page locking at all, so we can skip
- * the page locking.
- */
- if (!ret || fs_info->nodesize < PAGE_SIZE)
- return ret;
-
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- if (!trylock_page(p)) {
- if (!flush) {
- submit_write_bio(bio_ctrl, 0);
- flush = 1;
- }
- lock_page(p);
- }
- }
-
return ret;
}
-static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
+static void set_btree_ioerr(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- btrfs_page_set_error(fs_info, page, eb->start, eb->len);
- if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
- return;
+ set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
/*
* A read may stumble upon this buffer later, make sure that it gets an
@@ -1794,7 +1581,7 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
* return a 0 because we are readonly if we don't modify the err seq for
* the superblock.
*/
- mapping_set_error(page->mapping, -EIO);
+ mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO);
/*
* If writeback for a btree extent that doesn't belong to a log tree
@@ -1869,101 +1656,34 @@ static struct extent_buffer *find_extent_buffer_nolock(
return NULL;
}
-/*
- * The endio function for subpage extent buffer write.
- *
- * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
- * after all extent buffers in the page has finished their writeback.
- */
-static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio)
+static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
{
- struct bio *bio = &bbio->bio;
- struct btrfs_fs_info *fs_info;
- struct bio_vec *bvec;
+ struct extent_buffer *eb = bbio->private;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ bool uptodate = !bbio->bio.bi_status;
struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+ u32 bio_offset = 0;
- fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
- ASSERT(fs_info->nodesize < PAGE_SIZE);
+ if (!uptodate)
+ set_btree_ioerr(eb);
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
+ bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ u64 start = eb->start + bio_offset;
struct page *page = bvec->bv_page;
- u64 bvec_start = page_offset(page) + bvec->bv_offset;
- u64 bvec_end = bvec_start + bvec->bv_len - 1;
- u64 cur_bytenr = bvec_start;
-
- ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
-
- /* Iterate through all extent buffers in the range */
- while (cur_bytenr <= bvec_end) {
- struct extent_buffer *eb;
- int done;
-
- /*
- * Here we can't use find_extent_buffer(), as it may
- * try to lock eb->refs_lock, which is not safe in endio
- * context.
- */
- eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
- ASSERT(eb);
-
- cur_bytenr = eb->start + eb->len;
-
- ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
- done = atomic_dec_and_test(&eb->io_pages);
- ASSERT(done);
-
- if (bio->bi_status ||
- test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
- ClearPageUptodate(page);
- set_btree_ioerr(page, eb);
- }
+ u32 len = bvec->bv_len;
- btrfs_subpage_clear_writeback(fs_info, page, eb->start,
- eb->len);
- end_extent_buffer_writeback(eb);
- /*
- * free_extent_buffer() will grab spinlock which is not
- * safe in endio context. Thus here we manually dec
- * the ref.
- */
- atomic_dec(&eb->refs);
- }
+ if (!uptodate)
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_clear_writeback(fs_info, page, start, len);
+ bio_offset += len;
}
- bio_put(bio);
-}
-static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio)
-{
- struct bio *bio = &bbio->bio;
- struct bio_vec *bvec;
- struct extent_buffer *eb;
- int done;
- struct bvec_iter_all iter_all;
-
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
-
- eb = (struct extent_buffer *)page->private;
- BUG_ON(!eb);
- done = atomic_dec_and_test(&eb->io_pages);
-
- if (bio->bi_status ||
- test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
- ClearPageUptodate(page);
- set_btree_ioerr(page, eb);
- }
-
- end_page_writeback(page);
-
- if (!done)
- continue;
-
- end_extent_buffer_writeback(eb);
- }
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
- bio_put(bio);
+ bio_put(&bbio->bio);
}
static void prepare_eb_write(struct extent_buffer *eb)
@@ -1973,7 +1693,6 @@ static void prepare_eb_write(struct extent_buffer *eb)
unsigned long end;
clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
- atomic_set(&eb->io_pages, num_extent_pages(eb));
/* Set btree blocks beyond nritems with 0 to avoid stale content */
nritems = btrfs_header_nritems(eb);
@@ -1995,63 +1714,49 @@ static void prepare_eb_write(struct extent_buffer *eb)
}
}
-/*
- * Unlike the work in write_one_eb(), we rely completely on extent locking.
- * Page locking is only utilized at minimum to keep the VMM code happy.
- */
-static void write_one_subpage_eb(struct extent_buffer *eb,
- struct btrfs_bio_ctrl *bio_ctrl)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page = eb->pages[0];
- bool no_dirty_ebs = false;
-
- prepare_eb_write(eb);
-
- /* clear_page_dirty_for_io() in subpage helper needs page locked */
- lock_page(page);
- btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
-
- /* Check if this is the last dirty bit to update nr_written */
- no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
- eb->start, eb->len);
- if (no_dirty_ebs)
- clear_page_dirty_for_io(page);
-
- bio_ctrl->end_io_func = end_bio_subpage_eb_writepage;
-
- submit_extent_page(bio_ctrl, eb->start, page, eb->len,
- eb->start - page_offset(page));
- unlock_page(page);
- /*
- * Submission finished without problem, if no range of the page is
- * dirty anymore, we have submitted a page. Update nr_written in wbc.
- */
- if (no_dirty_ebs)
- bio_ctrl->wbc->nr_to_write--;
-}
-
static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
- struct btrfs_bio_ctrl *bio_ctrl)
+ struct writeback_control *wbc)
{
- u64 disk_bytenr = eb->start;
- int i, num_pages;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_bio *bbio;
prepare_eb_write(eb);
- bio_ctrl->end_io_func = end_bio_extent_buffer_writepage;
-
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- clear_page_dirty_for_io(p);
- set_page_writeback(p);
- submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0);
- disk_bytenr += PAGE_SIZE;
- bio_ctrl->wbc->nr_to_write--;
+ bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
+ REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
+ eb->fs_info, extent_buffer_write_end_io, eb);
+ bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
+ bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
+ wbc_init_bio(wbc, &bbio->bio);
+ bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
+ bbio->file_offset = eb->start;
+ if (fs_info->nodesize < PAGE_SIZE) {
+ struct page *p = eb->pages[0];
+
+ lock_page(p);
+ btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len);
+ if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start,
+ eb->len)) {
+ clear_page_dirty_for_io(p);
+ wbc->nr_to_write--;
+ }
+ __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p));
+ wbc_account_cgroup_owner(wbc, p, eb->len);
unlock_page(p);
+ } else {
+ for (int i = 0; i < num_extent_pages(eb); i++) {
+ struct page *p = eb->pages[i];
+
+ lock_page(p);
+ clear_page_dirty_for_io(p);
+ set_page_writeback(p);
+ __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0);
+ wbc_account_cgroup_owner(wbc, p, PAGE_SIZE);
+ wbc->nr_to_write--;
+ unlock_page(p);
+ }
}
+ btrfs_submit_bio(bbio, 0);
}
/*
@@ -2068,14 +1773,13 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
* Return >=0 for the number of submitted extent buffers.
* Return <0 for fatal error.
*/
-static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
+static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
- int ret;
/* Lock and write each dirty extent buffers in the range */
while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
@@ -2121,25 +1825,13 @@ static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
if (!eb)
continue;
- ret = lock_extent_buffer_for_io(eb, bio_ctrl);
- if (ret == 0) {
- free_extent_buffer(eb);
- continue;
+ if (lock_extent_buffer_for_io(eb, wbc)) {
+ write_one_eb(eb, wbc);
+ submitted++;
}
- if (ret < 0) {
- free_extent_buffer(eb);
- goto cleanup;
- }
- write_one_subpage_eb(eb, bio_ctrl);
free_extent_buffer(eb);
- submitted++;
}
return submitted;
-
-cleanup:
- /* We hit error, end bio for the submitted extent buffers */
- submit_write_bio(bio_ctrl, ret);
- return ret;
}
/*
@@ -2162,7 +1854,7 @@ cleanup:
* previous call.
* Return <0 for fatal error.
*/
-static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,
+static int submit_eb_page(struct page *page, struct writeback_control *wbc,
struct extent_buffer **eb_context)
{
struct address_space *mapping = page->mapping;
@@ -2174,7 +1866,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(page, bio_ctrl);
+ return submit_eb_subpage(page, wbc);
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
@@ -2207,8 +1899,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,
* If for_sync, this hole will be filled with
* trasnsaction commit.
*/
- if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL &&
- !bio_ctrl->wbc->for_sync)
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
ret = -EAGAIN;
else
ret = 0;
@@ -2218,13 +1909,12 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,
*eb_context = eb;
- ret = lock_extent_buffer_for_io(eb, bio_ctrl);
- if (ret <= 0) {
+ if (!lock_extent_buffer_for_io(eb, wbc)) {
btrfs_revert_meta_write_pointer(cache, eb);
if (cache)
btrfs_put_block_group(cache);
free_extent_buffer(eb);
- return ret;
+ return 0;
}
if (cache) {
/*
@@ -2233,7 +1923,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,
btrfs_schedule_zone_finish_bg(cache, eb);
btrfs_put_block_group(cache);
}
- write_one_eb(eb, bio_ctrl);
+ write_one_eb(eb, wbc);
free_extent_buffer(eb);
return 1;
}
@@ -2242,11 +1932,6 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_buffer *eb_context = NULL;
- struct btrfs_bio_ctrl bio_ctrl = {
- .wbc = wbc,
- .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
- .extent_locked = 0,
- };
struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
int done = 0;
@@ -2288,7 +1973,7 @@ retry:
for (i = 0; i < nr_folios; i++) {
struct folio *folio = fbatch.folios[i];
- ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context);
+ ret = submit_eb_page(&folio->page, wbc, &eb_context);
if (ret == 0)
continue;
if (ret < 0) {
@@ -2349,8 +2034,6 @@ retry:
ret = 0;
if (!ret && BTRFS_FS_ERROR(fs_info))
ret = -EROFS;
- submit_write_bio(&bio_ctrl, ret);
-
btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
@@ -2520,38 +2203,31 @@ retry:
* already been ran (aka, ordered extent inserted) and all pages are still
* locked.
*/
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
+ struct writeback_control *wbc)
{
bool found_error = false;
int first_error = 0;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ loff_t i_size = i_size_read(inode);
u64 cur = start;
- unsigned long nr_pages;
- const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
- struct writeback_control wbc_writepages = {
- .sync_mode = WB_SYNC_ALL,
- .range_start = start,
- .range_end = end + 1,
- .no_cgroup_owner = 1,
- };
struct btrfs_bio_ctrl bio_ctrl = {
- .wbc = &wbc_writepages,
- /* We're called from an async helper function */
- .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT |
- wbc_to_write_flags(&wbc_writepages),
- .extent_locked = 1,
+ .wbc = wbc,
+ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
};
+ if (wbc->no_cgroup_owner)
+ bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT;
+
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
- nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
- PAGE_SHIFT;
- wbc_writepages.nr_to_write = nr_pages * 2;
- wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
while (cur <= end) {
u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+ struct page *page;
+ int nr = 0;
page = find_get_page(mapping, cur >> PAGE_SHIFT);
/*
@@ -2562,19 +2238,31 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
ASSERT(PageLocked(page));
ASSERT(PageDirty(page));
clear_page_dirty_for_io(page);
- ret = __extent_writepage(page, &bio_ctrl);
- ASSERT(ret <= 0);
+
+ ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl,
+ i_size, &nr);
+ if (ret == 1)
+ goto next_page;
+
+ /* Make sure the mapping tag for page dirty gets cleared. */
+ if (nr == 0) {
+ set_page_writeback(page);
+ end_page_writeback(page);
+ }
+ if (ret)
+ end_extent_writepage(page, ret, cur, cur_end);
+ btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur);
if (ret < 0) {
found_error = true;
first_error = ret;
}
+next_page:
put_page(page);
cur = cur_end + 1;
}
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
- wbc_detach_inode(&wbc_writepages);
if (found_error)
return first_error;
return ret;
@@ -2588,7 +2276,6 @@ int extent_writepages(struct address_space *mapping,
struct btrfs_bio_ctrl bio_ctrl = {
.wbc = wbc,
.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
- .extent_locked = 0,
};
/*
@@ -2679,8 +2366,7 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end, clear_bits, NULL,
- mask, NULL);
+ ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -3421,10 +3107,9 @@ static void __free_extent_buffer(struct extent_buffer *eb)
kmem_cache_free(extent_buffer_cache, eb);
}
-int extent_buffer_under_io(const struct extent_buffer *eb)
+static int extent_buffer_under_io(const struct extent_buffer *eb)
{
- return (atomic_read(&eb->io_pages) ||
- test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+ return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
@@ -3557,11 +3242,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
init_rwsem(&eb->lock);
btrfs_leak_debug_add_eb(eb);
- INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
- atomic_set(&eb->io_pages, 0);
ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
@@ -3678,9 +3361,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
* adequately protected by the refcount, but the TREE_REF bit and
* its corresponding reference are not. To protect against this
* class of races, we call check_buffer_tree_ref from the codepaths
- * which trigger io after they set eb->io_pages. Note that once io is
- * initiated, TREE_REF can no longer be cleared, so that is the
- * moment at which any such race is best fixed.
+ * which trigger io. Note that once io is initiated, TREE_REF can no
+ * longer be cleared, so that is the moment at which any such race is
+ * best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -3939,7 +3622,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
eb->pages[i] = p;
- if (!PageUptodate(p))
+ if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
uptodate = 0;
/*
@@ -4142,13 +3825,12 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
continue;
lock_page(page);
btree_clear_page_dirty(page);
- ClearPageError(page);
unlock_page(page);
}
WARN_ON(atomic_read(&eb->refs) == 0);
}
-bool set_extent_buffer_dirty(struct extent_buffer *eb)
+void set_extent_buffer_dirty(struct extent_buffer *eb)
{
int i;
int num_pages;
@@ -4183,13 +3865,14 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
eb->start, eb->len);
if (subpage)
unlock_page(eb->pages[0]);
+ percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
+ eb->len,
+ eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
ASSERT(PageDirty(eb->pages[i]));
#endif
-
- return was_dirty;
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
@@ -4242,84 +3925,54 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
}
-static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
- int mirror_num,
- struct btrfs_tree_parent_check *check)
+static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
{
+ struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct extent_io_tree *io_tree;
- struct page *page = eb->pages[0];
- struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = {
- .opf = REQ_OP_READ,
- .mirror_num = mirror_num,
- .parent_check = check,
- };
- int ret;
+ bool uptodate = !bbio->bio.bi_status;
+ struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+ u32 bio_offset = 0;
- ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
- ASSERT(PagePrivate(page));
- ASSERT(check);
- io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+ eb->read_mirror = bbio->mirror_num;
- if (wait == WAIT_NONE) {
- if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state))
- return -EAGAIN;
- } else {
- ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
- if (ret < 0)
- return ret;
- }
+ if (uptodate &&
+ btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0)
+ uptodate = false;
- if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
- PageUptodate(page) ||
- btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
- set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
- return 0;
+ if (uptodate) {
+ set_extent_buffer_uptodate(eb);
+ } else {
+ clear_extent_buffer_uptodate(eb);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- eb->read_mirror = 0;
- atomic_set(&eb->io_pages, 1);
- check_buffer_tree_ref(eb);
- bio_ctrl.end_io_func = end_bio_extent_readpage;
+ bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ u64 start = eb->start + bio_offset;
+ struct page *page = bvec->bv_page;
+ u32 len = bvec->bv_len;
- btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
+ if (uptodate)
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ else
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
- btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
- submit_extent_page(&bio_ctrl, eb->start, page, eb->len,
- eb->start - page_offset(page));
- submit_one_bio(&bio_ctrl);
- if (wait != WAIT_COMPLETE) {
- free_extent_state(cached_state);
- return 0;
+ bio_offset += len;
}
- wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
- EXTENT_LOCKED, &cached_state);
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- return -EIO;
- return 0;
+ clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ free_extent_buffer(eb);
+
+ bio_put(&bbio->bio);
}
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
struct btrfs_tree_parent_check *check)
{
- int i;
- struct page *page;
- int locked_pages = 0;
- int all_uptodate = 1;
- int num_pages;
- unsigned long num_reads = 0;
- struct btrfs_bio_ctrl bio_ctrl = {
- .opf = REQ_OP_READ,
- .mirror_num = mirror_num,
- .parent_check = check,
- };
+ int num_pages = num_extent_pages(eb), i;
+ struct btrfs_bio *bbio;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -4332,87 +3985,39 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
return -EIO;
- if (eb->fs_info->nodesize < PAGE_SIZE)
- return read_extent_buffer_subpage(eb, wait, mirror_num, check);
-
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (wait == WAIT_NONE) {
- /*
- * WAIT_NONE is only utilized by readahead. If we can't
- * acquire the lock atomically it means either the eb
- * is being read out or under modification.
- * Either way the eb will be or has been cached,
- * readahead can exit safely.
- */
- if (!trylock_page(page))
- goto unlock_exit;
- } else {
- lock_page(page);
- }
- locked_pages++;
- }
- /*
- * We need to firstly lock all pages to make sure that
- * the uptodate bit of our pages won't be affected by
- * clear_extent_buffer_uptodate().
- */
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!PageUptodate(page)) {
- num_reads++;
- all_uptodate = 0;
- }
- }
-
- if (all_uptodate) {
- set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- goto unlock_exit;
- }
+ /* Someone else is already reading the buffer, just wait for it. */
+ if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
+ goto done;
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
- atomic_set(&eb->io_pages, num_reads);
- /*
- * It is possible for release_folio to clear the TREE_REF bit before we
- * set io_pages. See check_buffer_tree_ref for a more detailed comment.
- */
check_buffer_tree_ref(eb);
- bio_ctrl.end_io_func = end_bio_extent_readpage;
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
-
- if (!PageUptodate(page)) {
- ClearPageError(page);
- submit_extent_page(&bio_ctrl, page_offset(page), page,
- PAGE_SIZE, 0);
- } else {
- unlock_page(page);
- }
+ atomic_inc(&eb->refs);
+
+ bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
+ REQ_OP_READ | REQ_META, eb->fs_info,
+ extent_buffer_read_end_io, eb);
+ bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
+ bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
+ bbio->file_offset = eb->start;
+ memcpy(&bbio->parent_check, check, sizeof(*check));
+ if (eb->fs_info->nodesize < PAGE_SIZE) {
+ __bio_add_page(&bbio->bio, eb->pages[0], eb->len,
+ eb->start - page_offset(eb->pages[0]));
+ } else {
+ for (i = 0; i < num_pages; i++)
+ __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
}
+ btrfs_submit_bio(bbio, mirror_num);
- submit_one_bio(&bio_ctrl);
-
- if (wait != WAIT_COMPLETE)
- return 0;
-
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- wait_on_page_locked(page);
- if (!PageUptodate(page))
+done:
+ if (wait == WAIT_COMPLETE) {
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return -EIO;
}
return 0;
-
-unlock_exit:
- while (locked_pages > 0) {
- locked_pages--;
- page = eb->pages[locked_pages];
- unlock_page(page);
- }
- return 0;
}
static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
@@ -4561,18 +4166,17 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
* looked up. We don't want to complain in this case, as the page was
* valid before, we just didn't write it out. Instead we want to catch
* the case where we didn't actually read the block properly, which
- * would have !PageUptodate && !PageError, as we clear PageError before
- * reading.
+ * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR.
*/
- if (fs_info->nodesize < PAGE_SIZE) {
- bool uptodate, error;
+ if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+ return;
- uptodate = btrfs_subpage_test_uptodate(fs_info, page,
- eb->start, eb->len);
- error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
- WARN_ON(!uptodate && !error);
+ if (fs_info->nodesize < PAGE_SIZE) {
+ if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page,
+ eb->start, eb->len)))
+ btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);
} else {
- WARN_ON(!PageUptodate(page) && !PageError(page));
+ WARN_ON(!PageUptodate(page));
}
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4341ad978fb8..c5fae3a7d911 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,6 +29,8 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
+ /* Indicate that extent buffer pages a being read */
+ EXTENT_BUFFER_READING,
};
/* these are flags for __process_pages_contig */
@@ -38,7 +40,6 @@ enum {
ENUM_BIT(PAGE_START_WRITEBACK),
ENUM_BIT(PAGE_END_WRITEBACK),
ENUM_BIT(PAGE_SET_ORDERED),
- ENUM_BIT(PAGE_SET_ERROR),
ENUM_BIT(PAGE_LOCK),
};
@@ -79,7 +80,6 @@ struct extent_buffer {
struct btrfs_fs_info *fs_info;
spinlock_t refs_lock;
atomic_t refs;
- atomic_t io_pages;
int read_mirror;
struct rcu_head rcu_head;
pid_t lock_owner;
@@ -89,7 +89,6 @@ struct extent_buffer {
struct rw_semaphore lock;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
- struct list_head release_list;
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
@@ -179,7 +178,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int btrfs_read_folio(struct file *file, struct folio *folio);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
+ struct writeback_control *wbc);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
@@ -262,10 +262,9 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
unsigned long start, unsigned long pos,
unsigned long len);
-bool set_extent_buffer_dirty(struct extent_buffer *eb);
+void set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-int extent_buffer_under_io(const struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 138afa955370..0cdb3e86f29b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -364,8 +364,9 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
- set_extent_bits_nowait(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1, bits);
+ set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
}
}
@@ -380,8 +381,9 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
struct btrfs_device *device = stripe->dev;
__clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1, bits,
- NULL, GFP_NOWAIT, NULL);
+ stripe->physical + stripe_size - 1,
+ bits | EXTENT_NOWAIT,
+ NULL, NULL);
}
}
@@ -502,10 +504,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
RB_CLEAR_NODE(&em->rb_node);
}
-void replace_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *cur,
- struct extent_map *new,
- int modified)
+static void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified)
{
lockdep_assert_held_write(&tree->lock);
@@ -959,3 +961,95 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
return ret;
}
+
+/*
+ * Split off the first pre bytes from the extent_map at [start, start + len],
+ * and set the block_start for it to new_logical.
+ *
+ * This function is used when an ordered_extent needs to be split.
+ */
+int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
+ u64 new_logical)
+{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ struct extent_map *split_pre = NULL;
+ struct extent_map *split_mid = NULL;
+ int ret = 0;
+ unsigned long flags;
+
+ ASSERT(pre != 0);
+ ASSERT(pre < len);
+
+ split_pre = alloc_extent_map();
+ if (!split_pre)
+ return -ENOMEM;
+ split_mid = alloc_extent_map();
+ if (!split_mid) {
+ ret = -ENOMEM;
+ goto out_free_pre;
+ }
+
+ lock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ASSERT(em->len == len);
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
+
+ flags = em->flags;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ /* First, replace the em with a new extent_map starting from * em->start */
+ split_pre->start = em->start;
+ split_pre->len = pre;
+ split_pre->orig_start = split_pre->start;
+ split_pre->block_start = new_logical;
+ split_pre->block_len = split_pre->len;
+ split_pre->orig_block_len = split_pre->block_len;
+ split_pre->ram_bytes = split_pre->len;
+ split_pre->flags = flags;
+ split_pre->compress_type = em->compress_type;
+ split_pre->generation = em->generation;
+
+ replace_extent_mapping(em_tree, em, split_pre, 1);
+
+ /*
+ * Now we only have an extent_map at:
+ * [em->start, em->start + pre]
+ */
+
+ /* Insert the middle extent_map. */
+ split_mid->start = em->start + pre;
+ split_mid->len = em->len - pre;
+ split_mid->orig_start = split_mid->start;
+ split_mid->block_start = em->block_start + pre;
+ split_mid->block_len = split_mid->len;
+ split_mid->orig_block_len = split_mid->block_len;
+ split_mid->ram_bytes = split_mid->len;
+ split_mid->flags = flags;
+ split_mid->compress_type = em->compress_type;
+ split_mid->generation = em->generation;
+ add_extent_mapping(em_tree, split_mid, 1);
+
+ /* Once for us */
+ free_extent_map(em);
+ /* Once for the tree */
+ free_extent_map(em);
+
+out_unlock:
+ write_unlock(&em_tree->lock);
+ unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ free_extent_map(split_mid);
+out_free_pre:
+ free_extent_map(split_pre);
+ return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ad311864272a..35d27c756e08 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -90,10 +90,8 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
-void replace_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *cur,
- struct extent_map *new,
- int modified);
+int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
+ u64 new_logical);
struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d1cd0a692f8e..696bf695d8eb 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -94,8 +94,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return set_extent_bits(&inode->file_extent_tree, start, start + len - 1,
- EXTENT_DIRTY);
+ return set_extent_bit(&inode->file_extent_tree, start, start + len - 1,
+ EXTENT_DIRTY, NULL);
}
/*
@@ -438,9 +438,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset = bbio->file_offset + bio_offset;
- set_extent_bits(&inode->io_tree, file_offset,
- file_offset + sectorsize - 1,
- EXTENT_NODATASUM);
+ set_extent_bit(&inode->io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, NULL);
} else {
btrfs_warn_rl(fs_info,
"csum hole found for disk bytenr range [%llu, %llu)",
@@ -560,8 +560,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
goto fail;
}
- sums->bytenr = start;
- sums->len = (int)size;
+ sums->logical = start;
+ sums->len = size;
offset = bytes_to_csum_size(fs_info, start - key.offset);
@@ -721,20 +721,17 @@ fail:
*/
blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
{
+ struct btrfs_ordered_extent *ordered = bbio->ordered;
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct bio *bio = &bbio->bio;
- u64 offset = bbio->file_offset;
struct btrfs_ordered_sum *sums;
- struct btrfs_ordered_extent *ordered = NULL;
char *data;
struct bvec_iter iter;
struct bio_vec bvec;
int index;
unsigned int blockcount;
- unsigned long total_bytes = 0;
- unsigned long this_sum_bytes = 0;
int i;
unsigned nofs_flag;
@@ -749,61 +746,17 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
- sums->bytenr = bio->bi_iter.bi_sector << 9;
+ sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
index = 0;
shash->tfm = fs_info->csum_shash;
bio_for_each_segment(bvec, bio, iter) {
- if (!ordered) {
- ordered = btrfs_lookup_ordered_extent(inode, offset);
- /*
- * The bio range is not covered by any ordered extent,
- * must be a code logic error.
- */
- if (unlikely(!ordered)) {
- WARN(1, KERN_WARNING
- "no ordered extent for root %llu ino %llu offset %llu\n",
- inode->root->root_key.objectid,
- btrfs_ino(inode), offset);
- kvfree(sums);
- return BLK_STS_IOERR;
- }
- }
-
blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
bvec.bv_len + fs_info->sectorsize
- 1);
for (i = 0; i < blockcount; i++) {
- if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) &&
- !in_range(offset, ordered->file_offset,
- ordered->num_bytes)) {
- unsigned long bytes_left;
-
- sums->len = this_sum_bytes;
- this_sum_bytes = 0;
- btrfs_add_ordered_sum(ordered, sums);
- btrfs_put_ordered_extent(ordered);
-
- bytes_left = bio->bi_iter.bi_size - total_bytes;
-
- nofs_flag = memalloc_nofs_save();
- sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
- bytes_left), GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!sums)
- return BLK_STS_RESOURCE;
-
- sums->len = bytes_left;
- ordered = btrfs_lookup_ordered_extent(inode,
- offset);
- ASSERT(ordered); /* Logic error */
- sums->bytenr = (bio->bi_iter.bi_sector << 9)
- + total_bytes;
- index = 0;
- }
-
data = bvec_kmap_local(&bvec);
crypto_shash_digest(shash,
data + (i * fs_info->sectorsize),
@@ -811,15 +764,28 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
sums->sums + index);
kunmap_local(data);
index += fs_info->csum_size;
- offset += fs_info->sectorsize;
- this_sum_bytes += fs_info->sectorsize;
- total_bytes += fs_info->sectorsize;
}
}
- this_sum_bytes = 0;
+
+ bbio->sums = sums;
btrfs_add_ordered_sum(ordered, sums);
- btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
+/*
+ * Nodatasum I/O on zoned file systems still requires an btrfs_ordered_sum to
+ * record the updated logical address on Zone Append completion.
+ * Allocate just the structure with an empty sums array here for that case.
+ */
+blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
+{
+ bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS);
+ if (!bbio->sums)
+ return BLK_STS_RESOURCE;
+ bbio->sums->len = bbio->bio.bi_iter.bi_size;
+ bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ btrfs_add_ordered_sum(bbio->ordered, bbio->sums);
return 0;
}
@@ -1086,7 +1052,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
again:
next_offset = (u64)-1;
found_next = 0;
- bytenr = sums->bytenr + total_bytes;
+ bytenr = sums->logical + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 6be8725cd574..4ec669b69008 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -50,6 +50,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio);
+blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f649647392e0..392bc7d512a0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1651,7 +1651,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
struct file *file = iocb->ki_filp;
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written, num_sync;
- const bool sync = iocb_is_dsync(iocb);
/*
* If the fs flips readonly due to some impossible error, although we
@@ -1664,9 +1663,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
- if (sync)
- atomic_inc(&inode->sync_writers);
-
if (encoded) {
num_written = btrfs_encoded_write(iocb, from, encoded);
num_sync = encoded->len;
@@ -1686,9 +1682,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
num_written = num_sync;
}
- if (sync)
- atomic_dec(&inode->sync_writers);
-
current->backing_dev_info = NULL;
return num_written;
}
@@ -1733,9 +1726,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
* several segments of stripe length (currently 64K).
*/
blk_start_plug(&plug);
- atomic_inc(&BTRFS_I(inode)->sync_writers);
ret = btrfs_fdatawrite_range(inode, start, end);
- atomic_dec(&BTRFS_I(inode)->sync_writers);
blk_finish_plug(&plug);
return ret;
@@ -3709,7 +3700,8 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
+ FMODE_CAN_ODIRECT;
ret = fsverity_file_open(inode, filp);
if (ret)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cf98a3c05480..880800418075 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -292,25 +292,6 @@ out:
return ret;
}
-int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv)
-{
- u64 needed_bytes;
- int ret;
-
- /* 1 for slack space, 1 for updating the inode */
- needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
- btrfs_calc_metadata_size(fs_info, 1);
-
- spin_lock(&rsv->lock);
- if (rsv->reserved < needed_bytes)
- ret = -ENOSPC;
- else
- ret = 0;
- spin_unlock(&rsv->lock);
- return ret;
-}
-
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct inode *vfs_inode)
@@ -923,27 +904,31 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (!info->bitmap) {
+ const u64 offset = info->offset;
+ const u64 bytes = info->bytes;
+
unlink_free_space(ctl, info, true);
- ret = btrfs_add_free_space(block_group, info->offset,
- info->bytes);
+ spin_unlock(&ctl->tree_lock);
kmem_cache_free(btrfs_free_space_cachep, info);
+ ret = btrfs_add_free_space(block_group, offset, bytes);
+ spin_lock(&ctl->tree_lock);
} else {
u64 offset = info->offset;
u64 bytes = ctl->unit;
- while (search_bitmap(ctl, info, &offset, &bytes,
- false) == 0) {
+ ret = search_bitmap(ctl, info, &offset, &bytes, false);
+ if (ret == 0) {
+ bitmap_clear_bits(ctl, info, offset, bytes, true);
+ spin_unlock(&ctl->tree_lock);
ret = btrfs_add_free_space(block_group, offset,
bytes);
- if (ret)
- break;
- bitmap_clear_bits(ctl, info, offset, bytes, true);
- offset = info->offset;
- bytes = ctl->unit;
+ spin_lock(&ctl->tree_lock);
+ } else {
+ free_bitmap(ctl, info);
+ ret = 0;
}
- free_bitmap(ctl, info);
}
- cond_resched();
+ cond_resched_lock(&ctl->tree_lock);
}
return ret;
}
@@ -1037,7 +1022,9 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
block_group->bytes_super));
if (matched) {
+ spin_lock(&tmp_ctl.tree_lock);
ret = copy_free_space_cache(block_group, &tmp_ctl);
+ spin_unlock(&tmp_ctl.tree_lock);
/*
* ret == 1 means we successfully loaded the free space cache,
* so we need to re-set it here.
@@ -1596,20 +1583,34 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
return bitmap_start;
}
-static int tree_insert_offset(struct rb_root *root, u64 offset,
- struct rb_node *node, int bitmap)
+static int tree_insert_offset(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_cluster *cluster,
+ struct btrfs_free_space *new_entry)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_root *root;
+ struct rb_node **p;
struct rb_node *parent = NULL;
- struct btrfs_free_space *info;
+
+ lockdep_assert_held(&ctl->tree_lock);
+
+ if (cluster) {
+ lockdep_assert_held(&cluster->lock);
+ root = &cluster->root;
+ } else {
+ root = &ctl->free_space_offset;
+ }
+
+ p = &root->rb_node;
while (*p) {
+ struct btrfs_free_space *info;
+
parent = *p;
info = rb_entry(parent, struct btrfs_free_space, offset_index);
- if (offset < info->offset) {
+ if (new_entry->offset < info->offset) {
p = &(*p)->rb_left;
- } else if (offset > info->offset) {
+ } else if (new_entry->offset > info->offset) {
p = &(*p)->rb_right;
} else {
/*
@@ -1625,7 +1626,7 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
* found a bitmap, we want to go left, or before
* logically.
*/
- if (bitmap) {
+ if (new_entry->bitmap) {
if (info->bitmap) {
WARN_ON_ONCE(1);
return -EEXIST;
@@ -1641,8 +1642,8 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
}
}
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
+ rb_link_node(&new_entry->offset_index, parent, p);
+ rb_insert_color(&new_entry->offset_index, root);
return 0;
}
@@ -1705,6 +1706,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
struct rb_node *n = ctl->free_space_offset.rb_node;
struct btrfs_free_space *entry = NULL, *prev = NULL;
+ lockdep_assert_held(&ctl->tree_lock);
+
/* find entry that is closest to the 'offset' */
while (n) {
entry = rb_entry(n, struct btrfs_free_space, offset_index);
@@ -1814,6 +1817,8 @@ static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
bool update_stat)
{
+ lockdep_assert_held(&ctl->tree_lock);
+
rb_erase(&info->offset_index, &ctl->free_space_offset);
rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
ctl->free_extents--;
@@ -1832,9 +1837,10 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
{
int ret = 0;
+ lockdep_assert_held(&ctl->tree_lock);
+
ASSERT(info->bytes || info->bitmap);
- ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
- &info->offset_index, (info->bitmap != NULL));
+ ret = tree_insert_offset(ctl, NULL, info);
if (ret)
return ret;
@@ -1862,6 +1868,8 @@ static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
if (RB_EMPTY_NODE(&info->bytes_index))
return;
+ lockdep_assert_held(&ctl->tree_lock);
+
rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
}
@@ -2447,6 +2455,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
u64 offset = info->offset;
u64 bytes = info->bytes;
const bool is_trimmed = btrfs_free_space_trimmed(info);
+ struct rb_node *right_prev = NULL;
/*
* first we want to see if there is free space adjacent to the range we
@@ -2454,9 +2463,11 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
* cover the entire range
*/
right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
- if (right_info && rb_prev(&right_info->offset_index))
- left_info = rb_entry(rb_prev(&right_info->offset_index),
- struct btrfs_free_space, offset_index);
+ if (right_info)
+ right_prev = rb_prev(&right_info->offset_index);
+
+ if (right_prev)
+ left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index);
else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
@@ -2969,9 +2980,10 @@ static void __btrfs_return_cluster_to_free_space(
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry;
struct rb_node *node;
+ lockdep_assert_held(&ctl->tree_lock);
+
spin_lock(&cluster->lock);
if (cluster->block_group != block_group) {
spin_unlock(&cluster->lock);
@@ -2984,15 +2996,14 @@ static void __btrfs_return_cluster_to_free_space(
node = rb_first(&cluster->root);
while (node) {
- bool bitmap;
+ struct btrfs_free_space *entry;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
RB_CLEAR_NODE(&entry->offset_index);
- bitmap = (entry->bitmap != NULL);
- if (!bitmap) {
+ if (!entry->bitmap) {
/* Merging treats extents as if they were new */
if (!btrfs_free_space_trimmed(entry)) {
ctl->discardable_extents[BTRFS_STAT_CURR]--;
@@ -3010,8 +3021,7 @@ static void __btrfs_return_cluster_to_free_space(
entry->bytes;
}
}
- tree_insert_offset(&ctl->free_space_offset,
- entry->offset, &entry->offset_index, bitmap);
+ tree_insert_offset(ctl, NULL, entry);
rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
entry_less);
}
@@ -3324,6 +3334,8 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
unsigned long total_found = 0;
int ret;
+ lockdep_assert_held(&ctl->tree_lock);
+
i = offset_to_bit(entry->offset, ctl->unit,
max_t(u64, offset, entry->offset));
want_bits = bytes_to_bits(bytes, ctl->unit);
@@ -3385,8 +3397,7 @@ again:
*/
RB_CLEAR_NODE(&entry->bytes_index);
- ret = tree_insert_offset(&cluster->root, entry->offset,
- &entry->offset_index, 1);
+ ret = tree_insert_offset(ctl, cluster, entry);
ASSERT(!ret); /* -EEXIST; Logic error */
trace_btrfs_setup_cluster(block_group, cluster,
@@ -3414,6 +3425,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
u64 max_extent;
u64 total_size = 0;
+ lockdep_assert_held(&ctl->tree_lock);
+
entry = tree_search_offset(ctl, offset, 0, 1);
if (!entry)
return -ENOSPC;
@@ -3476,8 +3489,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
rb_erase(&entry->offset_index, &ctl->free_space_offset);
rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
- ret = tree_insert_offset(&cluster->root, entry->offset,
- &entry->offset_index, 0);
+ ret = tree_insert_offset(ctl, cluster, entry);
total_size += entry->bytes;
ASSERT(!ret); /* -EEXIST; Logic error */
} while (node && entry != last);
@@ -3671,7 +3683,7 @@ static int do_trimming(struct btrfs_block_group *block_group,
__btrfs_add_free_space(block_group, reserved_start,
start - reserved_start,
reserved_trim_state);
- if (start + bytes < reserved_start + reserved_bytes)
+ if (end < reserved_end)
__btrfs_add_free_space(block_group, end, reserved_end - end,
reserved_trim_state);
__btrfs_add_free_space(block_group, start, bytes, trim_state);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index a855e0483e03..33b4da3271b1 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -101,8 +101,6 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_block_group *block_group);
-int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct inode *inode);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index b21da1446f2a..045ddce32eca 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1280,7 +1280,10 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
goto abort;
btrfs_global_root_delete(free_space_root);
+
+ spin_lock(&fs_info->trans_lock);
list_del(&free_space_root->dirty_list);
+ spin_unlock(&fs_info->trans_lock);
btrfs_tree_lock(free_space_root->node);
btrfs_clear_buffer_dirty(trans, free_space_root->node);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 0d98fc5f6f44..203d2a267828 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -543,7 +543,6 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other two.
*/
struct btrfs_workqueue *workers;
- struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
struct workqueue_struct *endio_workers;
@@ -577,6 +576,7 @@ struct btrfs_fs_info {
s32 dirty_metadata_batch;
s32 delalloc_batch;
+ /* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;
struct btrfs_fs_devices *fs_devices;
@@ -643,7 +643,6 @@ struct btrfs_fs_info {
*/
refcount_t scrub_workers_refcnt;
struct workqueue_struct *scrub_workers;
- struct workqueue_struct *scrub_wr_completion_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -854,7 +853,7 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
- return fs_info->zone_size > 0;
+ return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0;
}
/*
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index b80aeb715701..ede43b6c6559 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -60,6 +60,22 @@ struct btrfs_truncate_control {
bool clear_extent_range;
};
+/*
+ * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
+ * separate u32s. These two functions convert between the two representations.
+ */
+static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
+{
+ return (flags | ((u64)ro_flags << 32));
+}
+
+static inline void btrfs_inode_split_flags(u64 inode_item_flags,
+ u32 *flags, u32 *ro_flags)
+{
+ *flags = (u32)inode_item_flags;
+ *ro_flags = (u32)(inode_item_flags >> 32);
+}
+
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_truncate_control *control);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7fcafcc5292c..dbbb67293e34 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,6 +70,7 @@
#include "verity.h"
#include "super.h"
#include "orphan.h"
+#include "backref.h"
struct btrfs_iget_args {
u64 ino;
@@ -100,6 +101,18 @@ struct btrfs_rename_ctx {
u64 index;
};
+/*
+ * Used by data_reloc_print_warning_inode() to pass needed info for filename
+ * resolution and output of error message.
+ */
+struct data_reloc_warn {
+ struct btrfs_path path;
+ struct btrfs_fs_info *fs_info;
+ u64 extent_item_size;
+ u64 logical;
+ int mirror_num;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -122,12 +135,198 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 ram_bytes, int compress_type,
int type);
+static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
+ u64 root, void *warn_ctx)
+{
+ struct data_reloc_warn *warn = warn_ctx;
+ struct btrfs_fs_info *fs_info = warn->fs_info;
+ struct extent_buffer *eb;
+ struct btrfs_inode_item *inode_item;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_root *local_root;
+ struct btrfs_key key;
+ unsigned int nofs_flag;
+ u32 nlink;
+ int ret;
+
+ local_root = btrfs_get_fs_root(fs_info, root, true);
+ if (IS_ERR(local_root)) {
+ ret = PTR_ERR(local_root);
+ goto err;
+ }
+
+ /* This makes the path point to (inum INODE_ITEM ioff). */
+ key.objectid = inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
+ if (ret) {
+ btrfs_put_root(local_root);
+ btrfs_release_path(&warn->path);
+ goto err;
+ }
+
+ eb = warn->path.nodes[0];
+ inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
+ nlink = btrfs_inode_nlink(eb, inode_item);
+ btrfs_release_path(&warn->path);
+
+ nofs_flag = memalloc_nofs_save();
+ ipath = init_ipath(4096, local_root, &warn->path);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(ipath)) {
+ btrfs_put_root(local_root);
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ /*
+ * -ENOMEM, not a critical error, just output an generic error
+ * without filename.
+ */
+ btrfs_warn(fs_info,
+"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
+ warn->logical, warn->mirror_num, root, inum, offset);
+ return ret;
+ }
+ ret = paths_from_inode(inum, ipath);
+ if (ret < 0)
+ goto err;
+
+ /*
+ * We deliberately ignore the bit ipath might have been too small to
+ * hold all of the paths here
+ */
+ for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
+ btrfs_warn(fs_info,
+"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
+ warn->logical, warn->mirror_num, root, inum, offset,
+ fs_info->sectorsize, nlink,
+ (char *)(unsigned long)ipath->fspath->val[i]);
+ }
+
+ btrfs_put_root(local_root);
+ free_ipath(ipath);
+ return 0;
+
+err:
+ btrfs_warn(fs_info,
+"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
+ warn->logical, warn->mirror_num, root, inum, offset, ret);
+
+ free_ipath(ipath);
+ return ret;
+}
+
+/*
+ * Do extra user-friendly error output (e.g. lookup all the affected files).
+ *
+ * Return true if we succeeded doing the backref lookup.
+ * Return false if such lookup failed, and has to fallback to the old error message.
+ */
+static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
+ const u8 *csum, const u8 *csum_expected,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_path path = { 0 };
+ struct btrfs_key found_key = { 0 };
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ const u32 csum_size = fs_info->csum_size;
+ u64 logical;
+ u64 flags;
+ u32 item_size;
+ int ret;
+
+ mutex_lock(&fs_info->reloc_mutex);
+ logical = btrfs_get_reloc_bg_bytenr(fs_info);
+ mutex_unlock(&fs_info->reloc_mutex);
+
+ if (logical == U64_MAX) {
+ btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
+ btrfs_warn_rl(fs_info,
+"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+ inode->root->root_key.objectid, btrfs_ino(inode), file_off,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
+ return;
+ }
+
+ logical += file_off;
+ btrfs_warn_rl(fs_info,
+"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), file_off, logical,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
+
+ ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
+ if (ret < 0) {
+ btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
+ logical, ret);
+ return;
+ }
+ eb = path.nodes[0];
+ ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
+ item_size = btrfs_item_size(eb, path.slots[0]);
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ unsigned long ptr = 0;
+ u64 ref_root;
+ u8 ref_level;
+
+ while (true) {
+ ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+ item_size, &ref_root,
+ &ref_level);
+ if (ret < 0) {
+ btrfs_warn_rl(fs_info,
+ "failed to resolve tree backref for logical %llu: %d",
+ logical, ret);
+ break;
+ }
+ if (ret > 0)
+ break;
+
+ btrfs_warn_rl(fs_info,
+"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
+ logical, mirror_num,
+ (ref_level ? "node" : "leaf"),
+ ref_level, ref_root);
+ }
+ btrfs_release_path(&path);
+ } else {
+ struct btrfs_backref_walk_ctx ctx = { 0 };
+ struct data_reloc_warn reloc_warn = { 0 };
+
+ btrfs_release_path(&path);
+
+ ctx.bytenr = found_key.objectid;
+ ctx.extent_item_pos = logical - found_key.objectid;
+ ctx.fs_info = fs_info;
+
+ reloc_warn.logical = logical;
+ reloc_warn.extent_item_size = found_key.offset;
+ reloc_warn.mirror_num = mirror_num;
+ reloc_warn.fs_info = fs_info;
+
+ iterate_extent_inodes(&ctx, true,
+ data_reloc_print_warning_inode, &reloc_warn);
+ }
+}
+
static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
struct btrfs_root *root = inode->root;
const u32 csum_size = root->fs_info->csum_size;
+ /* For data reloc tree, it's better to do a backref lookup instead. */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ return print_data_reloc_error(inode, logical_start, csum,
+ csum_expected, mirror_num);
+
/* Output without objectid, which is more meaningful */
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
btrfs_warn_rl(root->fs_info,
@@ -636,6 +835,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
{
struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
@@ -750,7 +950,7 @@ again:
/* Compression level is applied here and only here */
ret = btrfs_compress_pages(
compress_type | (fs_info->compress_level << 4),
- inode->vfs_inode.i_mapping, start,
+ mapping, start,
pages,
&nr_pages,
&total_in,
@@ -793,9 +993,9 @@ cont:
unsigned long clear_flags = EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING;
- unsigned long page_error_op;
- page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
+ if (ret < 0)
+ mapping_set_error(mapping, -EIO);
/*
* inline extent creation worked or returned error,
@@ -812,7 +1012,6 @@ cont:
clear_flags,
PAGE_UNLOCK |
PAGE_START_WRITEBACK |
- page_error_op |
PAGE_END_WRITEBACK);
/*
@@ -934,6 +1133,12 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
unsigned long nr_written = 0;
int page_started = 0;
int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .range_start = start,
+ .range_end = end,
+ .no_cgroup_owner = 1,
+ };
/*
* Call cow_file_range() to run the delalloc range directly, since we
@@ -954,8 +1159,6 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
const u64 page_start = page_offset(locked_page);
const u64 page_end = page_start + PAGE_SIZE - 1;
- btrfs_page_set_error(inode->root->fs_info, locked_page,
- page_start, PAGE_SIZE);
set_page_writeback(locked_page);
end_page_writeback(locked_page);
end_extent_writepage(locked_page, ret, page_start, page_end);
@@ -965,7 +1168,10 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
}
/* All pages will be unlocked, including @locked_page */
- return extent_write_locked_range(&inode->vfs_inode, start, end);
+ wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
+ ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc);
+ wbc_detach_inode(&wbc);
+ return ret;
}
static int submit_one_async_extent(struct btrfs_inode *inode,
@@ -976,6 +1182,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ordered_extent *ordered;
struct btrfs_key ins;
struct page *locked_page = NULL;
struct extent_map *em;
@@ -1037,7 +1244,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, start, /* file_offset */
+ ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */
async_extent->ram_size, /* num_bytes */
async_extent->ram_size, /* ram_bytes */
ins.objectid, /* disk_bytenr */
@@ -1045,8 +1252,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
0, /* offset */
1 << BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- if (ret) {
+ if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = PTR_ERR(ordered);
goto out_free_reserve;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1055,11 +1263,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
-
- btrfs_submit_compressed_write(inode, start, /* file_offset */
- async_extent->ram_size, /* num_bytes */
- ins.objectid, /* disk_bytenr */
- ins.offset, /* compressed_len */
+ btrfs_submit_compressed_write(ordered,
async_extent->pages, /* compressed_pages */
async_extent->nr_pages,
async_chunk->write_flags, true);
@@ -1074,12 +1278,13 @@ out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
+ mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ PAGE_END_WRITEBACK);
free_async_extent_pages(async_extent);
goto done;
}
@@ -1287,6 +1492,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
min_alloc_size = fs_info->sectorsize;
while (num_bytes > 0) {
+ struct btrfs_ordered_extent *ordered;
+
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
min_alloc_size, 0, alloc_hint,
@@ -1311,16 +1518,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
- ins.objectid, cur_alloc_size, 0,
- 1 << BTRFS_ORDERED_REGULAR,
- BTRFS_COMPRESS_NONE);
- if (ret)
+ ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
+ ram_size, ins.objectid, cur_alloc_size,
+ 0, 1 << BTRFS_ORDERED_REGULAR,
+ BTRFS_COMPRESS_NONE);
+ if (IS_ERR(ordered)) {
+ ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
+ }
if (btrfs_is_data_reloc_root(root)) {
- ret = btrfs_reloc_clone_csums(inode, start,
- cur_alloc_size);
+ ret = btrfs_reloc_clone_csums(ordered);
+
/*
* Only drop cache here, and process as normal.
*
@@ -1337,6 +1546,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
start + ram_size - 1,
false);
}
+ btrfs_put_ordered_extent(ordered);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1494,7 +1704,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
* always adjust ->async_delalloc_pages as its paired with the init
- * happening in cow_file_range_async
+ * happening in run_delalloc_compressed
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
@@ -1521,58 +1731,36 @@ static noinline void async_cow_free(struct btrfs_work *work)
kvfree(async_cow);
}
-static int cow_file_range_async(struct btrfs_inode *inode,
- struct writeback_control *wbc,
- struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written)
+static bool run_delalloc_compressed(struct btrfs_inode *inode,
+ struct writeback_control *wbc,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
struct async_cow *ctx;
struct async_chunk *async_chunk;
unsigned long nr_pages;
- u64 cur_end;
u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
int i;
- bool should_compress;
unsigned nofs_flag;
const blk_opf_t write_flags = wbc_to_write_flags(wbc);
- unlock_extent(&inode->io_tree, start, end, NULL);
-
- if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
- !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
- num_chunks = 1;
- should_compress = false;
- } else {
- should_compress = true;
- }
-
nofs_flag = memalloc_nofs_save();
ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
+ if (!ctx)
+ return false;
- if (!ctx) {
- unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING;
- unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK | PAGE_SET_ERROR;
-
- extent_clear_unlock_delalloc(inode, start, end, locked_page,
- clear_bits, page_ops);
- return -ENOMEM;
- }
+ unlock_extent(&inode->io_tree, start, end, NULL);
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
async_chunk = ctx->chunks;
atomic_set(&ctx->num_chunks, num_chunks);
for (i = 0; i < num_chunks; i++) {
- if (should_compress)
- cur_end = min(end, start + SZ_512K - 1);
- else
- cur_end = end;
+ u64 cur_end = min(end, start + SZ_512K - 1);
/*
* igrab is called higher up in the call chain, take only the
@@ -1633,13 +1821,14 @@ static int cow_file_range_async(struct btrfs_inode *inode,
start = cur_end + 1;
}
*page_started = 1;
- return 0;
+ return true;
}
static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
u64 end, int *page_started,
- unsigned long *nr_written)
+ unsigned long *nr_written,
+ struct writeback_control *wbc)
{
u64 done_offset = end;
int ret;
@@ -1671,8 +1860,8 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
account_page_redirty(locked_page);
}
locked_page_done = true;
- extent_write_locked_range(&inode->vfs_inode, start, done_offset);
-
+ extent_write_locked_range(&inode->vfs_inode, start, done_offset,
+ wbc);
start = done_offset + 1;
}
@@ -1947,6 +2136,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
nocow_args.writeback_path = true;
while (1) {
+ struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
@@ -1954,6 +2144,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
u64 ram_bytes;
u64 nocow_end;
int extent_type;
+ bool is_prealloc;
nocow = false;
@@ -2092,8 +2283,8 @@ out_check:
}
nocow_end = cur_offset + nocow_args.num_bytes - 1;
-
- if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
+ if (is_prealloc) {
u64 orig_start = found_key.offset - nocow_args.extent_offset;
struct extent_map *em;
@@ -2109,29 +2300,22 @@ out_check:
goto error;
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode,
- cur_offset, nocow_args.num_bytes,
- nocow_args.num_bytes,
- nocow_args.disk_bytenr,
- nocow_args.num_bytes, 0,
- 1 << BTRFS_ORDERED_PREALLOC,
- BTRFS_COMPRESS_NONE);
- if (ret) {
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
+ nocow_args.num_bytes, nocow_args.num_bytes,
+ nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
+ is_prealloc
+ ? (1 << BTRFS_ORDERED_PREALLOC)
+ : (1 << BTRFS_ORDERED_NOCOW),
+ BTRFS_COMPRESS_NONE);
+ if (IS_ERR(ordered)) {
+ if (is_prealloc) {
btrfs_drop_extent_map_range(inode, cur_offset,
nocow_end, false);
- goto error;
}
- } else {
- ret = btrfs_add_ordered_extent(inode, cur_offset,
- nocow_args.num_bytes,
- nocow_args.num_bytes,
- nocow_args.disk_bytenr,
- nocow_args.num_bytes,
- 0,
- 1 << BTRFS_ORDERED_NOCOW,
- BTRFS_COMPRESS_NONE);
- if (ret)
- goto error;
+ ret = PTR_ERR(ordered);
+ goto error;
}
if (nocow) {
@@ -2145,8 +2329,8 @@ out_check:
* extent_clear_unlock_delalloc() in error handler
* from freeing metadata of created ordered extent.
*/
- ret = btrfs_reloc_clone_csums(inode, cur_offset,
- nocow_args.num_bytes);
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
locked_page, EXTENT_LOCKED |
@@ -2214,7 +2398,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
- int ret;
+ int ret = 0;
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
/*
@@ -2235,19 +2419,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
- } else if (!btrfs_inode_can_compress(inode) ||
- !inode_need_compress(inode, start, end)) {
- if (zoned)
- ret = run_delalloc_zoned(inode, locked_page, start, end,
- page_started, nr_written);
- else
- ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1, NULL);
- } else {
- set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
- ret = cow_file_range_async(inode, wbc, locked_page, start, end,
- page_started, nr_written);
+ goto out;
}
+
+ if (btrfs_inode_can_compress(inode) &&
+ inode_need_compress(inode, start, end) &&
+ run_delalloc_compressed(inode, wbc, locked_page, start,
+ end, page_started, nr_written))
+ goto out;
+
+ if (zoned)
+ ret = run_delalloc_zoned(inode, locked_page, start, end,
+ page_started, nr_written, wbc);
+ else
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1, NULL);
+
+out:
ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
@@ -2515,125 +2703,42 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
}
}
-/*
- * Split off the first pre bytes from the extent_map at [start, start + len]
- *
- * This function is intended to be used only for extract_ordered_extent().
- */
-static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre)
-{
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- struct extent_map *split_pre = NULL;
- struct extent_map *split_mid = NULL;
- int ret = 0;
- unsigned long flags;
-
- ASSERT(pre != 0);
- ASSERT(pre < len);
-
- split_pre = alloc_extent_map();
- if (!split_pre)
- return -ENOMEM;
- split_mid = alloc_extent_map();
- if (!split_mid) {
- ret = -ENOMEM;
- goto out_free_pre;
- }
-
- lock_extent(&inode->io_tree, start, start + len - 1, NULL);
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em) {
- ret = -EIO;
- goto out_unlock;
- }
-
- ASSERT(em->len == len);
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
- ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
- ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
- ASSERT(!list_empty(&em->list));
-
- flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-
- /* First, replace the em with a new extent_map starting from * em->start */
- split_pre->start = em->start;
- split_pre->len = pre;
- split_pre->orig_start = split_pre->start;
- split_pre->block_start = em->block_start;
- split_pre->block_len = split_pre->len;
- split_pre->orig_block_len = split_pre->block_len;
- split_pre->ram_bytes = split_pre->len;
- split_pre->flags = flags;
- split_pre->compress_type = em->compress_type;
- split_pre->generation = em->generation;
-
- replace_extent_mapping(em_tree, em, split_pre, 1);
-
- /*
- * Now we only have an extent_map at:
- * [em->start, em->start + pre]
- */
-
- /* Insert the middle extent_map. */
- split_mid->start = em->start + pre;
- split_mid->len = em->len - pre;
- split_mid->orig_start = split_mid->start;
- split_mid->block_start = em->block_start + pre;
- split_mid->block_len = split_mid->len;
- split_mid->orig_block_len = split_mid->block_len;
- split_mid->ram_bytes = split_mid->len;
- split_mid->flags = flags;
- split_mid->compress_type = em->compress_type;
- split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, 1);
-
- /* Once for us */
- free_extent_map(em);
- /* Once for the tree */
- free_extent_map(em);
-
-out_unlock:
- write_unlock(&em_tree->lock);
- unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
- free_extent_map(split_mid);
-out_free_pre:
- free_extent_map(split_pre);
- return ret;
-}
-
-int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
- struct btrfs_ordered_extent *ordered)
+static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+ struct btrfs_ordered_extent *ordered)
{
u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
u64 len = bbio->bio.bi_iter.bi_size;
- struct btrfs_inode *inode = bbio->inode;
- u64 ordered_len = ordered->num_bytes;
- int ret = 0;
+ struct btrfs_ordered_extent *new;
+ int ret;
/* Must always be called for the beginning of an ordered extent. */
if (WARN_ON_ONCE(start != ordered->disk_bytenr))
return -EINVAL;
/* No need to split if the ordered extent covers the entire bio. */
- if (ordered->disk_num_bytes == len)
+ if (ordered->disk_num_bytes == len) {
+ refcount_inc(&ordered->refs);
+ bbio->ordered = ordered;
return 0;
-
- ret = btrfs_split_ordered_extent(ordered, len);
- if (ret)
- return ret;
+ }
/*
* Don't split the extent_map for NOCOW extents, as we're writing into
* a pre-existing one.
*/
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
- return 0;
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+ ret = split_extent_map(bbio->inode, bbio->file_offset,
+ ordered->num_bytes, len,
+ ordered->disk_bytenr);
+ if (ret)
+ return ret;
+ }
- return split_extent_map(inode, bbio->file_offset, ordered_len, len);
+ new = btrfs_split_ordered_extent(ordered, len);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ bbio->ordered = new;
+ return 0;
}
/*
@@ -2651,7 +2756,7 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
trans->adding_csums = true;
if (!csum_root)
csum_root = btrfs_csum_root(trans->fs_info,
- sum->bytenr);
+ sum->logical);
ret = btrfs_csum_file_blocks(trans, csum_root, sum);
trans->adding_csums = false;
if (ret)
@@ -2689,8 +2794,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
ret = set_extent_bit(&inode->io_tree, search_start,
search_start + em_len - 1,
- EXTENT_DELALLOC_NEW, cached_state,
- GFP_NOFS);
+ EXTENT_DELALLOC_NEW, cached_state);
next:
search_start = extent_map_end(em);
free_extent_map(em);
@@ -2723,8 +2827,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
return ret;
}
- return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
- cached_state);
+ return set_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC | extra_bits, cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2847,7 +2951,6 @@ out_page:
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
clear_page_dirty_for_io(page);
- SetPageError(page);
}
btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
unlock_page(page);
@@ -3068,7 +3171,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{
struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
struct btrfs_root *root = inode->root;
@@ -3103,15 +3206,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- /* A valid ->physical implies a write on a sequential zone. */
- if (ordered_extent->physical != (u64)-1) {
- btrfs_rewrite_logical_zoned(ordered_extent);
+ if (btrfs_is_zoned(fs_info))
btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes);
- } else if (btrfs_is_data_reloc_root(inode->root)) {
- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes);
- }
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -3279,6 +3376,14 @@ out:
return ret;
}
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
+{
+ if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ btrfs_finish_ordered_zoned(ordered);
+ return btrfs_finish_one_ordered(ordered);
+}
+
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate)
@@ -4226,7 +4331,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
}
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
- 0);
+ false);
ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
@@ -4801,7 +4906,7 @@ again:
if (only_release_metadata)
set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, NULL, GFP_NOFS);
+ EXTENT_NORESERVE, NULL);
out_unlock:
if (ret) {
@@ -7670,8 +7775,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
pos += submitted;
length -= submitted;
if (write)
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
- pos, length, false);
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ pos, length, false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
pos + length - 1, NULL);
@@ -7701,12 +7806,14 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
dip->file_offset, dip->bytes, bio->bi_status);
}
- if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset,
- dip->bytes, !bio->bi_status);
- else
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ btrfs_finish_ordered_extent(bbio->ordered, NULL,
+ dip->file_offset, dip->bytes,
+ !bio->bi_status);
+ } else {
unlock_extent(&inode->io_tree, dip->file_offset,
dip->file_offset + dip->bytes - 1, NULL);
+ }
bbio->bio.bi_private = bbio->private;
iomap_dio_bio_end_io(bio);
@@ -7742,7 +7849,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
if (ret) {
- btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+ bbio->bio.bi_status = errno_to_blk_status(ret);
+ btrfs_dio_end_io(bbio);
return;
}
}
@@ -8236,7 +8344,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
int ret;
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
- u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
+ const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(&inode->vfs_inode,
@@ -8293,7 +8401,15 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
- BUG_ON(ret);
+ /*
+ * We have reserved 2 metadata units when we started the transaction and
+ * min_size matches 1 unit, so this should never fail, but if it does,
+ * it's not critical we just fail truncation.
+ */
+ if (WARN_ON(ret)) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
trans->block_rsv = rsv;
@@ -8341,7 +8457,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
- BUG_ON(ret); /* shouldn't happen */
+ /*
+ * We have reserved 2 metadata units when we started the
+ * transaction and min_size matches 1 unit, so this should never
+ * fail, but if it does, it's not critical we just fail truncation.
+ */
+ if (WARN_ON(ret))
+ break;
+
trans->block_rsv = rsv;
}
@@ -8468,7 +8591,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->io_tree.inode = ei;
extent_io_tree_init(fs_info, &ei->file_extent_tree,
IO_TREE_INODE_FILE_EXTENT);
- atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -8639,7 +8761,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
inode_bytes = inode_get_bytes(inode);
spin_unlock(&BTRFS_I(inode)->lock);
stat->blocks = (ALIGN(inode_bytes, blocksize) +
- ALIGN(delalloc_bytes, blocksize)) >> 9;
+ ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
return 0;
}
@@ -8795,9 +8917,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
- BTRFS_I(old_inode), 1);
+ BTRFS_I(old_inode), true);
btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
- BTRFS_I(new_inode), 1);
+ BTRFS_I(new_inode), true);
}
/* src is a subvolume */
@@ -9063,7 +9185,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
- BTRFS_I(old_inode), 1);
+ BTRFS_I(old_inode), true);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
@@ -10170,6 +10292,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_changeset *data_reserved = NULL;
struct extent_state *cached_state = NULL;
+ struct btrfs_ordered_extent *ordered;
int compression;
size_t orig_count;
u64 start, end;
@@ -10346,14 +10469,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
+ ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
ins.objectid, ins.offset,
encoded->unencoded_offset,
(1 << BTRFS_ORDERED_ENCODED) |
(1 << BTRFS_ORDERED_COMPRESSED),
compression);
- if (ret) {
+ if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = PTR_ERR(ordered);
goto out_free_reserved;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -10365,8 +10489,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
btrfs_delalloc_release_extents(inode, num_bytes);
- btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
- ins.offset, pages, nr_pages, 0, false);
+ btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
ret = orig_count;
goto out;
@@ -10903,7 +11026,6 @@ static const struct address_space_operations btrfs_aops = {
.read_folio = btrfs_read_folio,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = noop_direct_IO,
.invalidate_folio = btrfs_invalidate_folio,
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2fa36f694daa..edbbd5cf23fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -649,6 +649,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
+ /* Tree log can't currently deal with an inode which is a new root. */
+ btrfs_set_log_full_commit(trans);
ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
if (ret)
@@ -757,10 +759,7 @@ out:
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv);
- if (ret)
- btrfs_end_transaction(trans);
- else
- ret = btrfs_commit_transaction(trans);
+ btrfs_end_transaction(trans);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
@@ -3113,6 +3112,13 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
struct btrfs_trans_handle *trans;
u64 transid;
+ /*
+ * Start orphan cleanup here for the given root in case it hasn't been
+ * started already by other means. Errors are handled in the other
+ * functions during transaction commit.
+ */
+ btrfs_orphan_cleanup(root);
+
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT)
@@ -3134,14 +3140,13 @@ out:
static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
void __user *argp)
{
- u64 transid;
+ /* By default wait for the current transaction. */
+ u64 transid = 0;
- if (argp) {
+ if (argp)
if (copy_from_user(&transid, argp, sizeof(transid)))
return -EFAULT;
- } else {
- transid = 0; /* current trans */
- }
+
return btrfs_wait_for_commit(fs_info, transid);
}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 3a496b0d3d2b..7979449a58d6 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -57,8 +57,8 @@
static struct btrfs_lockdep_keyset {
u64 id; /* root objectid */
- /* Longest entry: btrfs-free-space-00 */
- char names[BTRFS_MAX_LEVEL][20];
+ /* Longest entry: btrfs-block-group-00 */
+ char names[BTRFS_MAX_LEVEL][24];
struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
{ .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
@@ -72,6 +72,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
{ .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
+ { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
{ .id = 0, DEFINE_NAME("tree") },
};
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 3a095b9c6373..d3fcfc628a4f 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -88,9 +88,9 @@ struct list_head *lzo_alloc_workspace(unsigned int level)
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
- workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL);
- workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL);
+ workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN);
+ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
+ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 310a05cf95ef..23fc11af498a 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -252,14 +252,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt,
}
#endif
-#ifdef CONFIG_BTRFS_ASSERT
-void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line)
-{
- pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
- BUG();
-}
-#endif
-
void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
{
btrfs_err(fs_info,
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index ac2d1982ba3d..deedc1a168e2 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -4,14 +4,23 @@
#define BTRFS_MESSAGES_H
#include <linux/types.h>
+#include <linux/printk.h>
+#include <linux/bug.h>
struct btrfs_fs_info;
+/*
+ * We want to be able to override this in btrfs-progs.
+ */
+#ifdef __KERNEL__
+
static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
+#endif
+
#ifdef CONFIG_PRINTK
#define btrfs_printk(fs_info, fmt, args...) \
@@ -160,7 +169,11 @@ do { \
} while (0)
#ifdef CONFIG_BTRFS_ASSERT
-void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line);
+
+#define btrfs_assertfail(expr, file, line) ({ \
+ pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \
+ BUG(); \
+})
#define ASSERT(expr) \
(likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 768583a440e1..005751a12911 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -143,4 +143,24 @@ static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
return NULL;
}
+static inline bool bitmap_test_range_all_set(const unsigned long *addr,
+ unsigned long start,
+ unsigned long nbits)
+{
+ unsigned long found_zero;
+
+ found_zero = find_next_zero_bit(addr, start + nbits, start);
+ return (found_zero == start + nbits);
+}
+
+static inline bool bitmap_test_range_all_zero(const unsigned long *addr,
+ unsigned long start,
+ unsigned long nbits)
+{
+ unsigned long found_set;
+
+ found_set = find_next_bit(addr, start + nbits, start);
+ return (found_set == start + nbits);
+}
+
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a9778a91511e..a629532283bc 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -146,35 +146,11 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/*
- * Add an ordered extent to the per-inode tree.
- *
- * @inode: Inode that this extent is for.
- * @file_offset: Logical offset in file where the extent starts.
- * @num_bytes: Logical length of extent in file.
- * @ram_bytes: Full length of unencoded data.
- * @disk_bytenr: Offset of extent on disk.
- * @disk_num_bytes: Size of extent on disk.
- * @offset: Offset into unencoded data where file data starts.
- * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
- * @compress_type: Compression algorithm used for data.
- *
- * Most of these parameters correspond to &struct btrfs_file_extent_item. The
- * tree is given a single reference on the ordered extent that was inserted, and
- * the returned pointer is given a second reference.
- *
- * Return: the new ordered extent or error pointer.
- */
-struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
- struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type)
+static struct btrfs_ordered_extent *alloc_ordered_extent(
+ struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
+ u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
+ u64 offset, unsigned long flags, int compress_type)
{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
- struct rb_node *node;
struct btrfs_ordered_extent *entry;
int ret;
@@ -184,7 +160,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
if (ret < 0)
return ERR_PTR(ret);
- ret = 0;
} else {
/*
* The ordered extent has reserved qgroup space, release now
@@ -209,15 +184,7 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = ret;
- entry->physical = (u64)-1;
-
- ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
entry->flags = flags;
-
- percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
- fs_info->delalloc_batch);
-
- /* one ref for the tree */
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
@@ -226,15 +193,40 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
+ /*
+ * We don't need the count_max_extents here, we can assume that all of
+ * that work has been done at higher layers, so this is truly the
+ * smallest the extent is going to get.
+ */
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, 1);
+ spin_unlock(&inode->lock);
+
+ return entry;
+}
+
+static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *node;
+
trace_btrfs_ordered_extent_add(inode, entry);
+ percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
+ fs_info->delalloc_batch);
+
+ /* One ref for the tree. */
+ refcount_inc(&entry->refs);
+
spin_lock_irq(&tree->lock);
- node = tree_insert(&tree->tree, file_offset,
- &entry->rb_node);
+ node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
- file_offset);
+ entry->file_offset);
spin_unlock_irq(&tree->lock);
spin_lock(&root->ordered_extent_lock);
@@ -248,43 +240,43 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
spin_unlock(&fs_info->ordered_root_lock);
}
spin_unlock(&root->ordered_extent_lock);
-
- /*
- * We don't need the count_max_extents here, we can assume that all of
- * that work has been done at higher layers, so this is truly the
- * smallest the extent is going to get.
- */
- spin_lock(&inode->lock);
- btrfs_mod_outstanding_extents(inode, 1);
- spin_unlock(&inode->lock);
-
- /* One ref for the returned entry to match semantics of lookup. */
- refcount_inc(&entry->refs);
-
- return entry;
}
/*
- * Add a new btrfs_ordered_extent for the range, but drop the reference instead
- * of returning it to the caller.
+ * Add an ordered extent to the per-inode tree.
+ *
+ * @inode: Inode that this extent is for.
+ * @file_offset: Logical offset in file where the extent starts.
+ * @num_bytes: Logical length of extent in file.
+ * @ram_bytes: Full length of unencoded data.
+ * @disk_bytenr: Offset of extent on disk.
+ * @disk_num_bytes: Size of extent on disk.
+ * @offset: Offset into unencoded data where file data starts.
+ * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @compress_type: Compression algorithm used for data.
+ *
+ * Most of these parameters correspond to &struct btrfs_file_extent_item. The
+ * tree is given a single reference on the ordered extent that was inserted, and
+ * the returned pointer is given a second reference.
+ *
+ * Return: the new ordered extent or error pointer.
*/
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type)
+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
+ struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
+ int compress_type)
{
- struct btrfs_ordered_extent *ordered;
-
- ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes,
- ram_bytes, disk_bytenr,
- disk_num_bytes, offset, flags,
- compress_type);
+ struct btrfs_ordered_extent *entry;
- if (IS_ERR(ordered))
- return PTR_ERR(ordered);
- btrfs_put_ordered_extent(ordered);
+ ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
- return 0;
+ entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
+ disk_bytenr, disk_num_bytes, offset, flags,
+ compress_type);
+ if (!IS_ERR(entry))
+ insert_ordered_extent(entry);
+ return entry;
}
/*
@@ -311,6 +303,90 @@ static void finish_ordered_fn(struct btrfs_work *work)
btrfs_finish_ordered_io(ordered_extent);
}
+static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+ struct page *page, u64 file_offset,
+ u64 len, bool uptodate)
+{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ lockdep_assert_held(&inode->ordered_tree.lock);
+
+ if (page) {
+ ASSERT(page->mapping);
+ ASSERT(page_offset(page) <= file_offset);
+ ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);
+
+ /*
+ * Ordered (Private2) bit indicates whether we still have
+ * pending io unfinished for the ordered extent.
+ *
+ * If there's no such bit, we need to skip to next range.
+ */
+ if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
+ return false;
+ btrfs_page_clear_ordered(fs_info, page, file_offset, len);
+ }
+
+ /* Now we're fine to update the accounting. */
+ if (WARN_ON_ONCE(len > ordered->bytes_left)) {
+ btrfs_crit(fs_info,
+"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
+ inode->root->root_key.objectid, btrfs_ino(inode),
+ ordered->file_offset, ordered->num_bytes,
+ len, ordered->bytes_left);
+ ordered->bytes_left = 0;
+ } else {
+ ordered->bytes_left -= len;
+ }
+
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+
+ if (ordered->bytes_left)
+ return false;
+
+ /*
+ * All the IO of the ordered extent is finished, we need to queue
+ * the finish_func to be executed.
+ */
+ set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
+ cond_wake_up(&ordered->wait);
+ refcount_inc(&ordered->refs);
+ trace_btrfs_ordered_extent_mark_finished(inode, ordered);
+ return true;
+}
+
+static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
+ fs_info->endio_freespace_worker : fs_info->endio_write_workers;
+
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(wq, &ordered->work);
+}
+
+bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+ struct page *page, u64 file_offset, u64 len,
+ bool uptodate)
+{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ unsigned long flags;
+ bool ret;
+
+ trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
+
+ spin_lock_irqsave(&inode->ordered_tree.lock, flags);
+ ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
+ spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
+
+ if (ret)
+ btrfs_queue_ordered_fn(ordered);
+ return ret;
+}
+
/*
* Mark all ordered extents io inside the specified range finished.
*
@@ -329,22 +405,11 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
u64 num_bytes, bool uptodate)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_workqueue *wq;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
u64 cur = file_offset;
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- if (page)
- ASSERT(page->mapping && page_offset(page) <= file_offset &&
- file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
-
spin_lock_irqsave(&tree->lock, flags);
while (cur < file_offset + num_bytes) {
u64 entry_end;
@@ -397,50 +462,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
ASSERT(end + 1 - cur < U32_MAX);
len = end + 1 - cur;
- if (page) {
- /*
- * Ordered (Private2) bit indicates whether we still
- * have pending io unfinished for the ordered extent.
- *
- * If there's no such bit, we need to skip to next range.
- */
- if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
- cur += len;
- continue;
- }
- btrfs_page_clear_ordered(fs_info, page, cur, len);
- }
-
- /* Now we're fine to update the accounting */
- if (unlikely(len > entry->bytes_left)) {
- WARN_ON(1);
- btrfs_crit(fs_info,
-"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
- inode->root->root_key.objectid,
- btrfs_ino(inode),
- entry->file_offset,
- entry->num_bytes,
- len, entry->bytes_left);
- entry->bytes_left = 0;
- } else {
- entry->bytes_left -= len;
- }
-
- if (!uptodate)
- set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
-
- /*
- * All the IO of the ordered extent is finished, we need to queue
- * the finish_func to be executed.
- */
- if (entry->bytes_left == 0) {
- set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- cond_wake_up(&entry->wait);
- refcount_inc(&entry->refs);
- trace_btrfs_ordered_extent_mark_finished(inode, entry);
+ if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
spin_unlock_irqrestore(&tree->lock, flags);
- btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(wq, &entry->work);
+ btrfs_queue_ordered_fn(entry);
spin_lock_irqsave(&tree->lock, flags);
}
cur += len;
@@ -564,7 +588,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
- /* This is paired with btrfs_add_ordered_extent. */
+ /* This is paired with btrfs_alloc_ordered_extent. */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
@@ -1117,17 +1141,22 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
}
/* Split out a new ordered extent for this first @len bytes of @ordered. */
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
+struct btrfs_ordered_extent *btrfs_split_ordered_extent(
+ struct btrfs_ordered_extent *ordered, u64 len)
{
- struct inode *inode = ordered->inode;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 file_offset = ordered->file_offset;
u64 disk_bytenr = ordered->disk_bytenr;
- unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
+ unsigned long flags = ordered->flags;
+ struct btrfs_ordered_sum *sum, *tmpsum;
+ struct btrfs_ordered_extent *new;
struct rb_node *node;
+ u64 offset = 0;
- trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+ trace_btrfs_ordered_extent_split(inode, ordered);
ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
@@ -1136,18 +1165,27 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
* reduce the original extent to a zero length either.
*/
if (WARN_ON_ONCE(len >= ordered->num_bytes))
- return -EINVAL;
- /* We cannot split once ordered extent is past end_bio. */
- if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
+ /* We cannot split partially completed ordered extents. */
+ if (ordered->bytes_left) {
+ ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
+ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
+ return ERR_PTR(-EINVAL);
+ }
/* We cannot split a compressed ordered extent. */
if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
- return -EINVAL;
- /* Checksum list should be empty. */
- if (WARN_ON_ONCE(!list_empty(&ordered->list)))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- spin_lock_irq(&tree->lock);
+ new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
+ len, 0, flags, ordered->compress_type);
+ if (IS_ERR(new))
+ return new;
+
+ /* One ref for the tree. */
+ refcount_inc(&new->refs);
+
+ spin_lock_irq(&root->ordered_extent_lock);
+ spin_lock(&tree->lock);
/* Remove from tree once */
node = &ordered->rb_node;
rb_erase(node, &tree->tree);
@@ -1159,26 +1197,48 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
ordered->disk_bytenr += len;
ordered->num_bytes -= len;
ordered->disk_num_bytes -= len;
- ordered->bytes_left -= len;
+
+ if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
+ ASSERT(ordered->bytes_left == 0);
+ new->bytes_left = 0;
+ } else {
+ ordered->bytes_left -= len;
+ }
+
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
+ if (ordered->truncated_len > len) {
+ ordered->truncated_len -= len;
+ } else {
+ new->truncated_len = ordered->truncated_len;
+ ordered->truncated_len = 0;
+ }
+ }
+
+ list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
+ if (offset == len)
+ break;
+ list_move_tail(&sum->list, &new->list);
+ offset += sum->len;
+ }
/* Re-insert the node */
node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"zoned: inconsistency in ordered tree at offset %llu",
- ordered->file_offset);
+ ordered->file_offset);
- spin_unlock_irq(&tree->lock);
-
- /*
- * The splitting extent is already counted and will be added again in
- * btrfs_add_ordered_extent(). Subtract len to avoid double counting.
- */
- percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch);
+ node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
+ if (node)
+ btrfs_panic(fs_info, -EEXIST,
+ "zoned: inconsistency in ordered tree at offset %llu",
+ new->file_offset);
+ spin_unlock(&tree->lock);
- return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
- disk_bytenr, len, 0, flags,
- ordered->compress_type);
+ list_add_tail(&new->root_extent_list, &root->ordered_extents);
+ root->nr_ordered_extents++;
+ spin_unlock_irq(&root->ordered_extent_lock);
+ return new;
}
int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f0f1138d23c3..173bd5c5df26 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -14,13 +14,13 @@ struct btrfs_ordered_inode_tree {
};
struct btrfs_ordered_sum {
- /* bytenr is the start of this extent on disk */
- u64 bytenr;
-
/*
- * this is the length in bytes covered by the sums array below.
+ * Logical start address and length for of the blocks covered by
+ * the sums array.
*/
- int len;
+ u64 logical;
+ u32 len;
+
struct list_head list;
/* last field is a variable length array of csums */
u8 sums[];
@@ -151,12 +151,6 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
-
- /*
- * Used to reverse-map physical address returned from ZONE_APPEND write
- * command in a workqueue context
- */
- u64 physical;
};
static inline void
@@ -167,11 +161,15 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
t->last = NULL;
}
+int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
+bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+ struct page *page, u64 file_offset, u64 len,
+ bool uptodate);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
u64 num_bytes, bool uptodate);
@@ -183,10 +181,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
u64 disk_num_bytes, u64 offset, unsigned long flags,
int compress_type);
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
@@ -212,7 +206,8 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
struct extent_state **cached_state);
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len);
+struct btrfs_ordered_extent *btrfs_split_ordered_extent(
+ struct btrfs_ordered_extent *ordered, u64 len);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 497b9dbd8a13..aa06d9ca911d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -49,7 +49,7 @@ const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
return buf;
}
-static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+static void print_chunk(const struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
int i;
@@ -62,7 +62,7 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
btrfs_stripe_offset_nr(eb, chunk, i));
}
}
-static void print_dev_item(struct extent_buffer *eb,
+static void print_dev_item(const struct extent_buffer *eb,
struct btrfs_dev_item *dev_item)
{
pr_info("\t\tdev item devid %llu total_bytes %llu bytes used %llu\n",
@@ -70,7 +70,7 @@ static void print_dev_item(struct extent_buffer *eb,
btrfs_device_total_bytes(eb, dev_item),
btrfs_device_bytes_used(eb, dev_item));
}
-static void print_extent_data_ref(struct extent_buffer *eb,
+static void print_extent_data_ref(const struct extent_buffer *eb,
struct btrfs_extent_data_ref *ref)
{
pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n",
@@ -80,7 +80,7 @@ static void print_extent_data_ref(struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
-static void print_extent_item(struct extent_buffer *eb, int slot, int type)
+static void print_extent_item(const struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -169,7 +169,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
WARN_ON(ptr > end);
}
-static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
+static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
u32 item_size)
{
if (!IS_ALIGNED(item_size, sizeof(u64))) {
@@ -191,7 +191,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
* Helper to output refs and locking status of extent buffer. Useful to debug
* race condition related problems.
*/
-static void print_eb_refs_lock(struct extent_buffer *eb)
+static void print_eb_refs_lock(const struct extent_buffer *eb)
{
#ifdef CONFIG_BTRFS_DEBUG
btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
@@ -199,7 +199,7 @@ static void print_eb_refs_lock(struct extent_buffer *eb)
#endif
}
-void btrfs_print_leaf(struct extent_buffer *l)
+void btrfs_print_leaf(const struct extent_buffer *l)
{
struct btrfs_fs_info *fs_info;
int i;
@@ -355,7 +355,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
}
}
-void btrfs_print_tree(struct extent_buffer *c, bool follow)
+void btrfs_print_tree(const struct extent_buffer *c, bool follow)
{
struct btrfs_fs_info *fs_info;
int i; u32 nr;
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 8c3e9319ec4e..c42bc666d5ee 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -9,8 +9,8 @@
/* Buffer size to contain tree name and possibly additional data (offset) */
#define BTRFS_ROOT_NAME_BUF_LEN 48
-void btrfs_print_leaf(struct extent_buffer *l);
-void btrfs_print_tree(struct extent_buffer *c, bool follow);
+void btrfs_print_leaf(const struct extent_buffer *l);
+void btrfs_print_tree(const struct extent_buffer *c, bool follow);
const char *btrfs_root_name(const struct btrfs_key *key, char *buf);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f41da7ac360d..da1f84a0eb29 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1232,12 +1232,23 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
int ret = 0;
/*
- * We need to have subvol_sem write locked, to prevent races between
- * concurrent tasks trying to disable quotas, because we will unlock
- * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ * We need to have subvol_sem write locked to prevent races with
+ * snapshot creation.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
+ /*
+ * Lock the cleaner mutex to prevent races with concurrent relocation,
+ * because relocation may be building backrefs for blocks of the quota
+ * root while we are deleting the root. This is like dropping fs roots
+ * of deleted snapshots/subvolumes, we need the same protection.
+ *
+ * This also prevents races between concurrent tasks trying to disable
+ * quotas, because we will unlock and relock qgroup_ioctl_lock across
+ * BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ mutex_lock(&fs_info->cleaner_mutex);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
@@ -1301,7 +1312,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
goto out;
}
+ spin_lock(&fs_info->trans_lock);
list_del(&quota_root->dirty_list);
+ spin_unlock(&fs_info->trans_lock);
btrfs_tree_lock(quota_root->node);
btrfs_clear_buffer_dirty(trans, quota_root->node);
@@ -1317,6 +1330,7 @@ out:
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
+ mutex_unlock(&fs_info->cleaner_mutex);
return ret;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 2fab37f062de..f37b925d587f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1079,7 +1079,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
/* see if we can add this page onto our existing bio */
if (last) {
- u64 last_end = last->bi_iter.bi_sector << 9;
+ u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
last_end += last->bi_iter.bi_size;
/*
@@ -1099,7 +1099,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
bio = bio_alloc(stripe->dev->bdev,
max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
op, GFP_NOFS);
- bio->bi_iter.bi_sector = disk_start >> 9;
+ bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
bio->bi_private = rbio;
__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
@@ -2747,3 +2747,48 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
if (!lock_stripe_add(rbio))
start_async_work(rbio, scrub_rbio_work_locked);
}
+
+/*
+ * This is for scrub call sites where we already have correct data contents.
+ * This allows us to avoid reading data stripes again.
+ *
+ * Unfortunately here we have to do page copy, other than reusing the pages.
+ * This is due to the fact rbio has its own page management for its cache.
+ */
+void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
+ struct page **data_pages, u64 data_logical)
+{
+ const u64 offset_in_full_stripe = data_logical -
+ rbio->bioc->full_stripe_logical;
+ const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int ret;
+
+ /*
+ * If we hit ENOMEM temporarily, but later at
+ * raid56_parity_submit_scrub_rbio() time it succeeded, we just do
+ * the extra read, not a big deal.
+ *
+ * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
+ * the bio would got proper error number set.
+ */
+ ret = alloc_rbio_data_pages(rbio);
+ if (ret < 0)
+ return;
+
+ /* data_logical must be at stripe boundary and inside the full stripe. */
+ ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
+ ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
+
+ for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
+ struct page *dst = rbio->stripe_pages[page_nr + page_index];
+ struct page *src = data_pages[page_nr];
+
+ memcpy_page(dst, 0, src, 0, PAGE_SIZE);
+ for (int sector_nr = sectors_per_page * page_index;
+ sector_nr < sectors_per_page * (page_index + 1);
+ sector_nr++)
+ rbio->stripe_sectors[sector_nr].uptodate = true;
+ }
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 0f7f31c8cb98..0e84c9c9293f 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -193,6 +193,9 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
+ struct page **data_pages, u64 data_logical);
+
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59a06499c647..25a3361caedc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -174,8 +174,8 @@ static void mark_block_processed(struct reloc_control *rc,
in_range(node->bytenr, rc->block_group->start,
rc->block_group->length)) {
blocksize = rc->extent_root->fs_info->nodesize;
- set_extent_bits(&rc->processed_blocks, node->bytenr,
- node->bytenr + blocksize - 1, EXTENT_DIRTY);
+ set_extent_bit(&rc->processed_blocks, node->bytenr,
+ node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
}
node->processed = 1;
}
@@ -3051,9 +3051,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
u64 boundary_end = boundary_start +
fs_info->sectorsize - 1;
- set_extent_bits(&BTRFS_I(inode)->io_tree,
- boundary_start, boundary_end,
- EXTENT_BOUNDARY);
+ set_extent_bit(&BTRFS_I(inode)->io_tree,
+ boundary_start, boundary_end,
+ EXTENT_BOUNDARY, NULL);
}
unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
&cached_state);
@@ -4342,29 +4342,25 @@ out:
* cloning checksum properly handles the nodatasum extents.
* it also saves CPU time to re-calculate the checksum.
*/
-int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
+int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_root *csum_root;
- struct btrfs_ordered_sum *sums;
- struct btrfs_ordered_extent *ordered;
- int ret;
- u64 disk_bytenr;
- u64 new_bytenr;
+ u64 disk_bytenr = ordered->file_offset + inode->index_cnt;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
LIST_HEAD(list);
+ int ret;
- ordered = btrfs_lookup_ordered_extent(inode, file_pos);
- BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
-
- disk_bytenr = file_pos + inode->index_cnt;
- csum_root = btrfs_csum_root(fs_info, disk_bytenr);
ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list, 0, false);
+ disk_bytenr + ordered->num_bytes - 1,
+ &list, 0, false);
if (ret)
- goto out;
+ return ret;
while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ struct btrfs_ordered_sum *sums =
+ list_entry(list.next, struct btrfs_ordered_sum, list);
+
list_del_init(&sums->list);
/*
@@ -4379,14 +4375,11 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
* disk_len vs real len like with real inodes since it's all
* disk length.
*/
- new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
- sums->bytenr = new_bytenr;
-
+ sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
btrfs_add_ordered_sum(ordered, sums);
}
-out:
- btrfs_put_ordered_extent(ordered);
- return ret;
+
+ return 0;
}
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
@@ -4523,3 +4516,19 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
ret = clone_backref_node(trans, rc, root, reloc_root);
return ret;
}
+
+/*
+ * Get the current bytenr for the block group which is being relocated.
+ *
+ * Return U64_MAX if no running relocation.
+ */
+u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info)
+{
+ u64 logical = U64_MAX;
+
+ lockdep_assert_held(&fs_info->reloc_mutex);
+
+ if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
+ logical = fs_info->reloc_ctl->block_group->start;
+ return logical;
+}
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 2041a86186de..77d69f6ae967 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -8,7 +8,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *r
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
-int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
@@ -19,5 +19,6 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
+u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 16c228344cbb..4cae41bd6de0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -177,7 +177,6 @@ struct scrub_ctx {
struct btrfs_fs_info *fs_info;
int first_free;
int cur_stripe;
- struct list_head csum_list;
atomic_t cancel_req;
int readonly;
int sectors_per_bio;
@@ -309,17 +308,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
scrub_pause_off(fs_info);
}
-static void scrub_free_csums(struct scrub_ctx *sctx)
-{
- while (!list_empty(&sctx->csum_list)) {
- struct btrfs_ordered_sum *sum;
- sum = list_first_entry(&sctx->csum_list,
- struct btrfs_ordered_sum, list);
- list_del(&sum->list);
- kfree(sum);
- }
-}
-
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
int i;
@@ -330,7 +318,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
release_scrub_stripe(&sctx->stripes[i]);
- scrub_free_csums(sctx);
kfree(sctx);
}
@@ -352,7 +339,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->fs_info = fs_info;
- INIT_LIST_HEAD(&sctx->csum_list);
for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
int ret;
@@ -479,11 +465,8 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct scrub_warning swarn;
- unsigned long ptr = 0;
u64 flags = 0;
- u64 ref_root;
u32 item_size;
- u8 ref_level = 0;
int ret;
/* Super block error, no need to search extent tree. */
@@ -513,19 +496,28 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
item_size = btrfs_item_size(eb, path->slots[0]);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- do {
+ unsigned long ptr = 0;
+ u8 ref_level;
+ u64 ref_root;
+
+ while (true) {
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level);
+ if (ret < 0) {
+ btrfs_warn(fs_info,
+ "failed to resolve tree backref for logical %llu: %d",
+ swarn.logical, ret);
+ break;
+ }
+ if (ret > 0)
+ break;
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
- errstr, swarn.logical,
- btrfs_dev_name(dev),
- swarn.physical,
- ref_level ? "node" : "leaf",
- ret < 0 ? -1 : ref_level,
- ret < 0 ? -1 : ref_root);
- } while (ret != 1);
+ errstr, swarn.logical, btrfs_dev_name(dev),
+ swarn.physical, (ref_level ? "node" : "leaf"),
+ ref_level, ref_root);
+ }
btrfs_release_path(path);
} else {
struct btrfs_backref_walk_ctx ctx = { 0 };
@@ -546,48 +538,6 @@ out:
btrfs_free_path(path);
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
-{
- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
- return 2;
- else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
- return 3;
- else
- return (int)bioc->num_stripes;
-}
-
-static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
- u64 full_stripe_logical,
- int nstripes, int mirror,
- int *stripe_index,
- u64 *stripe_offset)
-{
- int i;
-
- if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
- nstripes - 1 : nstripes - 2;
-
- /* RAID5/6 */
- for (i = 0; i < nr_data_stripes; i++) {
- const u64 data_stripe_start = full_stripe_logical +
- (i * BTRFS_STRIPE_LEN);
-
- if (logical >= data_stripe_start &&
- logical < data_stripe_start + BTRFS_STRIPE_LEN)
- break;
- }
-
- *stripe_index = i;
- *stripe_offset = (logical - full_stripe_logical) &
- BTRFS_STRIPE_LEN_MASK;
- } else {
- /* The other RAID type */
- *stripe_index = mirror;
- *stripe_offset = 0;
- }
-}
-
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
int ret = 0;
@@ -924,8 +874,9 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
/* For scrub, our mirror_num should always start at 1. */
ASSERT(stripe->mirror_num >= 1);
- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- stripe->logical, &mapped_len, &bioc);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ stripe->logical, &mapped_len, &bioc,
+ NULL, NULL, 1);
/*
* If we failed, dev will be NULL, and later detailed reports
* will just be skipped.
@@ -1957,8 +1908,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
bio->bi_end_io = raid56_scrub_wait_endio;
btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
- &length, &bioc);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
+ &length, &bioc, NULL, NULL, 1);
if (ret < 0) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
@@ -1972,6 +1923,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
btrfs_bio_counter_dec(fs_info);
goto out;
}
+ /* Use the recovered stripes as cache to avoid read them from disk again. */
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];
+
+ raid56_parity_cache_data_pages(rbio, stripe->pages,
+ full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
+ }
raid56_parity_submit_scrub_rbio(rbio);
wait_for_completion_io(&io_done);
ret = blk_status_to_errno(bio->bi_status);
@@ -2740,17 +2698,12 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
&fs_info->scrub_lock)) {
struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
- struct workqueue_struct *scrub_wr_comp =
- fs_info->scrub_wr_completion_workers;
fs_info->scrub_workers = NULL;
- fs_info->scrub_wr_completion_workers = NULL;
mutex_unlock(&fs_info->scrub_lock);
if (scrub_workers)
destroy_workqueue(scrub_workers);
- if (scrub_wr_comp)
- destroy_workqueue(scrub_wr_comp);
}
}
@@ -2761,7 +2714,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
struct workqueue_struct *scrub_workers = NULL;
- struct workqueue_struct *scrub_wr_comp = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
int ret = -ENOMEM;
@@ -2769,21 +2721,17 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
return 0;
- scrub_workers = alloc_workqueue("btrfs-scrub", flags,
- is_dev_replace ? 1 : max_active);
+ if (is_dev_replace)
+ scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags);
+ else
+ scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
if (!scrub_workers)
- goto fail_scrub_workers;
-
- scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
- if (!scrub_wr_comp)
- goto fail_scrub_wr_completion_workers;
+ return -ENOMEM;
mutex_lock(&fs_info->scrub_lock);
if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
- ASSERT(fs_info->scrub_workers == NULL &&
- fs_info->scrub_wr_completion_workers == NULL);
+ ASSERT(fs_info->scrub_workers == NULL);
fs_info->scrub_workers = scrub_workers;
- fs_info->scrub_wr_completion_workers = scrub_wr_comp;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
mutex_unlock(&fs_info->scrub_lock);
return 0;
@@ -2794,10 +2742,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
ret = 0;
- destroy_workqueue(scrub_wr_comp);
-fail_scrub_wr_completion_workers:
destroy_workqueue(scrub_workers);
-fail_scrub_workers:
return ret;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index af2e153543a5..8bfd44750efe 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1774,9 +1774,21 @@ static int read_symlink(struct btrfs_root *root,
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
type = btrfs_file_extent_type(path->nodes[0], ei);
+ if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) {
+ ret = -EUCLEAN;
+ btrfs_crit(root->fs_info,
+"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
+ ino, btrfs_root_id(root), type);
+ goto out;
+ }
compression = btrfs_file_extent_compression(path->nodes[0], ei);
- BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
- BUG_ON(compression);
+ if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
+ ret = -EUCLEAN;
+ btrfs_crit(root->fs_info,
+"send: found symlink extent with compression, ino %llu root %llu compression type %d",
+ ino, btrfs_root_id(root), compression);
+ goto out;
+ }
off = btrfs_file_extent_inline_start(ei);
len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index dd46b978ac2c..1b999c6e4193 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -100,9 +100,6 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector
subpage_info->uptodate_offset = cur;
cur += nr_bits;
- subpage_info->error_offset = cur;
- cur += nr_bits;
-
subpage_info->dirty_offset = cur;
cur += nr_bits;
@@ -367,28 +364,6 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
unlock_page(page);
}
-static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
- unsigned int nbits)
-{
- unsigned int found_zero;
-
- found_zero = find_next_zero_bit(addr, start + nbits, start);
- if (found_zero == start + nbits)
- return true;
- return false;
-}
-
-static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
- unsigned int nbits)
-{
- unsigned int found_set;
-
- found_set = find_next_bit(addr, start + nbits, start);
- if (found_set == start + nbits)
- return true;
- return false;
-}
-
#define subpage_calc_start_bit(fs_info, page, name, start, len) \
({ \
unsigned int start_bit; \
@@ -438,35 +413,6 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
spin_unlock_irqrestore(&subpage->lock, flags);
}
-void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
- error, start, len);
- unsigned long flags;
-
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- SetPageError(page);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
-void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
- error, start, len);
- unsigned long flags;
-
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
- ClearPageError(page);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
@@ -628,7 +574,6 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
return ret; \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
-IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
@@ -696,7 +641,6 @@ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
PageUptodate);
-IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
@@ -767,3 +711,44 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
/* Have writers, use proper subpage helper to end it */
btrfs_page_end_writer_lock(fs_info, page, start, len);
}
+
+#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
+ bitmap_cut(dst, subpage->bitmaps, 0, \
+ subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
+
+void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
+ struct btrfs_subpage *subpage;
+ unsigned long uptodate_bitmap;
+ unsigned long error_bitmap;
+ unsigned long dirty_bitmap;
+ unsigned long writeback_bitmap;
+ unsigned long ordered_bitmap;
+ unsigned long checked_bitmap;
+ unsigned long flags;
+
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(subpage_info);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+
+ dump_page(page, "btrfs subpage dump");
+ btrfs_warn(fs_info,
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+ start, len, page_offset(page),
+ subpage_info->bitmap_nr_bits, &uptodate_bitmap,
+ subpage_info->bitmap_nr_bits, &error_bitmap,
+ subpage_info->bitmap_nr_bits, &dirty_bitmap,
+ subpage_info->bitmap_nr_bits, &writeback_bitmap,
+ subpage_info->bitmap_nr_bits, &ordered_bitmap,
+ subpage_info->bitmap_nr_bits, &checked_bitmap);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 0e80ad336904..5cbf67ccbdeb 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -8,17 +8,17 @@
/*
* Extra info for subpapge bitmap.
*
- * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into
+ * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into
* one larger bitmap.
*
* This structure records how they are organized in the bitmap:
*
- * /- uptodate_offset /- error_offset /- dirty_offset
+ * /- uptodate_offset /- dirty_offset /- ordered_offset
* | | |
* v v v
- * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o|
+ * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o|
* |<- bitmap_nr_bits ->|
- * |<--------------- total_nr_bits ---------------->|
+ * |<----------------- total_nr_bits ------------------>|
*/
struct btrfs_subpage_info {
/* Number of bits for each bitmap */
@@ -32,7 +32,6 @@ struct btrfs_subpage_info {
* @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
*/
unsigned int uptodate_offset;
- unsigned int error_offset;
unsigned int dirty_offset;
unsigned int writeback_offset;
unsigned int ordered_offset;
@@ -141,7 +140,6 @@ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
-DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
@@ -154,5 +152,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct page *page);
void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
u64 start, u32 len);
+void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index efeb1a9d040a..8b1c1225245e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1631,7 +1631,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
old_pool_size, new_pool_size);
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
workqueue_set_max_active(fs_info->endio_workers, new_pool_size);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index dfc5c7fa6038..f6bc6d738555 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -159,7 +159,7 @@ static int test_find_delalloc(u32 sectorsize)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
+ set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
start = 0;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
@@ -190,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("couldn't find the locked page");
goto out_bits;
}
- set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
+ set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
@@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
+ set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
@@ -503,8 +503,8 @@ static int test_find_first_clear_extent_bit(void)
* Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
* 4M-32M
*/
- set_extent_bits(&tree, SZ_1M, SZ_4M - 1,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ set_extent_bit(&tree, SZ_1M, SZ_4M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
@@ -516,8 +516,8 @@ static int test_find_first_clear_extent_bit(void)
}
/* Now add 32M-64M so that we have a hole between 4M-32M */
- set_extent_bits(&tree, SZ_32M, SZ_64M - 1,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ set_extent_bit(&tree, SZ_32M, SZ_64M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
/*
* Request first hole starting at 12M, we should get 4M-32M
@@ -548,7 +548,7 @@ static int test_find_first_clear_extent_bit(void)
* Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag
* being unset in this range, we should get the entry in range 64M-72M
*/
- set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED);
+ set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL);
find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
CHUNK_TRIMMED);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8b6a99b8d7f6..cf306351b148 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -374,8 +374,6 @@ loop:
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
- INIT_LIST_HEAD(&cur_trans->releasing_ebs);
- spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES);
@@ -1056,7 +1054,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
bool wait_writeback = false;
@@ -1092,7 +1089,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
cond_resched();
start = end + 1;
}
- atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
return werr;
}
@@ -1688,7 +1684,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert the directory item
*/
ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
@@ -2484,13 +2483,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto scrub_continue;
}
- /*
- * At this point, we should have written all the tree blocks allocated
- * in this transaction. So it's now safe to free the redirtyied extent
- * buffers.
- */
- btrfs_free_redirty_list(cur_trans);
-
ret = write_all_supers(fs_info, 0);
/*
* the super is written, we can safely allow the tree-loggers
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fa728ab80826..8e9fa23bd7fe 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -94,9 +94,6 @@ struct btrfs_transaction {
*/
atomic_t pending_ordered;
wait_queue_head_t pending_wait;
-
- spinlock_t releasing_ebs_lock;
- struct list_head releasing_ebs;
};
enum {
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 2138e9fc0564..038dfa8f1788 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -25,10 +25,10 @@
#include "compression.h"
#include "volumes.h"
#include "misc.h"
-#include "btrfs_inode.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
+#include "inode-item.h"
/*
* Error message should follow the following format:
@@ -1620,9 +1620,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
/*
* Common point to switch the item-specific validation.
*/
-static int check_leaf_item(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot,
- struct btrfs_key *prev_key)
+static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
+ struct btrfs_key *key,
+ int slot,
+ struct btrfs_key *prev_key)
{
int ret = 0;
struct btrfs_chunk *chunk;
@@ -1671,10 +1672,13 @@ static int check_leaf_item(struct extent_buffer *leaf,
ret = check_extent_data_ref(leaf, key, slot);
break;
}
- return ret;
+
+ if (ret)
+ return BTRFS_TREE_BLOCK_INVALID_ITEM;
+ return BTRFS_TREE_BLOCK_CLEAN;
}
-static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
+enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
/* No valid key type is 0, so all key should be larger than this key */
@@ -1687,7 +1691,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
generic_err(leaf, 0,
"invalid level for leaf, have %d expect 0",
btrfs_header_level(leaf));
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_LEVEL;
}
/*
@@ -1710,32 +1714,32 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
generic_err(leaf, 0,
"invalid root, root %llu must never be empty",
owner);
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_NRITEMS;
}
/* Unknown tree */
if (unlikely(owner == 0)) {
generic_err(leaf, 0,
"invalid owner, root 0 is not defined");
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_OWNER;
}
/* EXTENT_TREE_V2 can have empty extent trees. */
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
- return 0;
+ return BTRFS_TREE_BLOCK_CLEAN;
if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) {
generic_err(leaf, 0,
"invalid root, root %llu must never be empty",
owner);
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_NRITEMS;
}
- return 0;
+ return BTRFS_TREE_BLOCK_CLEAN;
}
if (unlikely(nritems == 0))
- return 0;
+ return BTRFS_TREE_BLOCK_CLEAN;
/*
* Check the following things to make sure this is a good leaf, and
@@ -1751,7 +1755,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
u64 item_data_end;
- int ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -1762,7 +1765,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
prev_key.objectid, prev_key.type,
prev_key.offset, key.objectid, key.type,
key.offset);
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_BAD_KEY_ORDER;
}
item_data_end = (u64)btrfs_item_offset(leaf, slot) +
@@ -1781,7 +1784,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
generic_err(leaf, slot,
"unexpected item end, have %llu expect %u",
item_data_end, item_end_expected);
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
}
/*
@@ -1793,7 +1796,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
generic_err(leaf, slot,
"slot end outside of leaf, have %llu expect range [0, %u]",
item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
}
/* Also check if the item pointer overlaps with btrfs item. */
@@ -1804,16 +1807,22 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
btrfs_item_nr_offset(leaf, slot) +
sizeof(struct btrfs_item),
btrfs_item_ptr_offset(leaf, slot));
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
}
- if (check_item_data) {
+ /*
+ * We only want to do this if WRITTEN is set, otherwise the leaf
+ * may be in some intermediate state and won't appear valid.
+ */
+ if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ enum btrfs_tree_block_status ret;
+
/*
* Check if the item size and content meet other
* criteria
*/
ret = check_leaf_item(leaf, &key, slot, &prev_key);
- if (unlikely(ret < 0))
+ if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
return ret;
}
@@ -1822,21 +1831,21 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
prev_key.offset = key.offset;
}
- return 0;
+ return BTRFS_TREE_BLOCK_CLEAN;
}
-int btrfs_check_leaf_full(struct extent_buffer *leaf)
+int btrfs_check_leaf(struct extent_buffer *leaf)
{
- return check_leaf(leaf, true);
-}
-ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO);
+ enum btrfs_tree_block_status ret;
-int btrfs_check_leaf_relaxed(struct extent_buffer *leaf)
-{
- return check_leaf(leaf, false);
+ ret = __btrfs_check_leaf(leaf);
+ if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
+ return -EUCLEAN;
+ return 0;
}
+ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO);
-int btrfs_check_node(struct extent_buffer *node)
+enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node)
{
struct btrfs_fs_info *fs_info = node->fs_info;
unsigned long nr = btrfs_header_nritems(node);
@@ -1844,13 +1853,12 @@ int btrfs_check_node(struct extent_buffer *node)
int slot;
int level = btrfs_header_level(node);
u64 bytenr;
- int ret = 0;
if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
generic_err(node, 0,
"invalid level for node, have %d expect [1, %d]",
level, BTRFS_MAX_LEVEL - 1);
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_LEVEL;
}
if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) {
btrfs_crit(fs_info,
@@ -1858,7 +1866,7 @@ int btrfs_check_node(struct extent_buffer *node)
btrfs_header_owner(node), node->start,
nr == 0 ? "small" : "large", nr,
BTRFS_NODEPTRS_PER_BLOCK(fs_info));
- return -EUCLEAN;
+ return BTRFS_TREE_BLOCK_INVALID_NRITEMS;
}
for (slot = 0; slot < nr - 1; slot++) {
@@ -1869,15 +1877,13 @@ int btrfs_check_node(struct extent_buffer *node)
if (unlikely(!bytenr)) {
generic_err(node, slot,
"invalid NULL node pointer");
- ret = -EUCLEAN;
- goto out;
+ return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR;
}
if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) {
generic_err(node, slot,
"unaligned pointer, have %llu should be aligned to %u",
bytenr, fs_info->sectorsize);
- ret = -EUCLEAN;
- goto out;
+ return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR;
}
if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) {
@@ -1886,12 +1892,20 @@ int btrfs_check_node(struct extent_buffer *node)
key.objectid, key.type, key.offset,
next_key.objectid, next_key.type,
next_key.offset);
- ret = -EUCLEAN;
- goto out;
+ return BTRFS_TREE_BLOCK_BAD_KEY_ORDER;
}
}
-out:
- return ret;
+ return BTRFS_TREE_BLOCK_CLEAN;
+}
+
+int btrfs_check_node(struct extent_buffer *node)
+{
+ enum btrfs_tree_block_status ret;
+
+ ret = __btrfs_check_node(node);
+ if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
+ return -EUCLEAN;
+ return 0;
}
ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
@@ -1949,3 +1963,61 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
}
return 0;
}
+
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
+ struct btrfs_key *first_key, u64 parent_transid)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int found_level;
+ struct btrfs_key found_key;
+ int ret;
+
+ found_level = btrfs_header_level(eb);
+ if (found_level != level) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: tree level check failed\n");
+ btrfs_err(fs_info,
+"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
+ eb->start, level, found_level);
+ return -EIO;
+ }
+
+ if (!first_key)
+ return 0;
+
+ /*
+ * For live tree block (new tree blocks in current transaction),
+ * we need proper lock context to avoid race, which is impossible here.
+ * So we only checks tree blocks which is read from disk, whose
+ * generation <= fs_info->last_trans_committed.
+ */
+ if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+ return 0;
+
+ /* We have @first_key, so this @eb must have at least one item */
+ if (btrfs_header_nritems(eb) == 0) {
+ btrfs_err(fs_info,
+ "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return -EUCLEAN;
+ }
+
+ if (found_level)
+ btrfs_node_key_to_cpu(eb, &found_key, 0);
+ else
+ btrfs_item_key_to_cpu(eb, &found_key, 0);
+ ret = btrfs_comp_cpu_keys(first_key, &found_key);
+
+ if (ret) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: tree first key check failed\n");
+ btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+ eb->start, parent_transid, first_key->objectid,
+ first_key->type, first_key->offset,
+ found_key.objectid, found_key.type,
+ found_key.offset);
+ }
+ return ret;
+}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index bfb5efa4e01f..3c2a02a72f64 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -40,22 +40,33 @@ struct btrfs_tree_parent_check {
u8 level;
};
-/*
- * Comprehensive leaf checker.
- * Will check not only the item pointers, but also every possible member
- * in item data.
- */
-int btrfs_check_leaf_full(struct extent_buffer *leaf);
+enum btrfs_tree_block_status {
+ BTRFS_TREE_BLOCK_CLEAN,
+ BTRFS_TREE_BLOCK_INVALID_NRITEMS,
+ BTRFS_TREE_BLOCK_INVALID_PARENT_KEY,
+ BTRFS_TREE_BLOCK_BAD_KEY_ORDER,
+ BTRFS_TREE_BLOCK_INVALID_LEVEL,
+ BTRFS_TREE_BLOCK_INVALID_FREE_SPACE,
+ BTRFS_TREE_BLOCK_INVALID_OFFSETS,
+ BTRFS_TREE_BLOCK_INVALID_BLOCKPTR,
+ BTRFS_TREE_BLOCK_INVALID_ITEM,
+ BTRFS_TREE_BLOCK_INVALID_OWNER,
+};
/*
- * Less strict leaf checker.
- * Will only check item pointers, not reading item data.
+ * Exported simply for btrfs-progs which wants to have the
+ * btrfs_tree_block_status return codes.
*/
-int btrfs_check_leaf_relaxed(struct extent_buffer *leaf);
+enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf);
+enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node);
+
+int btrfs_check_leaf(struct extent_buffer *leaf);
int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
+ struct btrfs_key *first_key, u64 parent_transid);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d2755d5e338b..365a1cc0a3c3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -859,10 +859,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
csum_root = btrfs_csum_root(fs_info,
- sums->bytenr);
+ sums->logical);
if (!ret)
ret = btrfs_del_csums(trans, csum_root,
- sums->bytenr,
+ sums->logical,
sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans,
@@ -3252,7 +3252,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
* Returns 1 if the inode was logged before in the transaction, 0 if it was not,
* and < 0 on error.
*/
-static int inode_logged(struct btrfs_trans_handle *trans,
+static int inode_logged(const struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path_in)
{
@@ -4056,14 +4056,14 @@ static int drop_inode_items(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
- BUG_ON(ret == 0); /* Logic error */
- if (ret < 0)
- break;
-
- if (path->slots[0] == 0)
+ if (ret < 0) {
break;
+ } else if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
- path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
@@ -4221,7 +4221,7 @@ static int log_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *log_root,
struct btrfs_ordered_sum *sums)
{
- const u64 lock_end = sums->bytenr + sums->len - 1;
+ const u64 lock_end = sums->logical + sums->len - 1;
struct extent_state *cached_state = NULL;
int ret;
@@ -4239,7 +4239,7 @@ static int log_csums(struct btrfs_trans_handle *trans,
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
- ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
&cached_state);
if (ret)
return ret;
@@ -4252,11 +4252,11 @@ static int log_csums(struct btrfs_trans_handle *trans,
* some checksums missing in the fs/subvolume tree. So just delete (or
* trim and adjust) any existing csum items in the log for this range.
*/
- ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
+ ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums);
- unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
&cached_state);
return ret;
@@ -5303,7 +5303,7 @@ out:
* multiple times when multiple tasks have joined the same log transaction.
*/
static bool need_log_inode(const struct btrfs_trans_handle *trans,
- const struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
/*
* If a directory was not modified, no dentries added or removed, we can
@@ -5321,7 +5321,7 @@ static bool need_log_inode(const struct btrfs_trans_handle *trans,
* logged_trans will be 0, in which case we have to fully log it since
* logged_trans is a transient field, not persisted.
*/
- if (inode->logged_trans == trans->transid &&
+ if (inode_logged(trans, inode, NULL) == 1 &&
!test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
return false;
@@ -7309,7 +7309,7 @@ error:
*/
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
- int for_rename)
+ bool for_rename)
{
/*
* when we're logging a file, if it hasn't been renamed
@@ -7325,18 +7325,25 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
inode->last_unlink_trans = trans->transid;
mutex_unlock(&inode->log_mutex);
+ if (!for_rename)
+ return;
+
/*
- * if this directory was already logged any new
- * names for this file/dir will get recorded
+ * If this directory was already logged, any new names will be logged
+ * with btrfs_log_new_name() and old names will be deleted from the log
+ * tree with btrfs_del_dir_entries_in_log() or with
+ * btrfs_del_inode_ref_in_log().
*/
- if (dir->logged_trans == trans->transid)
+ if (inode_logged(trans, dir, NULL) == 1)
return;
/*
- * if the inode we're about to unlink was logged,
- * the log will be properly updated for any new names
+ * If the inode we're about to unlink was logged before, the log will be
+ * properly updated with the new name with btrfs_log_new_name() and the
+ * old name removed with btrfs_del_dir_entries_in_log() or with
+ * btrfs_del_inode_ref_in_log().
*/
- if (inode->logged_trans == trans->transid)
+ if (inode_logged(trans, inode, NULL) == 1)
return;
/*
@@ -7346,13 +7353,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* properly. So, we have to be conservative and force commits
* so the new name gets discovered.
*/
- if (for_rename)
- goto record;
-
- /* we can safely do the unlink without any special recording */
- return;
-
-record:
mutex_lock(&dir->log_mutex);
dir->last_unlink_trans = trans->transid;
mutex_unlock(&dir->log_mutex);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index bdeb5216718f..a550a8a375cd 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -100,7 +100,7 @@ void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
- int for_rename);
+ bool for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index a555baa0143a..3df6153d5d5a 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -226,21 +226,32 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op)
{
struct tree_mod_elem *tm;
- int ret;
+ int ret = 0;
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm = alloc_tree_mod_elem(eb, slot, op);
if (!tm)
- return -ENOMEM;
+ ret = -ENOMEM;
if (tree_mod_dont_log(eb->fs_info, eb)) {
kfree(tm);
+ /*
+ * Don't error if we failed to allocate memory because we don't
+ * need to log.
+ */
return 0;
+ } else if (ret != 0) {
+ /*
+ * We previously failed to allocate memory and we need to log,
+ * so we have to fail.
+ */
+ goto out_unlock;
}
ret = tree_mod_log_insert(eb->fs_info, tm);
+out_unlock:
write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
kfree(tm);
@@ -248,6 +259,26 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
return ret;
}
+static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items)
+{
+ struct tree_mod_elem *tm;
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm)
+ return ERR_PTR(-ENOMEM);
+
+ tm->logical = eb->start;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
+ RB_CLEAR_NODE(&tm->node);
+
+ return tm;
+}
+
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items)
@@ -262,35 +293,46 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
return 0;
tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
+ if (!tm_list) {
ret = -ENOMEM;
- goto free_tms;
+ goto lock;
}
- tm->logical = eb->start;
- tm->slot = src_slot;
- tm->move.dst_slot = dst_slot;
- tm->move.nr_items = nr_items;
- tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
+ tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
+ if (IS_ERR(tm)) {
+ ret = PTR_ERR(tm);
+ tm = NULL;
+ goto lock;
+ }
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
if (!tm_list[i]) {
ret = -ENOMEM;
- goto free_tms;
+ goto lock;
}
}
- if (tree_mod_dont_log(eb->fs_info, eb))
+lock:
+ if (tree_mod_dont_log(eb->fs_info, eb)) {
+ /*
+ * Don't error if we failed to allocate memory because we don't
+ * need to log.
+ */
+ ret = 0;
goto free_tms;
+ }
locked = true;
/*
+ * We previously failed to allocate memory and we need to log, so we
+ * have to fail.
+ */
+ if (ret != 0)
+ goto free_tms;
+
+ /*
* When we override something during the move, we log these removals.
* This can only happen when we move towards the beginning of the
* buffer, i.e. dst_slot < src_slot.
@@ -310,10 +352,12 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
return 0;
free_tms:
- for (i = 0; i < nr_items; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
- kfree(tm_list[i]);
+ if (tm_list) {
+ for (i = 0; i < nr_items; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
}
if (locked)
write_unlock(&eb->fs_info->tree_mod_log_lock);
@@ -363,14 +407,14 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
GFP_NOFS);
if (!tm_list) {
ret = -ENOMEM;
- goto free_tms;
+ goto lock;
}
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(old_root, i,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
if (!tm_list[i]) {
ret = -ENOMEM;
- goto free_tms;
+ goto lock;
}
}
}
@@ -378,7 +422,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
- goto free_tms;
+ goto lock;
}
tm->logical = new_root->start;
@@ -387,14 +431,28 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
tm->generation = btrfs_header_generation(old_root);
tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
- if (tree_mod_dont_log(fs_info, NULL))
+lock:
+ if (tree_mod_dont_log(fs_info, NULL)) {
+ /*
+ * Don't error if we failed to allocate memory because we don't
+ * need to log.
+ */
+ ret = 0;
goto free_tms;
+ } else if (ret != 0) {
+ /*
+ * We previously failed to allocate memory and we need to log,
+ * so we have to fail.
+ */
+ goto out_unlock;
+ }
if (tm_list)
ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
if (!ret)
ret = tree_mod_log_insert(fs_info, tm);
+out_unlock:
write_unlock(&fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
@@ -486,9 +544,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
struct btrfs_fs_info *fs_info = dst->fs_info;
int ret = 0;
struct tree_mod_elem **tm_list = NULL;
- struct tree_mod_elem **tm_list_add, **tm_list_rem;
+ struct tree_mod_elem **tm_list_add = NULL;
+ struct tree_mod_elem **tm_list_rem = NULL;
int i;
bool locked = false;
+ struct tree_mod_elem *dst_move_tm = NULL;
+ struct tree_mod_elem *src_move_tm = NULL;
+ u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
+ u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);
if (!tree_mod_need_log(fs_info, NULL))
return 0;
@@ -498,8 +561,30 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto lock;
+ }
+
+ if (dst_move_nr_items) {
+ dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
+ dst_offset, dst_move_nr_items);
+ if (IS_ERR(dst_move_tm)) {
+ ret = PTR_ERR(dst_move_tm);
+ dst_move_tm = NULL;
+ goto lock;
+ }
+ }
+ if (src_move_nr_items) {
+ src_move_tm = tree_mod_log_alloc_move(src, src_offset,
+ src_offset + nr_items,
+ src_move_nr_items);
+ if (IS_ERR(src_move_tm)) {
+ ret = PTR_ERR(src_move_tm);
+ src_move_tm = NULL;
+ goto lock;
+ }
+ }
tm_list_add = tm_list;
tm_list_rem = tm_list + nr_items;
@@ -508,21 +593,40 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
BTRFS_MOD_LOG_KEY_REMOVE);
if (!tm_list_rem[i]) {
ret = -ENOMEM;
- goto free_tms;
+ goto lock;
}
tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
BTRFS_MOD_LOG_KEY_ADD);
if (!tm_list_add[i]) {
ret = -ENOMEM;
- goto free_tms;
+ goto lock;
}
}
- if (tree_mod_dont_log(fs_info, NULL))
+lock:
+ if (tree_mod_dont_log(fs_info, NULL)) {
+ /*
+ * Don't error if we failed to allocate memory because we don't
+ * need to log.
+ */
+ ret = 0;
goto free_tms;
+ }
locked = true;
+ /*
+ * We previously failed to allocate memory and we need to log, so we
+ * have to fail.
+ */
+ if (ret != 0)
+ goto free_tms;
+
+ if (dst_move_tm) {
+ ret = tree_mod_log_insert(fs_info, dst_move_tm);
+ if (ret)
+ goto free_tms;
+ }
for (i = 0; i < nr_items; i++) {
ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
if (ret)
@@ -531,6 +635,11 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
if (ret)
goto free_tms;
}
+ if (src_move_tm) {
+ ret = tree_mod_log_insert(fs_info, src_move_tm);
+ if (ret)
+ goto free_tms;
+ }
write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
@@ -538,10 +647,18 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
return 0;
free_tms:
- for (i = 0; i < nr_items * 2; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
- kfree(tm_list[i]);
+ if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
+ rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
+ kfree(dst_move_tm);
+ if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
+ rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
+ kfree(src_move_tm);
+ if (tm_list) {
+ for (i = 0; i < nr_items * 2; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
}
if (locked)
write_unlock(&fs_info->tree_mod_log_lock);
@@ -562,22 +679,38 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
nritems = btrfs_header_nritems(eb);
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto lock;
+ }
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
if (!tm_list[i]) {
ret = -ENOMEM;
- goto free_tms;
+ goto lock;
}
}
- if (tree_mod_dont_log(eb->fs_info, eb))
+lock:
+ if (tree_mod_dont_log(eb->fs_info, eb)) {
+ /*
+ * Don't error if we failed to allocate memory because we don't
+ * need to log.
+ */
+ ret = 0;
goto free_tms;
+ } else if (ret != 0) {
+ /*
+ * We previously failed to allocate memory and we need to log,
+ * so we have to fail.
+ */
+ goto out_unlock;
+ }
ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
+out_unlock:
write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
@@ -586,9 +719,11 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
return 0;
free_tms:
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
+ if (tm_list) {
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+ }
return ret;
}
@@ -664,10 +799,27 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
unsigned long o_dst;
unsigned long o_src;
unsigned long p_size = sizeof(struct btrfs_key_ptr);
+ /*
+ * max_slot tracks the maximum valid slot of the rewind eb at every
+ * step of the rewind. This is in contrast with 'n' which eventually
+ * matches the number of items, but can be wrong during moves or if
+ * removes overlap on already valid slots (which is probably separately
+ * a bug). We do this to validate the offsets of memmoves for rewinding
+ * moves and detect invalid memmoves.
+ *
+ * Since a rewind eb can start empty, max_slot is a signed integer with
+ * a special meaning for -1, which is that no slot is valid to move out
+ * of. Any other negative value is invalid.
+ */
+ int max_slot;
+ int move_src_end_slot;
+ int move_dst_end_slot;
n = btrfs_header_nritems(eb);
+ max_slot = n - 1;
read_lock(&fs_info->tree_mod_log_lock);
while (tm && tm->seq >= time_seq) {
+ ASSERT(max_slot >= -1);
/*
* All the operations are recorded with the operator used for
* the modification. As we're going backwards, we do the
@@ -684,6 +836,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
n++;
+ if (tm->slot > max_slot)
+ max_slot = tm->slot;
break;
case BTRFS_MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
@@ -693,14 +847,37 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
tm->generation);
break;
case BTRFS_MOD_LOG_KEY_ADD:
+ /*
+ * It is possible we could have already removed keys
+ * behind the known max slot, so this will be an
+ * overestimate. In practice, the copy operation
+ * inserts them in increasing order, and overestimating
+ * just means we miss some warnings, so it's OK. It
+ * isn't worth carefully tracking the full array of
+ * valid slots to check against when moving.
+ */
+ if (tm->slot == max_slot)
+ max_slot--;
/* if a move operation is needed it's in the log */
n--;
break;
case BTRFS_MOD_LOG_MOVE_KEYS:
+ ASSERT(tm->move.nr_items > 0);
+ move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
+ move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
+ if (WARN_ON(move_src_end_slot > max_slot ||
+ tm->move.nr_items <= 0)) {
+ btrfs_warn(fs_info,
+"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
+ eb->start, tm->slot,
+ tm->move.dst_slot, tm->move.nr_items,
+ tm->seq, n, max_slot);
+ }
memmove_extent_buffer(eb, o_dst, o_src,
tm->move.nr_items * p_size);
+ max_slot = move_dst_end_slot;
break;
case BTRFS_MOD_LOG_ROOT_REPLACE:
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 72a838c97534..4193ace3fb5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -370,6 +370,8 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
{
struct btrfs_fs_devices *fs_devs;
+ ASSERT(fsid || !metadata_fsid);
+
fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
@@ -380,18 +382,17 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
INIT_LIST_HEAD(&fs_devs->seed_list);
- if (fsid)
- memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
- if (metadata_fsid)
- memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
- else if (fsid)
- memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
+ if (fsid) {
+ memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_devs->metadata_uuid,
+ metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
+ }
return fs_devs;
}
-void btrfs_free_device(struct btrfs_device *device)
+static void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list));
rcu_string_free(device->name);
@@ -426,6 +427,21 @@ void __exit btrfs_cleanup_fs_uuids(void)
}
}
+static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
+ const u8 *fsid, const u8 *metadata_fsid)
+{
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
+ return false;
+
+ if (!metadata_fsid)
+ return true;
+
+ if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
+ return false;
+
+ return true;
+}
+
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -435,19 +451,25 @@ static noinline struct btrfs_fs_devices *find_fsid(
/* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (metadata_fsid) {
- if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
- && memcmp(metadata_fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- } else {
- if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- }
+ if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
+ return fs_devices;
}
return NULL;
}
+/*
+ * First check if the metadata_uuid is different from the fsid in the given
+ * fs_devices. Then check if the given fsid is the same as the metadata_uuid
+ * in the fs_devices. If it is, return true; otherwise, return false.
+ */
+static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
+ const u8 *fsid)
+{
+ return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
+}
+
static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
struct btrfs_super_block *disk_super)
{
@@ -461,14 +483,14 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
* at all and the CHANGING_FSID_V2 flag set.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change &&
- memcmp(disk_super->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0) {
+ if (!fs_devices->fsid_change)
+ continue;
+
+ if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
+ fs_devices->fsid))
return fs_devices;
- }
}
+
/*
* Handle scanned device having completed its fsid change but
* belonging to a fs_devices that was created by a device that
@@ -476,13 +498,11 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
* CHANGING_FSID_V2 flag set.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change &&
- memcmp(fs_devices->metadata_uuid,
- fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
- memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0) {
+ if (!fs_devices->fsid_change)
+ continue;
+
+ if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
return fs_devices;
- }
}
return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
@@ -673,18 +693,16 @@ static struct btrfs_fs_devices *find_fsid_inprogress(
struct btrfs_fs_devices *fs_devices;
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, disk_super->fsid,
- BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
+ if (fs_devices->fsid_change)
+ continue;
+
+ if (check_fsid_changed(fs_devices, disk_super->fsid))
return fs_devices;
- }
}
return find_fsid(disk_super->fsid, NULL);
}
-
static struct btrfs_fs_devices *find_fsid_changed(
struct btrfs_super_block *disk_super)
{
@@ -701,10 +719,7 @@ static struct btrfs_fs_devices *find_fsid_changed(
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
/* Changed UUIDs */
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
- BTRFS_FSID_SIZE) == 0 &&
+ if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
memcmp(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE) != 0)
return fs_devices;
@@ -735,11 +750,10 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata(
* fs_devices equal to the FSID of the disk.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, disk_super->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- fs_devices->fsid_change)
+ if (!fs_devices->fsid_change)
+ continue;
+
+ if (check_fsid_changed(fs_devices, disk_super->fsid))
return fs_devices;
}
@@ -790,12 +804,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (!fs_devices) {
- if (has_metadata_uuid)
- fs_devices = alloc_fs_devices(disk_super->fsid,
- disk_super->metadata_uuid);
- else
- fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
-
+ fs_devices = alloc_fs_devices(disk_super->fsid,
+ has_metadata_uuid ? disk_super->metadata_uuid : NULL);
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
@@ -1918,7 +1928,7 @@ static void update_dev_time(const char *device_path)
return;
now = current_time(d_inode(path.dentry));
- inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION);
path_put(&path);
}
@@ -6163,17 +6173,10 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->replace_nr_stripes = nr_extra_stripes;
}
-static bool need_full_stripe(enum btrfs_map_op op)
-{
- return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
-}
-
static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
u64 offset, u32 *stripe_nr, u64 *stripe_offset,
u64 *full_stripe_start)
{
- ASSERT(op != BTRFS_MAP_DISCARD);
-
/*
* Stripe_nr is the stripe where this block falls. stripe_offset is
* the offset of this block in its stripe.
@@ -6226,11 +6229,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
}
-int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map)
+int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap, int *mirror_num_ret,
+ int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6253,7 +6256,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 max_len;
ASSERT(bioc_ret);
- ASSERT(op != BTRFS_MAP_DISCARD);
num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
if (mirror_num > num_copies)
@@ -6285,21 +6287,21 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_index = stripe_nr % map->num_stripes;
stripe_nr /= map->num_stripes;
- if (!need_full_stripe(op))
+ if (op == BTRFS_MAP_READ)
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- if (need_full_stripe(op))
+ if (op != BTRFS_MAP_READ) {
num_stripes = map->num_stripes;
- else if (mirror_num)
+ } else if (mirror_num) {
stripe_index = mirror_num - 1;
- else {
+ } else {
stripe_index = find_live_mirror(fs_info, map, 0,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (need_full_stripe(op)) {
+ if (op != BTRFS_MAP_READ) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -6313,7 +6315,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
stripe_index = (stripe_nr % factor) * map->sub_stripes;
stripe_nr /= factor;
- if (need_full_stripe(op))
+ if (op != BTRFS_MAP_READ)
num_stripes = map->sub_stripes;
else if (mirror_num)
stripe_index += mirror_num - 1;
@@ -6326,7 +6328,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
+ if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
/*
* Push stripe_nr back to the start of the full stripe
* For those cases needing a full stripe, @stripe_nr
@@ -6362,7 +6364,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
/* We distribute the parity blocks across stripes */
stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (!need_full_stripe(op) && mirror_num <= 1)
+ if (op == BTRFS_MAP_READ && mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -6402,7 +6404,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*/
if (smap && num_alloc_stripes == 1 &&
!((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
- (!need_full_stripe(op) || !dev_replace_is_ongoing ||
+ (op == BTRFS_MAP_READ || !dev_replace_is_ongoing ||
!dev_replace->tgtdev)) {
set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
*mirror_num_ret = mirror_num;
@@ -6426,7 +6428,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* It's still mostly the same as other profiles, just with extra rotation.
*/
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
- (need_full_stripe(op) || mirror_num > 1)) {
+ (op != BTRFS_MAP_READ || mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
* before us, which is also the rotation value (needs to modulo
@@ -6453,11 +6455,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
}
- if (need_full_stripe(op))
+ if (op != BTRFS_MAP_READ)
max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
- need_full_stripe(op)) {
+ op != BTRFS_MAP_READ) {
handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
&num_stripes, &max_errors);
}
@@ -6477,23 +6479,6 @@ out:
return ret;
}
-int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret, int mirror_num)
-{
- return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
- NULL, &mirror_num, 0);
-}
-
-/* For Scrub/replace */
-int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret)
-{
- return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
- NULL, NULL, 1);
-}
-
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
const struct btrfs_fs_devices *fs_devices)
{
@@ -8070,8 +8055,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
ASSERT(mirror_num > 0);
- ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
- &bioc, smap, &mirror_ret, true);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
+ &bioc, smap, &mirror_ret, true);
if (ret < 0)
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 64066d48dce1..d44cb9d22d63 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -280,8 +280,19 @@ enum btrfs_read_policy {
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+
+ /*
+ * UUID written into the btree blocks:
+ *
+ * - If metadata_uuid != fsid then super block must have
+ * BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag set.
+ *
+ * - Following shall be true at all times:
+ * - metadata_uuid == btrfs_header::fsid
+ * - metadata_uuid == btrfs_dev_item::fsid
+ */
u8 metadata_uuid[BTRFS_FSID_SIZE];
- bool fsid_change;
+
struct list_head fs_list;
/*
@@ -319,34 +330,32 @@ struct btrfs_fs_devices {
*/
struct btrfs_device *latest_dev;
- /* all of the devices in the FS, protected by a mutex
- * so we can safely walk it to write out the supers without
- * worrying about add/remove by the multi-device code.
- * Scrubbing super can kick off supers writing by holding
- * this mutex lock.
+ /*
+ * All of the devices in the filesystem, protected by a mutex so we can
+ * safely walk it to write out the super blocks without worrying about
+ * adding/removing by the multi-device code. Scrubbing super block can
+ * kick off supers writing by holding this mutex lock.
*/
struct mutex device_list_mutex;
/* List of all devices, protected by device_list_mutex */
struct list_head devices;
- /*
- * Devices which can satisfy space allocation. Protected by
- * chunk_mutex
- */
+ /* Devices which can satisfy space allocation. Protected by * chunk_mutex. */
struct list_head alloc_list;
struct list_head seed_list;
- bool seeding;
+ /* Count fs-devices opened. */
int opened;
- /* set when we find or add a device that doesn't have the
- * nonrot flag set
- */
+ /* Set when we find or add a device that doesn't have the nonrot flag set. */
bool rotating;
- /* Devices support TRIM/discard commands */
+ /* Devices support TRIM/discard commands. */
bool discardable;
+ bool fsid_change;
+ /* The filesystem is a seed filesystem. */
+ bool seeding;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -357,7 +366,7 @@ struct btrfs_fs_devices {
enum btrfs_chunk_allocation_policy chunk_alloc_policy;
- /* Policy used to read the mirrored stripes */
+ /* Policy used to read the mirrored stripes. */
enum btrfs_read_policy read_policy;
};
@@ -547,15 +556,12 @@ struct btrfs_dev_lookup_args {
enum btrfs_map_op {
BTRFS_MAP_READ,
BTRFS_MAP_WRITE,
- BTRFS_MAP_DISCARD,
BTRFS_MAP_GET_READ_MIRRORS,
};
static inline enum btrfs_map_op btrfs_op(struct bio *bio)
{
switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- return BTRFS_MAP_DISCARD;
case REQ_OP_WRITE:
case REQ_OP_ZONE_APPEND:
return BTRFS_MAP_WRITE;
@@ -589,15 +595,9 @@ void btrfs_get_bioc(struct btrfs_io_context *bioc);
void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret, int mirror_num);
-int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret);
-int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map);
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap, int *mirror_num_ret,
+ int need_raid_map);
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
struct btrfs_io_stripe *smap, u64 logical,
u32 length, int mirror_num);
@@ -628,7 +628,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid, const u8 *uuid,
const char *path);
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
-void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8acb05e176c5..6c231a116a29 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -63,7 +63,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
- workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL);
+ workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN);
workspace->level = level;
workspace->buf = NULL;
/*
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 39828af4a4e8..85b8b332add9 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -15,6 +15,7 @@
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"
+#include "super.h"
#include "fs.h"
#include "accessors.h"
#include "bio.h"
@@ -1057,7 +1058,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
/* Check if zones in the region are all empty */
if (btrfs_dev_is_sequential(device, pos) &&
- find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
+ !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
pos += zinfo->zone_size;
continue;
}
@@ -1156,23 +1157,23 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
struct btrfs_zoned_device_info *zinfo = device->zone_info;
const u8 shift = zinfo->zone_size_shift;
unsigned long begin = start >> shift;
- unsigned long end = (start + size) >> shift;
+ unsigned long nbits = size >> shift;
u64 pos;
int ret;
ASSERT(IS_ALIGNED(start, zinfo->zone_size));
ASSERT(IS_ALIGNED(size, zinfo->zone_size));
- if (end > zinfo->nr_zones)
+ if (begin + nbits > zinfo->nr_zones)
return -ERANGE;
/* All the zones are conventional */
- if (find_next_bit(zinfo->seq_zones, end, begin) == end)
+ if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
return 0;
/* All the zones are sequential and empty */
- if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
- find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
+ if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
+ bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
return 0;
for (pos = start; pos < start + size; pos += zinfo->zone_size) {
@@ -1602,37 +1603,17 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
-
- if (!btrfs_is_zoned(fs_info) ||
- btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
- !list_empty(&eb->release_list))
+ if (!btrfs_is_zoned(eb->fs_info) ||
+ btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
return;
+ ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+
memzero_extent_buffer(eb, 0, eb->len);
set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
set_extent_buffer_dirty(eb);
- set_extent_bits_nowait(&trans->dirty_pages, eb->start,
- eb->start + eb->len - 1, EXTENT_DIRTY);
-
- spin_lock(&trans->releasing_ebs_lock);
- list_add_tail(&eb->release_list, &trans->releasing_ebs);
- spin_unlock(&trans->releasing_ebs_lock);
- atomic_inc(&eb->refs);
-}
-
-void btrfs_free_redirty_list(struct btrfs_transaction *trans)
-{
- spin_lock(&trans->releasing_ebs_lock);
- while (!list_empty(&trans->releasing_ebs)) {
- struct extent_buffer *eb;
-
- eb = list_first_entry(&trans->releasing_ebs,
- struct extent_buffer, release_list);
- list_del_init(&eb->release_list);
- free_extent_buffer(eb);
- }
- spin_unlock(&trans->releasing_ebs_lock);
+ set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
+ EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
}
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
@@ -1677,63 +1658,89 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_sum *sum = bbio->sums;
- ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
- if (WARN_ON(!ordered))
- return;
-
- ordered->physical = physical;
- btrfs_put_ordered_extent(ordered);
+ if (physical < bbio->orig_physical)
+ sum->logical -= bbio->orig_physical - physical;
+ else
+ sum->logical += physical - bbio->orig_physical;
}
-void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
+static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
+ u64 logical)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_map_tree *em_tree;
+ struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
struct extent_map *em;
- struct btrfs_ordered_sum *sum;
- u64 orig_logical = ordered->disk_bytenr;
- struct map_lookup *map;
- u64 physical = ordered->physical;
- u64 chunk_start_phys;
- u64 logical;
-
- em = btrfs_get_chunk_map(fs_info, orig_logical, 1);
- if (IS_ERR(em))
- return;
- map = em->map_lookup;
- chunk_start_phys = map->stripes[0].physical;
-
- if (WARN_ON_ONCE(map->num_stripes > 1) ||
- WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) ||
- WARN_ON_ONCE(physical < chunk_start_phys) ||
- WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) {
- free_extent_map(em);
- return;
- }
- logical = em->start + (physical - map->stripes[0].physical);
- free_extent_map(em);
-
- if (orig_logical == logical)
- return;
ordered->disk_bytenr = logical;
- em_tree = &inode->extent_tree;
write_lock(&em_tree->lock);
em = search_extent_mapping(em_tree, ordered->file_offset,
ordered->num_bytes);
em->block_start = logical;
free_extent_map(em);
write_unlock(&em_tree->lock);
+}
- list_for_each_entry(sum, &ordered->list, list) {
- if (logical < orig_logical)
- sum->bytenr -= orig_logical - logical;
- else
- sum->bytenr += logical - orig_logical;
+static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
+ u64 logical, u64 len)
+{
+ struct btrfs_ordered_extent *new;
+
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
+ split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
+ ordered->num_bytes, len, logical))
+ return false;
+
+ new = btrfs_split_ordered_extent(ordered, len);
+ if (IS_ERR(new))
+ return false;
+ new->disk_bytenr = logical;
+ btrfs_finish_one_ordered(new);
+ return true;
+}
+
+void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_ordered_sum *sum =
+ list_first_entry(&ordered->list, typeof(*sum), list);
+ u64 logical = sum->logical;
+ u64 len = sum->len;
+
+ while (len < ordered->disk_num_bytes) {
+ sum = list_next_entry(sum, list);
+ if (sum->logical == logical + len) {
+ len += sum->len;
+ continue;
+ }
+ if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+ btrfs_err(fs_info, "failed to split ordered extent");
+ goto out;
+ }
+ logical = sum->logical;
+ len = sum->len;
+ }
+
+ if (ordered->disk_bytenr != logical)
+ btrfs_rewrite_logical_zoned(ordered, logical);
+
+out:
+ /*
+ * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
+ * were allocated by btrfs_alloc_dummy_sum only to record the logical
+ * addresses and don't contain actual checksums. We thus must free them
+ * here so that we don't attempt to log the csums later.
+ */
+ if ((inode->flags & BTRFS_INODE_NODATASUM) ||
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
+ while ((sum = list_first_entry_or_null(&ordered->list,
+ typeof(*sum), list))) {
+ list_del(&sum->list);
+ kfree(sum);
+ }
}
}
@@ -1792,8 +1799,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
int nmirrors;
int i, ret;
- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bioc);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+ &mapped_length, &bioc, NULL, NULL, 1);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
ret = -EIO;
goto out_put_bioc;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index c0570d35fea2..27322b926038 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -30,6 +30,8 @@ struct btrfs_zoned_device_info {
struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
};
+void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered);
+
#ifdef CONFIG_BLK_DEV_ZONED
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone);
@@ -54,10 +56,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb);
-void btrfs_free_redirty_list(struct btrfs_transaction *trans);
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
-void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
struct btrfs_block_group **cache_ret);
@@ -179,7 +179,6 @@ static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb) { }
-static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { }
static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
@@ -190,9 +189,6 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
}
-static inline void btrfs_rewrite_logical_zoned(
- struct btrfs_ordered_extent *ordered) { }
-
static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
struct btrfs_block_group **cache_ret)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index f798da267590..e7ac4ec809a4 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -356,7 +356,7 @@ struct list_head *zstd_alloc_workspace(unsigned int level)
workspace->level = level;
workspace->req_level = level;
workspace->last_used = jiffies;
- workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
+ workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN);
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 8ea9cea9bfeb..a8206f5332e9 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -661,6 +661,35 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_mark_finished,
TP_ARGS(inode, ordered)
);
+TRACE_EVENT(btrfs_finish_ordered_extent,
+
+ TP_PROTO(const struct btrfs_inode *inode, u64 start, u64 len,
+ bool uptodate),
+
+ TP_ARGS(inode, start, len, uptodate),
+
+ TP_STRUCT__entry_btrfs(
+ __field( u64, ino )
+ __field( u64, start )
+ __field( u64, len )
+ __field( bool, uptodate )
+ __field( u64, root_objectid )
+ ),
+
+ TP_fast_assign_btrfs(inode->root->fs_info,
+ __entry->ino = btrfs_ino(inode);
+ __entry->start = start;
+ __entry->len = len;
+ __entry->uptodate = uptodate;
+ __entry->root_objectid = inode->root->root_key.objectid;
+ ),
+
+ TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu uptodate=%d",
+ show_root_type(__entry->root_objectid),
+ __entry->ino, __entry->start,
+ __entry->len, !!__entry->uptodate)
+);
+
DECLARE_EVENT_CLASS(btrfs__writepage,
TP_PROTO(const struct page *page, const struct inode *inode,
@@ -1982,25 +2011,27 @@ DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert,
);
TRACE_EVENT(btrfs_inode_mod_outstanding_extents,
- TP_PROTO(const struct btrfs_root *root, u64 ino, int mod),
+ TP_PROTO(const struct btrfs_root *root, u64 ino, int mod, unsigned outstanding),
- TP_ARGS(root, ino, mod),
+ TP_ARGS(root, ino, mod, outstanding),
TP_STRUCT__entry_btrfs(
__field( u64, root_objectid )
__field( u64, ino )
__field( int, mod )
+ __field( unsigned, outstanding )
),
TP_fast_assign_btrfs(root->fs_info,
__entry->root_objectid = root->root_key.objectid;
__entry->ino = ino;
__entry->mod = mod;
+ __entry->outstanding = outstanding;
),
- TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d",
+ TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d outstanding=%u",
show_root_type(__entry->root_objectid),
- __entry->ino, __entry->mod)
+ __entry->ino, __entry->mod, __entry->outstanding)
);
DECLARE_EVENT_CLASS(btrfs__block_group,
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 0fcf99c91400..c19b1103e4ec 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -204,7 +204,6 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
"__ubsan_handle_builtin_unreachable",
"arch_call_rest_init",
"arch_cpu_idle_dead",
- "btrfs_assertfail",
"cpu_bringup_and_idle",
"cpu_startup_entry",
"do_exit",