| author | Kent Overstreet <kent.overstreet@gmail.com> | 2021-05-21 23:57:37 -0400 |
|---|---|---|
| committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-22 17:09:04 -0400 |
| commit | 1ce0cf5fe9300f28e834e6fa001cc5a114cd0493 (patch) | |
| tree | 235b7e55958463781d9e884bbdcfc8ea6ba73ede /fs/bcachefs/btree_io.c | |
| parent | 596d3bdc1e749c63e4defc6abbe2552d035e4470 (diff) | |
bcachefs: Add a debug mode that always reads from every btree replica
There's a new module parameter, verify_all_btree_replicas, that enables
reading from every btree replica when reading in btree nodes, then
comparing the copies against each other. We've been seeing some strange
btree corruption; this will hopefully aid in tracking it down and
catching it more often.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
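
As a rough illustration of what the new mode checks, here is a standalone userspace C sketch (not the kernel implementation; the struct layout, buffer size, and names below are invented for the example): take the first replica that read successfully as the reference copy, then compare every other readable replica against it and flag any size or content mismatch.

```c
/*
 * Standalone sketch (not kernel code) of the idea behind
 * verify_all_btree_replicas: read every replica of a btree node into its
 * own buffer, then compare each readable replica against the first good
 * copy. The struct layout, sizes and names are invented for this example.
 */
#include <stdio.h>
#include <string.h>

#define MAX_REPLICAS 4

struct replica {
	int		err;		/* nonzero if the read failed */
	size_t		bytes_written;	/* how much of the buffer is valid */
	unsigned char	data[4096];
};

/* Compare every readable replica against the first good copy found. */
static int verify_replicas(struct replica *r, unsigned nr)
{
	struct replica *good = NULL;
	int mismatches = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (r[i].err)
			continue;

		if (!good) {
			good = &r[i];
			continue;
		}

		if (r[i].bytes_written != good->bytes_written) {
			printf("replica %u: size mismatch (%zu != %zu)\n",
			       i, r[i].bytes_written, good->bytes_written);
			mismatches++;
		} else if (memcmp(r[i].data, good->data, good->bytes_written)) {
			printf("replica %u: content mismatch\n", i);
			mismatches++;
		}
	}

	return good ? mismatches : -1;	/* -1: no readable copy at all */
}

int main(void)
{
	struct replica r[MAX_REPLICAS] = {
		{ .bytes_written = 8, .data = "btree-ok" },
		{ .bytes_written = 8, .data = "btree-ok" },
		{ .bytes_written = 8, .data = "btree-XX" },	/* corrupt copy */
		{ .err = 1 },					/* failed read */
	};

	printf("%d mismatching replica(s)\n",
	       verify_replicas(r, MAX_REPLICAS));
	return 0;
}
```

The real code in the diff below goes further: it prefers the copy whose sequence number matches the btree pointer, tolerates replicas with different numbers of bsets written (keeping the longest), and dumps a per-replica bset map whenever a mismatch is found.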
Diffstat (limited to 'fs/bcachefs/btree_io.c')
| -rw-r--r-- | fs/bcachefs/btree_io.c | 273 |
1 file changed, 266 insertions(+), 7 deletions(-)
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index dbaa05ac764c..69b1435653a4 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -521,7 +521,7 @@ enum btree_validate_ret {
 									\
 	switch (write) {						\
 	case READ:							\
-		bch_err(c, "%s", _buf2);				\
+		bch_err(c, "%s", _buf2);				\
 									\
 		switch (type) {						\
 		case BTREE_ERR_FIXABLE:					\
@@ -1036,8 +1036,8 @@ static void btree_node_read_work(struct work_struct *work)
 	struct btree_read_bio *rb =
 		container_of(work, struct btree_read_bio, work);
 	struct bch_fs *c	= rb->c;
+	struct btree *b		= rb->b;
 	struct bch_dev *ca	= bch_dev_bkey_exists(c, rb->pick.ptr.dev);
-	struct btree *b		= rb->bio.bi_private;
 	struct bio *bio		= &rb->bio;
 	struct bch_io_failures failed = { .nr = 0 };
 	char buf[200];
@@ -1112,6 +1112,262 @@ static void btree_node_read_endio(struct bio *bio)
 	queue_work(system_unbound_wq, &rb->work);
 }
 
+struct btree_node_read_all {
+	struct closure		cl;
+	struct bch_fs		*c;
+	struct btree		*b;
+	unsigned		nr;
+	void			*buf[BCH_REPLICAS_MAX];
+	struct bio		*bio[BCH_REPLICAS_MAX];
+	int			err[BCH_REPLICAS_MAX];
+};
+
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+	struct btree_node *bn = data;
+	struct btree_node_entry *bne;
+	unsigned offset = 0;
+
+	if (le64_to_cpu(bn->magic) != bset_magic(c))
+		return 0;
+
+	while (offset < c->opts.btree_node_size) {
+		if (!offset) {
+			offset += vstruct_sectors(bn, c->block_bits);
+		} else {
+			bne = data + (offset << 9);
+			if (bne->keys.seq != bn->keys.seq)
+				break;
+			offset += vstruct_sectors(bne, c->block_bits);
+		}
+	}
+
+	return offset;
+}
+
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+	struct btree_node *bn = data;
+	struct btree_node_entry *bne;
+
+	if (!offset)
+		return false;
+
+	while (offset < c->opts.btree_node_size) {
+		bne = data + (offset << 9);
+		if (bne->keys.seq == bn->keys.seq)
+			return true;
+		offset++;
+	}
+
+	return false;
+	return offset;
+}
+
+static void btree_node_read_all_replicas_done(struct closure *cl)
+{
+	struct btree_node_read_all *ra =
+		container_of(cl, struct btree_node_read_all, cl);
+	struct bch_fs *c = ra->c;
+	struct btree *b = ra->b;
+	bool have_good_copy = false;
+	bool dump_bset_maps = false;
+	bool have_retry = false;
+	int ret = 0, write = READ;
+	unsigned i, written, written2;
+	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+
+	for (i = 0; i < ra->nr; i++) {
+		if (ra->err[i])
+			continue;
+
+		if (!have_good_copy) {
+			memcpy(b->data, ra->buf[i], btree_bytes(c));
+			have_good_copy = true;
+			written = btree_node_sectors_written(c, b->data);
+		}
+
+		/* Try to get the right btree node: */
+		if (have_good_copy &&
+		    seq &&
+		    b->data->keys.seq != seq &&
+		    ((struct btree_node *) ra->buf[i])->keys.seq == seq) {
+			memcpy(b->data, ra->buf[i], btree_bytes(c));
+			written = btree_node_sectors_written(c, b->data);
+		}
+
+		written2 = btree_node_sectors_written(c, ra->buf[i]);
+		if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+				 "btree node sectors written mismatch: %u != %u",
+				 written, written2) ||
+		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
+				 BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+				 "found bset signature after last bset") ||
+		    btree_err_on(memcmp(b->data, ra->buf[i], written << 9),
+				 BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+				 "btree node replicas content mismatch"))
+			dump_bset_maps = true;
+
+		if (written2 > written) {
+			written = written2;
+			memcpy(b->data, ra->buf[i], btree_bytes(c));
+		}
+	}
+fsck_err:
+	if (dump_bset_maps) {
+		for (i = 0; i < ra->nr; i++) {
+			char buf[200];
+			struct printbuf out = PBUF(buf);
+			struct btree_node *bn = ra->buf[i];
+			struct btree_node_entry *bne = NULL;
+			unsigned offset = 0, sectors;
+			bool gap = false;
+
+			if (ra->err[i])
+				continue;
+
+			while (offset < c->opts.btree_node_size) {
+				if (!offset) {
+					sectors = vstruct_sectors(bn, c->block_bits);
+				} else {
+					bne = ra->buf[i] + (offset << 9);
+					if (bne->keys.seq != bn->keys.seq)
+						break;
+					sectors = vstruct_sectors(bne, c->block_bits);
+				}
+
+				pr_buf(&out, " %u-%u", offset, offset + sectors);
+				if (bne && bch2_journal_seq_is_blacklisted(c,
+						le64_to_cpu(bne->keys.journal_seq), false))
+					pr_buf(&out, "*");
+				offset += sectors;
+			}
+
+			while (offset < c->opts.btree_node_size) {
+				bne = ra->buf[i] + (offset << 9);
+				if (bne->keys.seq == bn->keys.seq) {
+					if (!gap)
+						pr_buf(&out, " GAP");
+					gap = true;
+
+					sectors = vstruct_sectors(bne, c->block_bits);
+					pr_buf(&out, " %u-%u", offset, offset + sectors);
+					if (bch2_journal_seq_is_blacklisted(c,
+							le64_to_cpu(bne->keys.journal_seq), false))
+						pr_buf(&out, "*");
+				}
+				offset++;
+			}
+
+			bch_err(c, "replica %u:%s", i, buf);
+		}
+	}
+
+	if (have_good_copy)
+		bch2_btree_node_read_done(c, NULL, b, false);
+	else
+		set_btree_node_read_error(b);
+
+	for (i = 0; i < ra->nr; i++) {
+		mempool_free(ra->buf[i], &c->btree_bounce_pool);
+		bio_put(ra->bio[i]);
+	}
+
+	closure_debug_destroy(&ra->cl);
+	kfree(ra);
+
+	clear_btree_node_read_in_flight(b);
+	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_all_replicas_endio(struct bio *bio)
+{
+	struct btree_read_bio *rb =
+		container_of(bio, struct btree_read_bio, bio);
+	struct bch_fs *c	= rb->c;
+	struct btree_node_read_all *ra = rb->ra;
+
+	if (rb->have_ioref) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+		bch2_latency_acct(ca, rb->start_time, READ);
+	}
+
+	ra->err[rb->idx] = bio->bi_status;
+	closure_put(&ra->cl);
+}
+
+/*
+ * XXX This allocates multiple times from the same mempools, and can deadlock
+ * under sufficient memory pressure (but is only a debug path)
+ */
+static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
+{
+	struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded pick;
+	struct btree_node_read_all *ra;
+	unsigned i;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
+
+	closure_init(&ra->cl, NULL);
+	ra->c	= c;
+	ra->b	= b;
+	ra->nr	= bch2_bkey_nr_ptrs(k);
+
+	for (i = 0; i < ra->nr; i++) {
+		ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+		ra->bio[i] = bio_alloc_bioset(NULL,
+					      buf_pages(ra->buf[i], btree_bytes(c)),
+					      REQ_OP_READ|REQ_SYNC|REQ_META,
+					      GFP_NOFS,
+					      &c->btree_bio);
+	}
+
+	i = 0;
+	bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+		struct btree_read_bio *rb =
+			container_of(ra->bio[i], struct btree_read_bio, bio);
+		rb->c			= c;
+		rb->b			= b;
+		rb->ra			= ra;
+		rb->start_time		= local_clock();
+		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
+		rb->idx			= i;
+		rb->pick		= pick;
+		rb->bio.bi_iter.bi_sector = pick.ptr.offset;
+		rb->bio.bi_end_io	= btree_node_read_all_replicas_endio;
+		bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+
+		if (rb->have_ioref) {
+			this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+				     bio_sectors(&rb->bio));
+			bio_set_dev(&rb->bio, ca->disk_sb.bdev);
+
+			closure_get(&ra->cl);
+			submit_bio(&rb->bio);
+		} else {
+			ra->err[i] = BLK_STS_REMOVED;
+		}
+
+		i++;
+	}
+
+	if (sync) {
+		closure_sync(&ra->cl);
+		btree_node_read_all_replicas_done(&ra->cl);
+	} else {
+		continue_at(&ra->cl, btree_node_read_all_replicas_done,
+			    system_unbound_wq);
+	}
+
+	return 0;
+}
+
 void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 			  bool sync)
 {
@@ -1125,6 +1381,12 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 	btree_pos_to_text(&PBUF(buf), c, b);
 	trace_btree_read(c, b);
 
+	set_btree_node_read_in_flight(b);
+
+	if (bch2_verify_all_btree_replicas &&
+	    !btree_node_read_all_replicas(c, b, sync))
+		return;
+
 	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
 					 NULL, &pick);
 	if (bch2_fs_fatal_err_on(ret <= 0, c,
@@ -1143,17 +1405,16 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 			       &c->btree_bio);
 	rb = container_of(bio, struct btree_read_bio, bio);
 	rb->c			= c;
+	rb->b			= b;
+	rb->ra			= NULL;
 	rb->start_time		= local_clock();
 	rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
 	rb->pick		= pick;
 	INIT_WORK(&rb->work, btree_node_read_work);
 	bio->bi_iter.bi_sector	= pick.ptr.offset;
 	bio->bi_end_io		= btree_node_read_endio;
-	bio->bi_private		= b;
 	bch2_bio_map(bio, b->data, btree_bytes(c));
 
-	set_btree_node_read_in_flight(b);
-
 	if (rb->have_ioref) {
 		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
 			     bio_sectors(bio));
@@ -1162,7 +1423,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 
 		if (sync) {
 			submit_bio_wait(bio);
-			bio->bi_private = b;
 			btree_node_read_work(&rb->work);
 		} else {
 			submit_bio(bio);
@@ -1174,7 +1434,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 			btree_node_read_work(&rb->work);
 		else
 			queue_work(system_unbound_wq, &rb->work);
-
 	}
 }
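
For reference, here is a sketch of the "sectors written" walk that btree_node_sectors_written performs in the diff above (standalone userspace C; the entry layout, field names, and sector counts are simplified stand-ins for struct btree_node / struct btree_node_entry, invented for this illustration). Entries are laid out back to back, each records its own length, and the walk keeps summing lengths while each entry repeats the first entry's sequence number.

```c
/*
 * Simplified, userspace sketch of the walk done by
 * btree_node_sectors_written: sum up back-to-back entries that share the
 * node's sequence number; stop at the first entry with a foreign seq.
 * The layout and names here are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SECTOR_SHIFT	9
#define NODE_SECTORS	16	/* stand-in for c->opts.btree_node_size */

struct entry {
	uint64_t	seq;	/* must match the first entry's seq */
	uint32_t	sectors;/* length of this entry, in sectors */
};

static unsigned sectors_written(const unsigned char *data)
{
	struct entry first, e;
	unsigned offset = 0;

	memcpy(&first, data, sizeof(first));

	while (offset < NODE_SECTORS) {
		memcpy(&e, data + ((size_t)offset << SECTOR_SHIFT), sizeof(e));

		if (offset && e.seq != first.seq)
			break;	/* first entry not belonging to this node */

		offset += e.sectors;
	}

	return offset;
}

int main(void)
{
	/* 16-sector buffer: entries of 4, 2 and 3 sectors, then stale data. */
	static unsigned char buf[NODE_SECTORS << SECTOR_SHIFT];
	const struct entry e[] = {
		{ .seq = 42, .sectors = 4 },
		{ .seq = 42, .sectors = 2 },
		{ .seq = 42, .sectors = 3 },
		{ .seq = 7,  .sectors = 1 },	/* leftover from an older node */
	};
	unsigned offset = 0;

	for (unsigned i = 0; i < sizeof(e) / sizeof(e[0]); i++) {
		memcpy(buf + ((size_t)offset << SECTOR_SHIFT), &e[i], sizeof(e[i]));
		offset += e[i].sectors;
	}

	printf("sectors written: %u\n", sectors_written(buf));	/* prints 9 */
	return 0;
}
```

btree_node_has_extra_bsets in the patch is the complementary check: starting from the end of the written region it scans the rest of the node sector by sector and reports whether any later entry still carries the node's sequence number, which is what triggers the "found bset signature after last bset" error.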