summaryrefslogtreecommitdiffstats
path: root/fs/bcachefs/super-io.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-01-10 16:34:17 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-01-10 16:34:17 -0800
commit999a36b52b1b11b2ca0590756e4f8cf21f2d9182 (patch)
tree1b51ea332b5258e51fc9209168f66246bf7b3875 /fs/bcachefs/super-io.c
parent84e9a2d5517bf62edda74f382757aa173b8e45fd (diff)
parent169de41985f53320580f3d347534966ea83343ca (diff)
downloadlinux-999a36b52b1b11b2ca0590756e4f8cf21f2d9182.tar.gz
linux-999a36b52b1b11b2ca0590756e4f8cf21f2d9182.tar.bz2
linux-999a36b52b1b11b2ca0590756e4f8cf21f2d9182.zip
Merge tag 'bcachefs-2024-01-10' of https://evilpiepirate.org/git/bcachefs
Pull bcachefs updates from Kent Overstreet: - btree write buffer rewrite: instead of adding keys to the btree write buffer at transaction commit time, we now journal them with a different journal entry type and copy them from the journal to the write buffer just prior to journal write. This reduces the number of atomic operations on shared cachelines in the transaction commit path and is a signicant performance improvement on some workloads: multithreaded 4k random writes went from ~650k iops to ~850k iops. - Bring back optimistic spinning for six locks: the new implementation doesn't use osq locks; instead we add to the lock waitlist as normal, and then spin on the lock_acquired bit in the waitlist entry, _not_ the lock itself. - New ioctls: - BCH_IOCTL_DEV_USAGE_V2, which allows for new data types - BCH_IOCTL_OFFLINE_FSCK, which runs the kernel implementation of fsck but without mounting: useful for transparently using the kernel version of fsck from 'bcachefs fsck' when the kernel version is a better match for the on disk filesystem. - BCH_IOCTL_ONLINE_FSCK: online fsck. Not all passes are supported yet, but the passes that are supported are fully featured - errors may be corrected as normal. The new ioctls use the new 'thread_with_file' abstraction for kicking off a kthread that's tied to a file descriptor returned to userspace via the ioctl. - btree_paths within a btree_trans are now dynamically growable, instead of being limited to 64. This is important for the check_directory_structure phase of fsck, and also fixes some issues we were having with btree path overflow in the reflink btree. - Trigger refactoring; prep work for the upcoming disk space accounting rewrite - Numerous bugfixes :) * tag 'bcachefs-2024-01-10' of https://evilpiepirate.org/git/bcachefs: (226 commits) bcachefs: eytzinger0_find() search should be const bcachefs: move "ptrs not changing" optimization to bch2_trigger_extent() bcachefs: fix simulateously upgrading & downgrading bcachefs: Restart recovery passes more reliably bcachefs: bch2_dump_bset() doesn't choke on u64s == 0 bcachefs: improve checksum error messages bcachefs: improve validate_bset_keys() bcachefs: print sb magic when relevant bcachefs: __bch2_sb_field_to_text() bcachefs: %pg is banished bcachefs: Improve would_deadlock trace event bcachefs: fsck_err()s don't need to manually check c->sb.version anymore bcachefs: Upgrades now specify errors to fix, like downgrades bcachefs: no thread_with_file in userspace bcachefs: Don't autofix errors we can't fix bcachefs: add missing bch2_latency_acct() call bcachefs: increase max_active on io_complete_wq bcachefs: add time_stats for btree_node_read_done() bcachefs: don't clear accessed bit in btree node fill bcachefs: Add an option to control btree node prefetching ...
Diffstat (limited to 'fs/bcachefs/super-io.c')
-rw-r--r--fs/bcachefs/super-io.c168
1 files changed, 103 insertions, 65 deletions
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 78013deda9df..6d3db5cce5f6 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -30,14 +30,12 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
struct bch2_metadata_version {
u16 version;
const char *name;
- u64 recovery_passes;
};
static const struct bch2_metadata_version bch2_metadata_versions[] = {
-#define x(n, v, _recovery_passes) { \
+#define x(n, v) { \
.version = v, \
.name = #n, \
- .recovery_passes = _recovery_passes, \
},
BCH_METADATA_VERSIONS()
#undef x
@@ -70,24 +68,6 @@ unsigned bch2_latest_compatible_version(unsigned v)
return v;
}
-u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
- unsigned old_version,
- unsigned new_version)
-{
- u64 ret = 0;
-
- for (const struct bch2_metadata_version *i = bch2_metadata_versions;
- i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions);
- i++)
- if (i->version > old_version && i->version <= new_version) {
- if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK)
- ret |= bch2_fsck_recovery_passes();
- ret |= i->recovery_passes;
- }
-
- return ret &= ~RECOVERY_PASS_ALL_FSCK;
-}
-
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
@@ -101,8 +81,6 @@ static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
{
- struct bch_sb_field *f;
-
/* XXX: need locking around superblock to access optional fields */
vstruct_for_each(sb, f)
@@ -192,8 +170,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
if (new_bytes > max_bytes) {
- pr_err("%pg: superblock too big: want %zu but have %llu",
- sb->bdev, new_bytes, max_bytes);
+ struct printbuf buf = PRINTBUF;
+
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
return -BCH_ERR_ENOSPC_sb;
}
}
@@ -241,14 +223,12 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
if (sb->fs_sb) {
struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
/* XXX: we're not checking that offline device have enough space */
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
@@ -368,7 +348,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
int rw)
{
struct bch_sb *sb = disk_sb->sb;
- struct bch_sb_field *f;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
u16 block_size;
@@ -514,8 +493,6 @@ static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned
static void bch2_sb_update(struct bch_fs *c)
{
struct bch_sb *src = c->disk_sb.sb;
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
@@ -546,7 +523,7 @@ static void bch2_sb_update(struct bch_fs *c)
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
ca->mi = bch2_mi_to_cpu(&m);
}
@@ -571,6 +548,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
dst->time_base_lo = src->time_base_lo;
dst->time_base_hi = src->time_base_hi;
dst->time_precision = src->time_precision;
+ dst->write_time = src->write_time;
memcpy(dst->flags, src->flags, sizeof(dst->flags));
memcpy(dst->features, src->features, sizeof(dst->features));
@@ -634,7 +612,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
{
- struct bch_csum csum;
size_t bytes;
int ret;
reread:
@@ -650,7 +627,9 @@ reread:
if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
!uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
- prt_printf(err, "Not a bcachefs superblock");
+ prt_str(err, "Not a bcachefs superblock (got magic ");
+ pr_uuid(err, sb->sb->magic.b);
+ prt_str(err, ")");
return -BCH_ERR_invalid_sb_magic;
}
@@ -673,17 +652,16 @@ reread:
goto reread;
}
- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+ enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
+ if (csum_type >= BCH_CSUM_NR) {
prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
return -BCH_ERR_invalid_sb_csum_type;
}
/* XXX: verify MACs */
- csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- null_nonce(), sb->sb);
-
+ struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum)) {
- prt_printf(err, "bad checksum");
+ bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
return -BCH_ERR_invalid_sb_csum;
}
@@ -692,12 +670,13 @@ reread:
return 0;
}
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
+static int __bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
struct printbuf err = PRINTBUF;
+ struct printbuf err2 = PRINTBUF;
__le64 *i;
int ret;
#ifndef __KERNEL__
@@ -761,8 +740,14 @@ retry:
if (opt_defined(*opts, sb))
goto err;
- printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
+ prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
+ if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
+ printk(KERN_INFO "%s", err2.buf);
+ else
+ printk(KERN_ERR "%s", err2.buf);
+
+ printbuf_exit(&err2);
printbuf_reset(&err);
/*
@@ -838,6 +823,20 @@ err_no_print:
goto out;
}
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, false);
+}
+
+/* provide a silenced version for mount.bcachefs */
+
+int bch2_read_super_silent(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, true);
+}
+
/* write superblock: */
static void write_super_endio(struct bio *bio)
@@ -906,9 +905,8 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
int bch2_write_super(struct bch_fs *c)
{
struct closure *cl = &c->sb_write;
- struct bch_dev *ca;
struct printbuf err = PRINTBUF;
- unsigned i, sb = 0, nr_wrote;
+ unsigned sb = 0, nr_wrote;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
@@ -930,9 +928,14 @@ int bch2_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb.sb->seq, 1);
- if (test_bit(BCH_FS_ERROR, &c->flags))
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ for_each_online_member(c, ca)
+ __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
+
+ if (test_bit(BCH_FS_error, &c->flags))
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
- if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+ if (test_bit(BCH_FS_topology_error, &c->flags))
SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
@@ -943,10 +946,10 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_errors_from_cpu(c);
bch2_sb_downgrade_update(c);
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
bch2_sb_from_fs(c, ca);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
printbuf_reset(&err);
ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
@@ -967,16 +970,28 @@ int bch2_write_super(struct bch_fs *c)
if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
goto out;
- for_each_online_member(ca, c, i) {
+ if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
+ bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
+ prt_str(&buf, " > ");
+ bch2_version_to_text(&buf, bcachefs_metadata_version_current);
+ prt_str(&buf, ")");
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_sb_not_downgraded;
+ }
+
+ for_each_online_member(c, ca) {
__set_bit(ca->dev_idx, sb_written.d);
ca->sb_write_error = 0;
}
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
read_back_super(c, ca);
closure_sync(cl);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->sb_write_error)
continue;
@@ -1003,7 +1018,7 @@ int bch2_write_super(struct bch_fs *c)
do {
wrote = false;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
if (!ca->sb_write_error &&
sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
@@ -1013,7 +1028,7 @@ int bch2_write_super(struct bch_fs *c)
sb++;
} while (wrote);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
else
@@ -1025,7 +1040,7 @@ int bch2_write_super(struct bch_fs *c)
can_mount_with_written =
bch2_have_enough_devs(c, sb_written, degraded_flags, false);
- for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
@@ -1074,13 +1089,22 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
/*
* Downgrade, if superblock is at a higher version than currently
* supported:
+ *
+ * c->sb will be checked before we write the superblock, so update it as
+ * well:
*/
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) {
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (c->sb.version > bcachefs_metadata_version_current)
+ c->sb.version_upgrade_complete = bcachefs_metadata_version_current;
+ }
+ if (c->sb.version > bcachefs_metadata_version_current) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- if (c->sb.version_min > bcachefs_metadata_version_current)
+ c->sb.version = bcachefs_metadata_version_current;
+ }
+ if (c->sb.version_min > bcachefs_metadata_version_current) {
c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
+ c->sb.version_min = bcachefs_metadata_version_current;
+ }
c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
return ret;
}
@@ -1173,8 +1197,8 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
return ret;
}
-void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
+void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
{
unsigned type = le32_to_cpu(f->type);
const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
@@ -1182,6 +1206,15 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
+ if (ops->to_text)
+ ops->to_text(out, sb, f);
+}
+
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ unsigned type = le32_to_cpu(f->type);
+
if (type < BCH_SB_FIELD_NR)
prt_printf(out, "%s", bch2_sb_fields[type]);
else
@@ -1190,11 +1223,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, " (size %zu):", vstruct_bytes(f));
prt_newline(out);
- if (ops->to_text) {
- printbuf_indent_add(out, 2);
- ops->to_text(out, sb, f);
- printbuf_indent_sub(out, 2);
- }
+ __bch2_sb_field_to_text(out, sb, f);
}
void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
@@ -1223,7 +1252,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- struct bch_sb_field *f;
u64 fields_have = 0;
unsigned nr_devices = 0;
@@ -1243,6 +1271,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
pr_uuid(out, sb->uuid.b);
prt_newline(out);
+ prt_printf(out, "Magic number:");
+ prt_tab(out);
+ pr_uuid(out, sb->magic.b);
+ prt_newline(out);
+
prt_str(out, "Device index:");
prt_tab(out);
prt_printf(out, "%u", sb->dev_idx);
@@ -1281,6 +1314,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "%llu", le64_to_cpu(sb->seq));
prt_newline(out);
+ prt_printf(out, "Time of last write:");
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
+ prt_newline(out);
+
prt_printf(out, "Superblock size:");
prt_tab(out);
prt_printf(out, "%zu", vstruct_bytes(sb));