summaryrefslogtreecommitdiffstats
path: root/fs/bcachefs
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2022-03-21 00:15:53 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-22 17:09:30 -0400
commitce6201c456571d919e722eec3c17f868f0575b05 (patch)
treefa7281175f4d7d823da49cff85596b2d7b53e92f /fs/bcachefs
parent95752a02cb5d38bc97d76625de2607510ac94e69 (diff)
downloadlinux-ce6201c456571d919e722eec3c17f868f0575b05.tar.gz
linux-ce6201c456571d919e722eec3c17f868f0575b05.tar.bz2
linux-ce6201c456571d919e722eec3c17f868f0575b05.zip
bcachefs: Use a genradix for reading journal entries
Previously, the journal read path used a linked list for storing the journal entries we read from disk. But there's been a bug that's been causing journal_flush_delay to incorrectly be set to 0, leading to far more journal entries than is normal being written out, which then means filesystems are no longer able to start due to the O(n^2) behaviour of inserting into/searching that linked list. Fix this by switching to a radix tree. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/bcachefs.h3
-rw-r--r--fs/bcachefs/journal.c34
-rw-r--r--fs/bcachefs/journal.h2
-rw-r--r--fs/bcachefs/journal_io.c138
-rw-r--r--fs/bcachefs/journal_io.h3
-rw-r--r--fs/bcachefs/recovery.c68
-rw-r--r--fs/bcachefs/recovery.h2
-rw-r--r--fs/bcachefs/super.c3
8 files changed, 151 insertions, 102 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index f2bb23162b4a..43e921b91d85 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -894,7 +894,8 @@ mempool_t bio_bounce_pages;
mempool_t btree_bounce_pool;
struct journal journal;
- struct list_head journal_entries;
+ GENRADIX(struct journal_replay *) journal_entries;
+ u64 journal_entries_base_seq;
struct journal_keys journal_keys;
struct list_head journal_iters;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index f87f76553bf4..f8b57de31d93 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1039,17 +1039,25 @@ void bch2_fs_journal_stop(struct journal *j)
cancel_delayed_work_sync(&j->write_work);
}
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
- struct list_head *journal_entries)
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
- struct journal_replay *i;
+ struct journal_replay *i, **_i;
+ struct genradix_iter iter;
+ bool had_entries = false;
+ unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
- if (!list_empty(journal_entries))
- last_seq = le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay, list)->j.last_seq);
+ genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
+ }
nr = cur_seq - last_seq;
@@ -1071,14 +1079,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
- if (list_empty(journal_entries))
- j->last_empty_seq = cur_seq - 1;
-
fifo_for_each_entry_ptr(p, &j->pin, seq)
journal_pin_list_init(p, 1);
- list_for_each_entry(i, journal_entries, list) {
- unsigned ptr;
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq >= cur_seq);
@@ -1094,9 +1102,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
p->devs.nr = 0;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+ had_entries = true;
}
- if (list_empty(journal_entries))
+ if (!had_entries)
j->last_empty_seq = cur_seq;
spin_lock(&j->lock);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index c287ecf643aa..59453dcfa4e9 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -510,7 +510,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
+int bch2_fs_journal_start(struct journal *, u64);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index fad142196daa..c84c0b840906 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -16,12 +16,22 @@
#include "replicas.h"
#include "trace.h"
-static void __journal_replay_free(struct journal_replay *i)
+static inline u32 journal_entry_radix_idx(struct bch_fs *c,
+ struct jset *j)
{
- list_del(&i->list);
+ return (le64_to_cpu(j->seq) - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+ struct journal_replay *i)
+{
+ struct journal_replay **p =
+ genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, &i->j));
+
+ BUG_ON(*p != i);
+ *p = NULL;
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
-
}
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@@ -29,13 +39,12 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
i->ignore = true;
if (!c->opts.read_entire_journal)
- __journal_replay_free(i);
+ __journal_replay_free(c, i);
}
struct journal_list {
struct closure cl;
struct mutex lock;
- struct list_head *head;
int ret;
};
@@ -51,19 +60,30 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct journal_list *jlist, struct jset *j,
bool bad)
{
- struct journal_replay *i, *pos, *dup = NULL;
+ struct genradix_iter iter;
+ struct journal_replay **_i, *i, *dup;
struct journal_ptr *ptr;
- struct list_head *where;
size_t bytes = vstruct_bytes(j);
u64 last_seq = 0;
int ret = JOURNAL_ENTRY_ADD_OK;
+ /*
+ * Xarrays are indexed by a ulong, not a u64, so we can't index them by
+ * sequence number directly:
+ * Assume instead that they will all fall within the range of +-2billion
+ * of the filrst one we find.
+ */
+ if (!c->journal_entries_base_seq)
+ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
+
+#if 0
list_for_each_entry_reverse(i, jlist->head, list) {
if (!JSET_NO_FLUSH(&i->j)) {
last_seq = le64_to_cpu(i->j.last_seq);
break;
}
}
+#endif
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
@@ -73,29 +93,21 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
}
/* Drop entries we don't need anymore */
- if (!JSET_NO_FLUSH(j)) {
- list_for_each_entry_safe(i, pos, jlist->head, list) {
+ if (!JSET_NO_FLUSH(j) && !c->opts.read_entire_journal) {
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i)
+ continue;
+
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
journal_replay_free(c, i);
}
}
- list_for_each_entry_reverse(i, jlist->head, list) {
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
- }
- }
-
- where = jlist->head;
-add:
- dup = where->next != jlist->head
- ? container_of(where->next, struct journal_replay, list)
- : NULL;
-
- if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
- dup = NULL;
+ _i = genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, j));
+ dup = _i ? *_i : NULL;
/*
* Duplicate journal entries? If so we want the one that didn't have a
@@ -131,10 +143,19 @@ add:
if (dup) {
i->nr_ptrs = dup->nr_ptrs;
memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
- __journal_replay_free(dup);
+ __journal_replay_free(c, dup);
}
- list_add(&i->list, where);
+ _i = genradix_ptr_alloc(&c->journal_entries,
+ journal_entry_radix_idx(c, &i->j),
+ GFP_KERNEL);
+ if (!_i) {
+ bch_err(c, "failed to allocate c->journal_entries entry");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ *_i = i;
found:
for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
if (ptr->dev == ca->dev_idx) {
@@ -913,7 +934,8 @@ static void bch2_journal_read_device(struct closure *cl)
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
- struct journal_replay *r;
+ struct journal_replay *r, **_r;
+ struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
u64 min_seq = U64_MAX;
unsigned i;
@@ -956,7 +978,12 @@ static void bch2_journal_read_device(struct closure *cl)
ja->sectors_free = ca->mi.bucket_size;
mutex_lock(&jlist->lock);
- list_for_each_entry(r, jlist->head, list) {
+ genradix_for_each(&c->journal_entries, iter, _r) {
+ r = *_r;
+
+ if (!r)
+ continue;
+
for (i = 0; i < r->nr_ptrs; i++) {
if (r->ptrs[i].dev == ca->dev_idx &&
sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
@@ -1022,11 +1049,11 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-int bch2_journal_read(struct bch_fs *c, struct list_head *list,
- u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i, *t;
+ struct journal_replay *i, **_i, *prev = NULL;
+ struct genradix_iter radix_iter;
struct bch_dev *ca;
unsigned iter;
struct printbuf buf = PRINTBUF;
@@ -1037,7 +1064,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
- jlist.head = list;
jlist.ret = 0;
for_each_member_device(ca, c, iter) {
@@ -1061,22 +1087,21 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
if (jlist.ret)
return jlist.ret;
- if (list_empty(list)) {
- bch_info(c, "journal read done, but no entries found");
- return 0;
- }
-
- i = list_last_entry(list, struct journal_replay, list);
- *start_seq = le64_to_cpu(i->j.seq) + 1;
+ *start_seq = 0;
/*
* Find most recent flush entry, and ignore newer non flush entries -
* those entries will be blacklisted:
*/
- list_for_each_entry_safe_reverse(i, t, list, list) {
- if (i->ignore)
+ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
+ if (!*start_seq)
+ *start_seq = le64_to_cpu(i->j.seq) + 1;
+
if (!JSET_NO_FLUSH(&i->j)) {
last_seq = le64_to_cpu(i->j.last_seq);
*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
@@ -1086,6 +1111,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
journal_replay_free(c, i);
}
+ if (!*start_seq) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
if (!last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
ret = -1;
@@ -1093,8 +1123,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
}
/* Drop blacklisted entries and entries older than last_seq: */
- list_for_each_entry_safe(i, t, list, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
seq = le64_to_cpu(i->j.seq);
@@ -1113,8 +1145,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
/* Check for missing entries: */
seq = last_seq;
- list_for_each_entry(i, list, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1136,11 +1170,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
!bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
- if (i->list.prev != list) {
- struct journal_replay *p = list_prev_entry(i, list);
-
- bch2_journal_ptrs_to_text(&buf1, c, p);
- pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits));
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf1, c, prev);
+ pr_buf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
} else
pr_buf(&buf1, "(none)");
bch2_journal_ptrs_to_text(&buf2, c, i);
@@ -1157,10 +1189,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
printbuf_exit(&buf2);
}
+ prev = i;
seq++;
}
- list_for_each_entry(i, list, list) {
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas = {
@@ -1169,7 +1202,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
};
unsigned ptr;
- if (i->ignore)
+ i = *_i;
+ if (!i || i->ignore)
continue;
ret = jset_validate_entries(c, &i->j, READ);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index f2001835e43e..30e995c81fc4 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -7,7 +7,6 @@
* during cache_registration
*/
struct journal_replay {
- struct list_head list;
struct journal_ptr {
u8 dev;
u32 bucket;
@@ -53,7 +52,7 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct journal_replay *);
-int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 9120ed26250e..07ce6a540856 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -433,16 +433,16 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i
/* sort and dedup all keys in the journal: */
-void bch2_journal_entries_free(struct list_head *list)
+void bch2_journal_entries_free(struct bch_fs *c)
{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
+ genradix_for_each(&c->journal_entries, iter, i)
+ if (*i)
+ kvpfree(*i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&(*i)->j));
+ genradix_free(&c->journal_entries);
}
/*
@@ -476,15 +476,18 @@ void bch2_journal_keys_free(struct journal_keys *keys)
static int journal_keys_sort(struct bch_fs *c)
{
- struct journal_replay *i;
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys *keys = &c->journal_keys;
struct journal_key *src, *dst;
size_t nr_keys = 0;
- list_for_each_entry(i, &c->journal_entries, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
if (!keys->journal_seq_base)
@@ -503,8 +506,10 @@ static int journal_keys_sort(struct bch_fs *c)
if (!keys->d)
return -ENOMEM;
- list_for_each_entry(i, &c->journal_entries, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
BUG_ON(le64_to_cpu(i->j.seq) - keys->journal_seq_base > U32_MAX);
@@ -751,10 +756,8 @@ static int journal_replay_entry_early(struct bch_fs *c,
}
static int journal_replay_early(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct list_head *journal)
+ struct bch_sb_field_clean *clean)
{
- struct journal_replay *i;
struct jset_entry *entry;
int ret;
@@ -767,8 +770,13 @@ static int journal_replay_early(struct bch_fs *c,
return ret;
}
} else {
- list_for_each_entry(i, journal, list) {
- if (i->ignore)
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
vstruct_for_each(&i->j, entry) {
@@ -1093,17 +1101,17 @@ int bch2_fs_recovery(struct bch_fs *c)
}
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
- struct journal_replay *i;
+ struct genradix_iter iter;
+ struct journal_replay **i;
bch_verbose(c, "starting journal read");
- ret = bch2_journal_read(c, &c->journal_entries,
- &blacklist_seq, &journal_seq);
+ ret = bch2_journal_read(c, &blacklist_seq, &journal_seq);
if (ret)
goto err;
- list_for_each_entry_reverse(i, &c->journal_entries, list)
- if (!i->ignore) {
- last_journal_entry = &i->j;
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i && !(*i)->ignore) {
+ last_journal_entry = &(*i)->j;
break;
}
@@ -1152,7 +1160,7 @@ use_clean:
zero_out_btree_mem_ptr(&c->journal_keys);
- ret = journal_replay_early(c, clean, &c->journal_entries);
+ ret = journal_replay_early(c, clean);
if (ret)
goto err;
@@ -1175,8 +1183,7 @@ use_clean:
}
}
- ret = bch2_fs_journal_start(&c->journal, journal_seq,
- &c->journal_entries);
+ ret = bch2_fs_journal_start(&c->journal, journal_seq);
if (ret)
goto err;
@@ -1380,7 +1387,7 @@ out:
if (!c->opts.keep_journal) {
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
}
kfree(clean);
if (ret)
@@ -1401,7 +1408,6 @@ int bch2_fs_initialize(struct bch_fs *c)
struct qstr lostfound = QSTR("lost+found");
const char *err = "cannot allocate memory";
struct bch_dev *ca;
- LIST_HEAD(journal);
unsigned i;
int ret;
@@ -1441,7 +1447,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal, 1, &journal);
+ bch2_fs_journal_start(&c->journal, 1);
bch2_journal_set_replay_done(&c->journal);
err = "error going read-write";
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 30580a8984a1..ab8b116ac7db 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -55,7 +55,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct btree *);
void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct list_head *);
+void bch2_journal_entries_free(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 2c3d0546f2b6..689c82f6cb5d 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -457,7 +457,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
percpu_free_rwsem(&c->mark_lock);
free_percpu(c->online_reserved);
@@ -676,7 +676,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
- INIT_LIST_HEAD(&c->journal_entries);
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);