-rw-r--r--   fs/bcachefs/bcachefs_format.h       |   7
-rw-r--r--   fs/bcachefs/journal.c               |  54
-rw-r--r--   fs/bcachefs/journal.h               |   2
-rw-r--r--   fs/bcachefs/journal_io.c            | 208
-rw-r--r--   fs/bcachefs/journal_io.h            |   3
-rw-r--r--   fs/bcachefs/journal_reclaim.c       |  10
-rw-r--r--   fs/bcachefs/journal_seq_blacklist.c |   5
-rw-r--r--   fs/bcachefs/journal_types.h         |   8
-rw-r--r--   fs/bcachefs/recovery.c              | 166
9 files changed, 312 insertions, 151 deletions
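
In short, this patch adds noflush journal writes: when the journal_no_flush feature is enabled, most journal writes go out without REQ_PREFLUSH|REQ_FUA, and a real flush write happens only periodically or when something explicitly depends on one. The standalone C sketch below condenses the write-side decision the patch adds to bch2_journal_write(); struct journal_sketch and the helper are illustrative stand-ins for the relevant struct journal state, not kernel API:

    #include <stdbool.h>
    #include <stdint.h>

    /* Condensed model of the state bch2_journal_write() consults: */
    struct journal_sketch {
        bool     feature_journal_no_flush; /* BCH_FEATURE_journal_no_flush enabled */
        bool     may_skip_flush;           /* JOURNAL_MAY_SKIP_FLUSH, maintained by reclaim */
        bool     must_flush;               /* someone waits on bch2_journal_flush_seq() */
        uint64_t now;                      /* jiffies */
        uint64_t last_flush_write;         /* jiffies of the last flush write */
        uint64_t write_delay;              /* msecs_to_jiffies(j->write_delay_ms) */
    };

    /*
     * Mirrors the new test in bch2_journal_write(): a write may omit
     * REQ_PREFLUSH|REQ_FUA only if the feature bit is set, nobody is
     * explicitly waiting on a flush, a flush write happened recently
     * enough, and reclaim reports enough of the journal already flushed.
     */
    static bool journal_write_can_skip_flush(const struct journal_sketch *j)
    {
        return j->feature_journal_no_flush &&
               !j->must_flush &&
               j->now - j->last_flush_write < j->write_delay &&
               j->may_skip_flush;
    }

On replay, entries written this way carry JSET_NO_FLUSH and an unchanged last_seq, which is why the recovery path in the hunks below must find the newest flush entry before trusting last_seq.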
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f072e865e43f..7df2bc7ecd4f 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1336,14 +1336,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
     x(extents_above_btree_updates, 12)  \
     x(btree_updates_journalled,    13)  \
     x(reflink_inline_data,         14)  \
-    x(new_varint,                  15)
+    x(new_varint,                  15)  \
+    x(journal_no_flush,            16)
 
 #define BCH_SB_FEATURES_ALL                             \
     ((1ULL << BCH_FEATURE_new_siphash)|                 \
      (1ULL << BCH_FEATURE_new_extent_overwrite)|        \
      (1ULL << BCH_FEATURE_btree_ptr_v2)|                \
      (1ULL << BCH_FEATURE_extents_above_btree_updates)| \
-     (1ULL << BCH_FEATURE_new_varint))\
+     (1ULL << BCH_FEATURE_new_varint)|                  \
+     (1ULL << BCH_FEATURE_journal_no_flush))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1582,6 +1584,7 @@ struct jset {
 
 LE32_BITMASK(JSET_CSUM_TYPE,  struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH,   struct jset, flags, 5, 6);
 
 #define BCH_JOURNAL_BUCKETS_MIN 8
 
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 3bbb23d7739a..31168754d6b8 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -79,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
     struct journal_buf *buf = journal_cur_buf(j);
 
     bkey_extent_init(&buf->key);
+    buf->noflush    = false;
+    buf->must_flush = false;
 
     memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
@@ -574,7 +576,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
     struct journal_buf *buf;
     int ret = 0;
 
-    if (seq <= j->seq_ondisk)
+    if (seq <= j->flushed_seq_ondisk)
         return 1;
 
     spin_lock(&j->lock);
@@ -585,16 +587,53 @@
         goto out;
     }
 
-    if (seq <= j->seq_ondisk) {
+    if (seq <= j->flushed_seq_ondisk) {
         ret = 1;
         goto out;
     }
 
-    if (parent &&
-        (buf = journal_seq_to_buf(j, seq)))
-        if (!closure_wait(&buf->wait, parent))
+    /* if seq was written, but not flushed - flush a newer one instead */
+    seq = max(seq, last_unwritten_seq(j));
+
+recheck_need_open:
+    if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+        struct journal_res res = { 0 };
+
+        spin_unlock(&j->lock);
+
+        ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+        if (ret)
+            return ret;
+
+        seq = res.seq;
+        buf = j->buf + (seq & JOURNAL_BUF_MASK);
+        buf->must_flush = true;
+        set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+        if (parent && !closure_wait(&buf->wait, parent))
             BUG();
 
+        bch2_journal_res_put(j, &res);
+
+        spin_lock(&j->lock);
+        goto want_write;
+    }
+
+    /*
+     * if write was kicked off without a flush, flush the next sequence
+     * number instead
+     */
+    buf = journal_seq_to_buf(j, seq);
+    if (buf->noflush) {
+        seq++;
+        goto recheck_need_open;
+    }
+
+    buf->must_flush = true;
+
+    if (parent && !closure_wait(&buf->wait, parent))
+        BUG();
+want_write:
     if (seq == journal_cur_seq(j))
         journal_entry_want_write(j);
 out:
@@ -979,6 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
     spin_lock(&j->lock);
 
     set_bit(JOURNAL_STARTED, &j->flags);
+    j->last_flush_write = jiffies;
 
     journal_pin_new_entry(j, 1);
 
@@ -1116,6 +1156,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
            "last_seq:\t\t%llu\n"
            "last_seq_ondisk:\t%llu\n"
            "prereserved:\t\t%u/%u\n"
+           "nr flush writes:\t%llu\n"
+           "nr noflush writes:\t%llu\n"
            "nr direct reclaim:\t%llu\n"
            "nr background reclaim:\t%llu\n"
            "current entry sectors:\t%u\n"
@@ -1127,6 +1169,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
            j->last_seq_ondisk,
            j->prereserved.reserved,
            j->prereserved.remaining,
+           j->nr_flush_writes,
+           j->nr_noflush_writes,
            j->nr_direct_reclaim,
            j->nr_background_reclaim,
            j->cur_entry_sectors,
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 1b6175cd6f1b..2c0014c3c02f 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -136,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
 
 static inline u64 journal_cur_seq(struct journal *j)
 {
-    BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+    EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
     return j->pin.back - 1;
 }
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 1aeeb58d3c2a..26556bb381b2 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -10,9 +10,26 @@
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 
 #include "trace.h"
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+    list_del(&i->list);
+    kvpfree(i, offsetof(struct journal_replay, j) +
+        vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+    i->ignore = true;
+
+    if (!c->opts.read_entire_journal)
+        __journal_replay_free(i);
+}
+
 struct journal_list {
     struct closure      cl;
     struct mutex        lock;
@@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
     struct bch_devs_list devs = { .nr = 0 };
     struct list_head *where;
     size_t bytes = vstruct_bytes(j);
-    __le64 last_seq;
+    u64 last_seq = 0;
     int ret;
 
-    last_seq = !list_empty(jlist->head)
-        ? list_last_entry(jlist->head, struct journal_replay,
-                  list)->j.last_seq
-        : 0;
-
-    if (!c->opts.read_entire_journal) {
-        /* Is this entry older than the range we need? */
-        if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-            ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-            goto out;
+    list_for_each_entry_reverse(i, jlist->head, list) {
+        if (!JSET_NO_FLUSH(&i->j)) {
+            last_seq = le64_to_cpu(i->j.last_seq);
+            break;
         }
+    }
 
-        /* Drop entries we don't need anymore */
+    /* Is this entry older than the range we need? */
+    if (!c->opts.read_entire_journal &&
+        le64_to_cpu(j->seq) < last_seq) {
+        ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+        goto out;
+    }
+
+    /* Drop entries we don't need anymore */
+    if (!JSET_NO_FLUSH(j)) {
         list_for_each_entry_safe(i, pos, jlist->head, list) {
             if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                 break;
-            list_del(&i->list);
-            kvpfree(i, offsetof(struct journal_replay, j) +
-                vstruct_bytes(&i->j));
+            journal_replay_free(c, i);
         }
     }
 
@@ -80,9 +98,7 @@ add:
     if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
         if (i->bad) {
             devs = i->devs;
-            list_del(&i->list);
-            kvpfree(i, offsetof(struct journal_replay, j) +
-                vstruct_bytes(&i->j));
+            __journal_replay_free(i);
         } else if (bad) {
             goto found;
         } else {
@@ -104,6 +120,7 @@ add:
     list_add(&i->list, where);
     i->devs   = devs;
     i->bad    = bad;
+    i->ignore = false;
     unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
 found:
     if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -698,14 +715,16 @@ err:
     goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+                      u64 *blacklist_seq, u64 *start_seq)
 {
     struct journal_list jlist;
-    struct journal_replay *i;
+    struct journal_replay *i, *t;
     struct bch_dev *ca;
     unsigned iter;
     size_t keys = 0, entries = 0;
     bool degraded = false;
+    u64 seq, last_seq = 0;
     int ret = 0;
 
     closure_init_stack(&jlist.cl);
@@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
     if (jlist.ret)
         return jlist.ret;
 
+    if (list_empty(list)) {
+        bch_info(c, "journal read done, but no entries found");
+        return 0;
+    }
+
+    i = list_last_entry(list, struct journal_replay, list);
+    *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+    /*
+     * Find most recent flush entry, and ignore newer non flush entries -
+     * those entries will be blacklisted:
+     */
+    list_for_each_entry_safe_reverse(i, t, list, list) {
+        if (i->ignore)
+            continue;
+
+        if (!JSET_NO_FLUSH(&i->j)) {
+            last_seq       = le64_to_cpu(i->j.last_seq);
+            *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+            break;
+        }
+
+        journal_replay_free(c, i);
+    }
+
+    if (!last_seq) {
+        fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+        return -1;
+    }
+
+    /* Drop blacklisted entries and entries older than last_seq: */
+    list_for_each_entry_safe(i, t, list, list) {
+        if (i->ignore)
+            continue;
+
+        seq = le64_to_cpu(i->j.seq);
+        if (seq < last_seq) {
+            journal_replay_free(c, i);
+            continue;
+        }
+
+        if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+            fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+                        "found blacklisted journal entry %llu", seq);
+
+            journal_replay_free(c, i);
+        }
+    }
+
+    /* Check for missing entries: */
+    seq = last_seq;
+    list_for_each_entry(i, list, list) {
+        if (i->ignore)
+            continue;
+
+        BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+        while (seq < le64_to_cpu(i->j.seq)) {
+            u64 missing_start, missing_end;
+
+            while (seq < le64_to_cpu(i->j.seq) &&
+                   bch2_journal_seq_is_blacklisted(c, seq, false))
+                seq++;
+
+            if (seq == le64_to_cpu(i->j.seq))
+                break;
+
+            missing_start = seq;
+
+            while (seq < le64_to_cpu(i->j.seq) &&
+                   !bch2_journal_seq_is_blacklisted(c, seq, false))
+                seq++;
+
+            missing_end = seq - 1;
+            fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+                     missing_start, missing_end,
+                     last_seq, *blacklist_seq - 1);
+        }
+
+        seq++;
+    }
+
     list_for_each_entry(i, list, list) {
         struct jset_entry *entry;
         struct bkey_i *k, *_n;
         struct bch_replicas_padded replicas;
         char buf[80];
 
+        if (i->ignore)
+            continue;
+
         ret = jset_validate_entries(c, &i->j, READ);
         if (ret)
             goto fsck_err;
@@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
         entries++;
     }
 
-    if (!list_empty(list)) {
-        i = list_last_entry(list, struct journal_replay, list);
+    bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+             keys, entries, *start_seq);
 
-        bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-             keys, entries, le64_to_cpu(i->j.seq));
-    }
+    if (*start_seq != *blacklist_seq)
+        bch_info(c, "dropped unflushed entries %llu-%llu",
+                 *blacklist_seq, *start_seq - 1);
 fsck_err:
     return ret;
 }
@@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl)
     j->seq_ondisk = seq;
     if (err && (!j->err_seq || seq < j->err_seq))
         j->err_seq = seq;
-    j->last_seq_ondisk = last_seq;
-    bch2_journal_space_available(j);
+
+    if (!w->noflush) {
+        j->flushed_seq_ondisk = seq;
+        j->last_seq_ondisk = last_seq;
+        bch2_journal_space_available(j);
+    }
 
     /*
      * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl)
 
     j->write_start_time = local_clock();
 
+    spin_lock(&j->lock);
+    if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+        !w->must_flush &&
+        (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+        test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+        w->noflush = true;
+        SET_JSET_NO_FLUSH(jset, true);
+        jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+        j->nr_noflush_writes++;
+    } else {
+        j->last_flush_write = jiffies;
+        j->nr_flush_writes++;
+    }
+    spin_unlock(&j->lock);
+
     /*
      * New btree roots are set by journalling them; when the journal entry
      * gets written we have to propagate them to c->btree_roots
@@ -1183,11 +1307,12 @@ retry_alloc:
                  sectors);
 
         bio = ca->journal.bio;
-        bio_reset(bio, ca->disk_sb.bdev,
-              REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+        bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
         bio->bi_iter.bi_sector = ptr->offset;
         bio->bi_end_io         = journal_write_endio;
         bio->bi_private        = ca;
+        if (!JSET_NO_FLUSH(jset))
+            bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
         bch2_bio_map(bio, jset, sectors << 9);
 
         trace_journal_write(bio);
@@ -1196,18 +1321,19 @@ retry_alloc:
         ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
     }
 
-    for_each_rw_member(ca, c, i)
-        if (journal_flushes_device(ca) &&
-            !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-            percpu_ref_get(&ca->io_ref);
-
-            bio = ca->journal.bio;
-            bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
-            bio->bi_end_io  = journal_write_endio;
-            bio->bi_private = ca;
-            closure_bio_submit(bio, cl);
-        }
-
+    if (!JSET_NO_FLUSH(jset)) {
+        for_each_rw_member(ca, c, i)
+            if (journal_flushes_device(ca) &&
+                !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+                percpu_ref_get(&ca->io_ref);
+
+                bio = ca->journal.bio;
+                bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+                bio->bi_end_io  = journal_write_endio;
+                bio->bi_private = ca;
+                closure_bio_submit(bio, cl);
+            }
+    }
 no_io:
     bch2_bucket_seq_cleanup(c);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6958ee0f8cf2..6b4c80968f52 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -11,6 +11,7 @@ struct journal_replay {
     struct bch_devs_list devs;
     /* checksum error, but we may want to try using it anyways: */
     bool                 bad;
+    bool                 ignore;
     /* must be last: */
     struct jset          j;
 };
@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
     for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
         vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index c6267284a028..a3d5405991b9 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j)
 {
     struct bch_fs *c = container_of(j, struct bch_fs, journal);
     struct bch_dev *ca;
-    unsigned clean;
+    unsigned clean, clean_ondisk, total;
     unsigned overhead, u64s_remaining = 0;
     unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
                                   j->buf[1].buf_size >> 9);
@@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j)
     for (i = 0; i < journal_space_nr; i++)
         j->space[i] = __journal_space_available(j, nr_devs_want, i);
 
+    clean_ondisk = j->space[journal_space_clean_ondisk].total;
     clean        = j->space[journal_space_clean].total;
+    total        = j->space[journal_space_total].total;
 
     if (!j->space[journal_space_discarded].next_entry)
         ret = cur_entry_journal_full;
     else if (!fifo_free(&j->pin))
         ret = cur_entry_journal_pin_full;
 
+    if ((clean - clean_ondisk <= total / 8) &&
+        (clean_ondisk * 2 > clean))
+        set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+    else
+        clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
     overhead = DIV_ROUND_UP(clean, max_entry_size) *
         journal_entry_overhead(j);
     u64s_remaining = clean << 6;
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index d0f1bbf8f6a7..e1b63f3879f4 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -118,7 +118,7 @@ out_write_sb:
 out:
     mutex_unlock(&c->sb_lock);
 
-    return ret;
+    return ret ?: bch2_blacklist_table_initialize(c);
 }
 
 static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
     struct journal_seq_blacklist_table *t;
     unsigned i, nr = blacklist_nr_entries(bl);
 
-    BUG_ON(c->journal_seq_blacklist_table);
-
     if (!bl)
         return 0;
 
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
             journal_seq_blacklist_table_cmp,
             NULL);
 
+    kfree(c->journal_seq_blacklist_table);
     c->journal_seq_blacklist_table = t;
     return 0;
 }
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 6b525dc6ab7c..cf9675310f2b 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -29,6 +29,8 @@ struct journal_buf {
     unsigned      disk_sectors;  /* maximum size entry could have been, if
                                     buf_size was bigger */
     unsigned      u64s_reserved;
+    bool          noflush;       /* write has already been kicked off, and was noflush */
+    bool          must_flush;    /* something wants a flush */
 
     /* bloom filter: */
     unsigned long has_inode[1024 / sizeof(unsigned long)];
 };
@@ -146,6 +148,7 @@ enum {
     JOURNAL_RECLAIM_STARTED,
     JOURNAL_NEED_WRITE,
     JOURNAL_MAY_GET_UNRESERVED,
+    JOURNAL_MAY_SKIP_FLUSH,
 };
 
 /* Embedded in struct bch_fs */
@@ -203,6 +206,7 @@ struct journal {
 
     /* seq, last_seq from the most recent journal entry successfully written */
     u64 seq_ondisk;
+    u64 flushed_seq_ondisk;
     u64 last_seq_ondisk;
     u64 err_seq;
     u64 last_empty_seq;
@@ -252,11 +256,15 @@ struct journal {
 
     unsigned      write_delay_ms;
     unsigned      reclaim_delay_ms;
+    unsigned long last_flush_write;
 
     u64 res_get_blocked_start;
     u64 need_write_time;
     u64 write_start_time;
 
+    u64 nr_flush_writes;
+    u64 nr_noflush_writes;
+
     struct bch2_time_stats *write_time;
     struct bch2_time_stats *delay_time;
     struct bch2_time_stats *blocked_time;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 7ad5b8234747..ecd51d45743a 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 {
-    struct journal_replay *p;
+    struct journal_replay *i;
     struct jset_entry *entry;
     struct bkey_i *k, *_n;
     struct journal_keys keys = { NULL };
@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
     if (list_empty(journal_entries))
         return keys;
 
-    keys.journal_seq_base =
-        le64_to_cpu(list_last_entry(journal_entries,
-                struct journal_replay, list)->j.last_seq);
-
-    list_for_each_entry(p, journal_entries, list) {
-        if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+    list_for_each_entry(i, journal_entries, list) {
+        if (i->ignore)
             continue;
 
-        for_each_jset_key(k, _n, entry, &p->j)
+        if (!keys.journal_seq_base)
+            keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+        for_each_jset_key(k, _n, entry, &i->j)
             nr_keys++;
     }
 
-
     keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
     if (!keys.d)
         goto err;
 
-    list_for_each_entry(p, journal_entries, list) {
-        if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+    list_for_each_entry(i, journal_entries, list) {
+        if (i->ignore)
             continue;
 
-        for_each_jset_key(k, _n, entry, &p->j)
+        BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+        for_each_jset_key(k, _n, entry, &i->j)
             keys.d[keys.nr++] = (struct journal_key) {
                 .btree_id       = entry->btree_id,
                 .level          = entry->level,
                 .k              = k,
-                .journal_seq    = le64_to_cpu(p->j.seq) -
+                .journal_seq    = le64_to_cpu(i->j.seq) -
                     keys.journal_seq_base,
-                .journal_offset = k->_data - p->j._data,
+                .journal_offset = k->_data - i->j._data,
             };
     }
 
@@ -643,46 +643,6 @@ err:
     return ret;
 }
 
-static bool journal_empty(struct list_head *journal)
-{
-    return list_empty(journal) ||
-        journal_entry_empty(&list_last_entry(journal,
-                    struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
-                          struct list_head *journal)
-{
-    struct journal_replay *i =
-        list_last_entry(journal, struct journal_replay, list);
-    u64 start_seq = le64_to_cpu(i->j.last_seq);
-    u64 end_seq   = le64_to_cpu(i->j.seq);
-    u64 seq       = start_seq;
-    int ret = 0;
-
-    list_for_each_entry(i, journal, list) {
-        if (le64_to_cpu(i->j.seq) < start_seq)
-            continue;
-
-        fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
-            "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-            seq, le64_to_cpu(i->j.seq) - 1,
-            start_seq, end_seq);
-
-        seq = le64_to_cpu(i->j.seq);
-
-        fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
-                "found blacklisted journal entry %llu", seq);
-
-        do {
-            seq++;
-        } while (bch2_journal_seq_is_blacklisted(c, seq, false));
-    }
-fsck_err:
-    return ret;
-}
-
 /* journal replay early: */
 
 static int journal_replay_entry_early(struct bch_fs *c,
@@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c,
                    struct bch_sb_field_clean *clean,
                    struct list_head *journal)
 {
+    struct journal_replay *i;
     struct jset_entry *entry;
     int ret;
 
@@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c,
                 return ret;
         }
     } else {
-        struct journal_replay *i =
-            list_last_entry(journal, struct journal_replay, list);
+        list_for_each_entry(i, journal, list) {
+            if (i->ignore)
+                continue;
 
-        c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-        c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+            c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+            c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
 
-        list_for_each_entry(i, journal, list)
             vstruct_for_each(&i->j, entry) {
                 ret = journal_replay_entry_early(c, entry);
                 if (ret)
                     return ret;
             }
+        }
     }
 
     bch2_fs_usage_initialize(c);
@@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c,
     struct bch_sb_field_clean *clean = *cleanp;
     int ret = 0;
 
-    if (!c->sb.clean || !j)
-        return 0;
-
     if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
             "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
             le64_to_cpu(clean->journal_seq),
@@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 {
     const char *err = "cannot allocate memory";
     struct bch_sb_field_clean *clean = NULL;
-    u64 journal_seq;
+    struct jset *last_journal_entry = NULL;
+    u64 blacklist_seq, journal_seq;
     bool write_sb = false, need_write_alloc = false;
     int ret;
 
@@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c)
         set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
     }
 
+    ret = bch2_blacklist_table_initialize(c);
+    if (ret) {
+        bch_err(c, "error initializing blacklist table");
+        goto err;
+    }
+
     if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-        struct jset *j;
+        struct journal_replay *i;
 
-        ret = bch2_journal_read(c, &c->journal_entries);
+        ret = bch2_journal_read(c, &c->journal_entries,
+                    &blacklist_seq, &journal_seq);
         if (ret)
             goto err;
 
-        if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+        list_for_each_entry_reverse(i, &c->journal_entries, list)
+            if (!i->ignore) {
+                last_journal_entry = &i->j;
+                break;
+            }
+
+        if (mustfix_fsck_err_on(c->sb.clean &&
+                    last_journal_entry &&
+                    !journal_entry_empty(last_journal_entry), c,
                 "filesystem marked clean but journal not empty")) {
             c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
             SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
             c->sb.clean = false;
         }
 
-        if (!c->sb.clean && list_empty(&c->journal_entries)) {
-            bch_err(c, "no journal entries found");
-            ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
-            goto err;
+        if (!last_journal_entry) {
+            fsck_err_on(!c->sb.clean, c, "no journal entries found");
+            goto use_clean;
         }
 
         c->journal_keys = journal_keys_sort(&c->journal_entries);
@@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c)
             goto err;
         }
 
-        j = &list_last_entry(&c->journal_entries,
-                     struct journal_replay, list)->j;
-
-        ret = verify_superblock_clean(c, &clean, j);
-        if (ret)
+        if (c->sb.clean && last_journal_entry) {
+            ret = verify_superblock_clean(c, &clean,
+                              last_journal_entry);
+            if (ret)
+                goto err;
+        }
+    } else {
+use_clean:
+        if (!clean) {
+            bch_err(c, "no superblock clean section found");
+            ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
             goto err;
-
-        journal_seq = le64_to_cpu(j->seq) + 1;
-    } else {
-        journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+        }
+        blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
     }
 
     if (!c->sb.clean &&
@@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
     if (ret)
         goto err;
 
-    if (!c->sb.clean) {
+    /*
+     * After an unclean shutdown, skip the next few journal sequence
+     * numbers as they may have been referenced by btree writes that
+     * happened before their corresponding journal writes - those btree
+     * writes need to be ignored, by skipping and blacklisting the next few
+     * journal sequence numbers:
+     */
+    if (!c->sb.clean)
+        journal_seq += 8;
+
+    if (blacklist_seq != journal_seq) {
         ret = bch2_journal_seq_blacklist_add(c,
-                             journal_seq,
-                             journal_seq + 8);
+                    blacklist_seq, journal_seq);
         if (ret) {
             bch_err(c, "error creating new journal seq blacklist entry");
             goto err;
         }
-
-        journal_seq += 8;
-
-        /*
-         * The superblock needs to be written before we do any btree
-         * node writes: it will be in the read_write() path
-         */
-    }
-
-    ret = bch2_blacklist_table_initialize(c);
-
-    if (!list_empty(&c->journal_entries)) {
-        ret = verify_journal_entries_not_blacklisted_or_missing(c,
-                            &c->journal_entries);
-        if (ret)
-            goto err;
     }
 
     ret = bch2_fs_journal_start(&c->journal, journal_seq,
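
On the read side, the reworked bch2_journal_read() above boils down to: walk the entries newest to oldest, drop (and later blacklist) unflushed entries until the most recent flush entry is found, then discard everything older than that entry's last_seq. A minimal sketch of that rule over a plain array, assuming an entry_sketch type standing in for struct journal_replay and a no_flush flag standing in for JSET_NO_FLUSH() - not the actual kernel code:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct entry_sketch {
        uint64_t seq;       /* journal sequence number */
        uint64_t last_seq;  /* oldest seq this entry still pins */
        bool     no_flush;  /* JSET_NO_FLUSH was set on this entry */
        bool     ignore;    /* excluded from replay */
    };

    /*
     * Walk newest to oldest: entries newer than the most recent flush
     * entry were never made durable, so they are dropped (and later
     * blacklisted); everything older than that entry's last_seq is no
     * longer needed. Returns the seq replay starts from, 0 if none.
     */
    static uint64_t trim_for_replay(struct entry_sketch *e, size_t nr)
    {
        uint64_t last_seq = 0;
        size_t i = nr;

        while (i--) {
            if (!e[i].no_flush) {
                last_seq = e[i].last_seq;
                break;
            }
            e[i].ignore = true; /* unflushed tail: blacklist */
        }

        for (i = 0; i < nr; i++)
            if (e[i].seq < last_seq)
                e[i].ignore = true; /* older than the replay range */

        return last_seq;
    }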