-rw-r--r--   fs/bcachefs/bcachefs_format.h       |   7
-rw-r--r--   fs/bcachefs/journal.c               |  54
-rw-r--r--   fs/bcachefs/journal.h               |   2
-rw-r--r--   fs/bcachefs/journal_io.c            | 208
-rw-r--r--   fs/bcachefs/journal_io.h            |   3
-rw-r--r--   fs/bcachefs/journal_reclaim.c       |  10
-rw-r--r--   fs/bcachefs/journal_seq_blacklist.c |   5
-rw-r--r--   fs/bcachefs/journal_types.h         |   8
-rw-r--r--   fs/bcachefs/recovery.c              | 166
9 files changed, 312 insertions, 151 deletions
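
In short, this patch adds noflush journal writes: when the journal_no_flush feature is enabled, most journal writes go out without REQ_PREFLUSH|REQ_FUA, and a real flush write happens only periodically or when something explicitly depends on one. The standalone C sketch below condenses the write-side decision the patch adds to bch2_journal_write(); struct journal_sketch and the helper are illustrative stand-ins for the relevant struct journal state, not kernel API:

    #include <stdbool.h>
    #include <stdint.h>

    /* Condensed model of the state bch2_journal_write() consults: */
    struct journal_sketch {
        bool     feature_journal_no_flush; /* BCH_FEATURE_journal_no_flush enabled */
        bool     may_skip_flush;           /* JOURNAL_MAY_SKIP_FLUSH, maintained by reclaim */
        bool     must_flush;               /* someone waits on bch2_journal_flush_seq() */
        uint64_t now;                      /* jiffies */
        uint64_t last_flush_write;         /* jiffies of the last flush write */
        uint64_t write_delay;              /* msecs_to_jiffies(j->write_delay_ms) */
    };

    /*
     * Mirrors the new test in bch2_journal_write(): a write may omit
     * REQ_PREFLUSH|REQ_FUA only if the feature bit is set, nobody is
     * explicitly waiting on a flush, a flush write happened recently
     * enough, and reclaim reports enough of the journal already flushed.
     */
    static bool journal_write_can_skip_flush(const struct journal_sketch *j)
    {
        return j->feature_journal_no_flush &&
               !j->must_flush &&
               j->now - j->last_flush_write < j->write_delay &&
               j->may_skip_flush;
    }

On replay, entries written this way carry JSET_NO_FLUSH and an unchanged last_seq, which is why the recovery path in the hunks below must find the newest flush entry before trusting last_seq.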
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f072e865e43f..7df2bc7ecd4f 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1336,14 +1336,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
     x(extents_above_btree_updates, 12)  \
     x(btree_updates_journalled,    13)  \
     x(reflink_inline_data,         14)  \
-    x(new_varint,                  15)
+    x(new_varint,                  15)  \
+    x(journal_no_flush,            16)
 
 #define BCH_SB_FEATURES_ALL                             \
     ((1ULL << BCH_FEATURE_new_siphash)|                 \
      (1ULL << BCH_FEATURE_new_extent_overwrite)|        \
      (1ULL << BCH_FEATURE_btree_ptr_v2)|                \
      (1ULL << BCH_FEATURE_extents_above_btree_updates)| \
-     (1ULL << BCH_FEATURE_new_varint))\
+     (1ULL << BCH_FEATURE_new_varint)|                  \
+     (1ULL << BCH_FEATURE_journal_no_flush))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1582,6 +1584,7 @@ struct jset {
 
 LE32_BITMASK(JSET_CSUM_TYPE,  struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH,   struct jset, flags, 5, 6);
 
 #define BCH_JOURNAL_BUCKETS_MIN 8
 
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 3bbb23d7739a..31168754d6b8 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -79,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
     struct journal_buf *buf = journal_cur_buf(j);
 
     bkey_extent_init(&buf->key);
+    buf->noflush    = false;
+    buf->must_flush = false;
 
     memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
@@ -574,7 +576,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
     struct journal_buf *buf;
     int ret = 0;
 
-    if (seq <= j->seq_ondisk)
+    if (seq <= j->flushed_seq_ondisk)
         return 1;
 
     spin_lock(&j->lock);
@@ -585,16 +587,53 @@
         goto out;
     }
 
-    if (seq <= j->seq_ondisk) {
+    if (seq <= j->flushed_seq_ondisk) {
         ret = 1;
         goto out;
     }
 
-    if (parent &&
-        (buf = journal_seq_to_buf(j, seq)))
-        if (!closure_wait(&buf->wait, parent))
+    /* if seq was written, but not flushed - flush a newer one instead */
+    seq = max(seq, last_unwritten_seq(j));
+
+recheck_need_open:
+    if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+        struct journal_res res = { 0 };
+
+        spin_unlock(&j->lock);
+
+        ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+        if (ret)
+            return ret;
+
+        seq = res.seq;
+        buf = j->buf + (seq & JOURNAL_BUF_MASK);
+        buf->must_flush = true;
+        set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+        if (parent && !closure_wait(&buf->wait, parent))
             BUG();
 
+        bch2_journal_res_put(j, &res);
+
+        spin_lock(&j->lock);
+        goto want_write;
+    }
+
+    /*
+     * if write was kicked off without a flush, flush the next sequence
+     * number instead
+     */
+    buf = journal_seq_to_buf(j, seq);
+    if (buf->noflush) {
+        seq++;
+        goto recheck_need_open;
+    }
+
+    buf->must_flush = true;
+
+    if (parent && !closure_wait(&buf->wait, parent))
+        BUG();
+want_write:
     if (seq == journal_cur_seq(j))
         journal_entry_want_write(j);
 out:
@@ -979,6 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
     spin_lock(&j->lock);
 
     set_bit(JOURNAL_STARTED, &j->flags);
+    j->last_flush_write = jiffies;
 
     journal_pin_new_entry(j, 1);
 
@@ -1116,6 +1156,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
            "last_seq:\t\t%llu\n"
            "last_seq_ondisk:\t%llu\n"
            "prereserved:\t\t%u/%u\n"
+           "nr flush writes:\t%llu\n"
+           "nr noflush writes:\t%llu\n"
            "nr direct reclaim:\t%llu\n"
            "nr background reclaim:\t%llu\n"
            "current entry sectors:\t%u\n"
@@ -1127,6 +1169,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
            j->last_seq_ondisk,
            j->prereserved.reserved,
            j->prereserved.remaining,
+           j->nr_flush_writes,
+           j->nr_noflush_writes,
            j->nr_direct_reclaim,
            j->nr_background_reclaim,
            j->cur_entry_sectors,
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 1b6175cd6f1b..2c0014c3c02f 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -136,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
 
 static inline u64 journal_cur_seq(struct journal *j)
 {
-    BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+    EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
     return j->pin.back - 1;
 }
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 1aeeb58d3c2a..26556bb381b2 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -10,9 +10,26 @@
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 
 #include "trace.h"
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+    list_del(&i->list);
+    kvpfree(i, offsetof(struct journal_replay, j) +
+        vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+    i->ignore = true;
+
+    if (!c->opts.read_entire_journal)
+        __journal_replay_free(i);
+}
+
 struct journal_list {
     struct closure      cl;
     struct mutex        lock;
@@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
     struct bch_devs_list devs = { .nr = 0 };
     struct list_head *where;
     size_t bytes = vstruct_bytes(j);
-    __le64 last_seq;
+    u64 last_seq = 0;
     int ret;
 
-    last_seq = !list_empty(jlist->head)
-        ? list_last_entry(jlist->head, struct journal_replay,
-                  list)->j.last_seq
-        : 0;
-
-    if (!c->opts.read_entire_journal) {
-        /* Is this entry older than the range we need? */
-        if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-            ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-            goto out;
+    list_for_each_entry_reverse(i, jlist->head, list) {
+        if (!JSET_NO_FLUSH(&i->j)) {
+            last_seq = le64_to_cpu(i->j.last_seq);
+            break;
         }
+    }
 
-        /* Drop entries we don't need anymore */
+    /* Is this entry older than the range we need? */
+    if (!c->opts.read_entire_journal &&
+        le64_to_cpu(j->seq) < last_seq) {
+        ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+        goto out;
+    }
+
+    /* Drop entries we don't need anymore */
+    if (!JSET_NO_FLUSH(j)) {
         list_for_each_entry_safe(i, pos, jlist->head, list) {
             if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                 break;
-            list_del(&i->list);
-            kvpfree(i, offsetof(struct journal_replay, j) +
-                vstruct_bytes(&i->j));
+            journal_replay_free(c, i);
         }
     }
 
@@ -80,9 +98,7 @@ add:
     if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
         if (i->bad) {
             devs = i->devs;
-            list_del(&i->list);
-            kvpfree(i, offsetof(struct journal_replay, j) +
-                vstruct_bytes(&i->j));
+            __journal_replay_free(i);
         } else if (bad) {
             goto found;
         } else {
@@ -104,6 +120,7 @@ add:
     list_add(&i->list, where);
     i->devs   = devs;
     i->bad    = bad;
+    i->ignore = false;
     unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
 found:
     if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -698,14 +715,16 @@ err:
     goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+                      u64 *blacklist_seq, u64 *start_seq)
 {
     struct journal_list jlist;
-    struct journal_replay *i;
+    struct journal_replay *i, *t;
     struct bch_dev *ca;
     unsigned iter;
     size_t keys = 0, entries = 0;
     bool degraded = false;
+    u64 seq, last_seq = 0;
     int ret = 0;
 
     closure_init_stack(&jlist.cl);
@@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
     if (jlist.ret)
         return jlist.ret;
 
+    if (list_empty(list)) {
+        bch_info(c, "journal read done, but no entries found");
+        return 0;
+    }
+
+    i = list_last_entry(list, struct journal_replay, list);
+    *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+    /*
+     * Find most recent flush entry, and ignore newer non flush entries -
+     * those entries will be blacklisted:
+     */
+    list_for_each_entry_safe_reverse(i, t, list, list) {
+        if (i->ignore)
+            continue;
+
+        if (!JSET_NO_FLUSH(&i->j)) {
+            last_seq       = le64_to_cpu(i->j.last_seq);
+            *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+            break;
+        }
+
+        journal_replay_free(c, i);
+    }
+
+    if (!last_seq) {
+        fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+        return -1;
+    }
+
+    /* Drop blacklisted entries and entries older than last_seq: */
+    list_for_each_entry_safe(i, t, list, list) {
+        if (i->ignore)
+            continue;
+
+        seq = le64_to_cpu(i->j.seq);
+        if (seq < last_seq) {
+            journal_replay_free(c, i);
+            continue;
+        }
+
+        if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+            fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+                        "found blacklisted journal entry %llu", seq);
+
+            journal_replay_free(c, i);
+        }
+    }
+
+    /* Check for missing entries: */
+    seq = last_seq;
+    list_for_each_entry(i, list, list) {
+        if (i->ignore)
+            continue;
+
+        BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+        while (seq < le64_to_cpu(i->j.seq)) {
+            u64 missing_start, missing_end;
+
+            while (seq < le64_to_cpu(i->j.seq) &&
+                   bch2_journal_seq_is_blacklisted(c, seq, false))
+                seq++;
+
+            if (seq == le64_to_cpu(i->j.seq))
+                break;
+
+            missing_start = seq;
+
+            while (seq < le64_to_cpu(i->j.seq) &&
+                   !bch2_journal_seq_is_blacklisted(c, seq, false))
+                seq++;
+
+            missing_end = seq - 1;
+            fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+                     missing_start, missing_end,
+                     last_seq, *blacklist_seq - 1);
+        }
+
+        seq++;
+    }
+
     list_for_each_entry(i, list, list) {
         struct jset_entry *entry;
         struct bkey_i *k, *_n;
         struct bch_replicas_padded replicas;
         char buf[80];
 
+        if (i->ignore)
+            continue;
+
         ret = jset_validate_entries(c, &i->j, READ);
         if (ret)
             goto fsck_err;
@@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
         entries++;
     }
 
-    if (!list_empty(list)) {
-        i = list_last_entry(list, struct journal_replay, list);
+    bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+             keys, entries, *start_seq);
 
-        bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-             keys, entries, le64_to_cpu(i->j.seq));
-    }
+    if (*start_seq != *blacklist_seq)
+        bch_info(c, "dropped unflushed entries %llu-%llu",
+                 *blacklist_seq, *start_seq - 1);
 fsck_err:
     return ret;
 }
@@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl)
     j->seq_ondisk = seq;
     if (err && (!j->err_seq || seq < j->err_seq))
         j->err_seq = seq;
-    j->last_seq_ondisk = last_seq;
-    bch2_journal_space_available(j);
+
+    if (!w->noflush) {
+        j->flushed_seq_ondisk = seq;
+        j->last_seq_ondisk = last_seq;
+        bch2_journal_space_available(j);
+    }
 
     /*
      * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl)
 
     j->write_start_time = local_clock();
 
+    spin_lock(&j->lock);
+    if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+        !w->must_flush &&
+        (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+        test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+        w->noflush = true;
+        SET_JSET_NO_FLUSH(jset, true);
+        jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+        j->nr_noflush_writes++;
+    } else {
+        j->last_flush_write = jiffies;
+        j->nr_flush_writes++;
+    }
+    spin_unlock(&j->lock);
+
     /*
      * New btree roots are set by journalling them; when the journal entry
      * gets written we have to propagate them to c->btree_roots
@@ -1183,11 +1307,12 @@ retry_alloc:
                  sectors);
 
         bio = ca->journal.bio;
-        bio_reset(bio, ca->disk_sb.bdev,
-              REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+        bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
         bio->bi_iter.bi_sector = ptr->offset;
         bio->bi_end_io         = journal_write_endio;
         bio->bi_private        = ca;
+        if (!JSET_NO_FLUSH(jset))
+            bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
         bch2_bio_map(bio, jset, sectors << 9);
 
         trace_journal_write(bio);
@@ -1196,18 +1321,19 @@ retry_alloc:
         ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
     }
 
-    for_each_rw_member(ca, c, i)
-        if (journal_flushes_device(ca) &&
-            !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-            percpu_ref_get(&ca->io_ref);
-
-            bio = ca->journal.bio;
-            bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
-            bio->bi_end_io  = journal_write_endio;
-            bio->bi_private = ca;
-            closure_bio_submit(bio, cl);
-        }
-
+    if (!JSET_NO_FLUSH(jset)) {
+        for_each_rw_member(ca, c, i)
+            if (journal_flushes_device(ca) &&
+                !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+                percpu_ref_get(&ca->io_ref);
+
+                bio = ca->journal.bio;
+                bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+                bio->bi_end_io  = journal_write_endio;
+                bio->bi_private = ca;
+                closure_bio_submit(bio, cl);
+            }
+    }
 no_io:
     bch2_bucket_seq_cleanup(c);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6958ee0f8cf2..6b4c80968f52 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -11,6 +11,7 @@ struct journal_replay {
     struct bch_devs_list devs;
     /* checksum error, but we may want to try using it anyways: */
     bool                 bad;
+    bool                 ignore;
     /* must be last: */
     struct jset          j;
 };
@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
     for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
         vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index c6267284a028..a3d5405991b9 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j)
 {
     struct bch_fs *c = container_of(j, struct bch_fs, journal);
     struct bch_dev *ca;
-    unsigned clean;
+    unsigned clean, clean_ondisk, total;
     unsigned overhead, u64s_remaining = 0;
     unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
                                   j->buf[1].buf_size >> 9);
@@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j)
     for (i = 0; i < journal_space_nr; i++)
         j->space[i] = __journal_space_available(j, nr_devs_want, i);
 
+    clean_ondisk = j->space[journal_space_clean_ondisk].total;
     clean        = j->space[journal_space_clean].total;
+    total        = j->space[journal_space_total].total;
 
     if (!j->space[journal_space_discarded].next_entry)
         ret = cur_entry_journal_full;
     else if (!fifo_free(&j->pin))
         ret = cur_entry_journal_pin_full;
 
+    if ((clean - clean_ondisk <= total / 8) &&
+        (clean_ondisk * 2 > clean))
+        set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+    else
+        clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
     overhead = DIV_ROUND_UP(clean, max_entry_size) *
         journal_entry_overhead(j);
     u64s_remaining = clean << 6;
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index d0f1bbf8f6a7..e1b63f3879f4 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -118,7 +118,7 @@ out_write_sb:
 out:
     mutex_unlock(&c->sb_lock);
 
-    return ret;
+    return ret ?: bch2_blacklist_table_initialize(c);
 }
 
 static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
     struct journal_seq_blacklist_table *t;
     unsigned i, nr = blacklist_nr_entries(bl);
 
-    BUG_ON(c->journal_seq_blacklist_table);
-
     if (!bl)
         return 0;
 
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
             journal_seq_blacklist_table_cmp,
             NULL);
 
+    kfree(c->journal_seq_blacklist_table);
     c->journal_seq_blacklist_table = t;
     return 0;
 }
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 6b525dc6ab7c..cf9675310f2b 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -29,6 +29,8 @@ struct journal_buf {
     unsigned      disk_sectors;  /* maximum size entry could have been, if
                                     buf_size was bigger */
     unsigned      u64s_reserved;
+    bool          noflush;       /* write has already been kicked off, and was noflush */
+    bool          must_flush;    /* something wants a flush */
 
     /* bloom filter: */
     unsigned long has_inode[1024 / sizeof(unsigned long)];
 };
@@ -146,6 +148,7 @@ enum {
     JOURNAL_RECLAIM_STARTED,
     JOURNAL_NEED_WRITE,
     JOURNAL_MAY_GET_UNRESERVED,
+    JOURNAL_MAY_SKIP_FLUSH,
 };
 
 /* Embedded in struct bch_fs */
@@ -203,6 +206,7 @@ struct journal {
 
     /* seq, last_seq from the most recent journal entry successfully written */
     u64 seq_ondisk;
+    u64 flushed_seq_ondisk;
     u64 last_seq_ondisk;
     u64 err_seq;
     u64 last_empty_seq;
@@ -252,11 +256,15 @@ struct journal {
 
     unsigned      write_delay_ms;
     unsigned      reclaim_delay_ms;
+    unsigned long last_flush_write;
 
     u64 res_get_blocked_start;
     u64 need_write_time;
     u64 write_start_time;
 
+    u64 nr_flush_writes;
+    u64 nr_noflush_writes;
+
     struct bch2_time_stats *write_time;
     struct bch2_time_stats *delay_time;
     struct bch2_time_stats *blocked_time;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 7ad5b8234747..ecd51d45743a 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 {
-    struct journal_replay *p;
+    struct journal_replay *i;
     struct jset_entry *entry;
     struct bkey_i *k, *_n;
     struct journal_keys keys = { NULL };
@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
     if (list_empty(journal_entries))
         return keys;
 
-    keys.journal_seq_base =
-        le64_to_cpu(list_last_entry(journal_entries,
-                struct journal_replay, list)->j.last_seq);
-
-    list_for_each_entry(p, journal_entries, list) {
-        if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+    list_for_each_entry(i, journal_entries, list) {
+        if (i->ignore)
             continue;
 
-        for_each_jset_key(k, _n, entry, &p->j)
+        if (!keys.journal_seq_base)
+            keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+        for_each_jset_key(k, _n, entry, &i->j)
             nr_keys++;
     }
 
-
     keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
     if (!keys.d)
         goto err;
 
-    list_for_each_entry(p, journal_entries, list) {
-        if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+    list_for_each_entry(i, journal_entries, list) {
+        if (i->ignore)
             continue;
 
-        for_each_jset_key(k, _n, entry, &p->j)
+        BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+        for_each_jset_key(k, _n, entry, &i->j)
             keys.d[keys.nr++] = (struct journal_key) {
                 .btree_id       = entry->btree_id,
                 .level          = entry->level,
                 .k              = k,
-                .journal_seq    = le64_to_cpu(p->j.seq) -
+                .journal_seq    = le64_to_cpu(i->j.seq) -
                     keys.journal_seq_base,
-                .journal_offset = k->_data - p->j._data,
+                .journal_offset = k->_data - i->j._data,
             };
     }
 
@@ -643,46 +643,6 @@ err:
     return ret;
 }
 
-static bool journal_empty(struct list_head *journal)
-{
-    return list_empty(journal) ||
-        journal_entry_empty(&list_last_entry(journal,
-                    struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
-                          struct list_head *journal)
-{
-    struct journal_replay *i =
-        list_last_entry(journal, struct journal_replay, list);
-    u64 start_seq = le64_to_cpu(i->j.last_seq);
-    u64 end_seq   = le64_to_cpu(i->j.seq);
-    u64 seq       = start_seq;
-    int ret = 0;
-
-    list_for_each_entry(i, journal, list) {
-        if (le64_to_cpu(i->j.seq) < start_seq)
-            continue;
-
-        fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
-            "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-            seq, le64_to_cpu(i->j.seq) - 1,
-            start_seq, end_seq);
-
-        seq = le64_to_cpu(i->j.seq);
-
-        fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
-                "found blacklisted journal entry %llu", seq);
-
-        do {
-            seq++;
-        } while (bch2_journal_seq_is_blacklisted(c, seq, false));
-    }
-fsck_err:
-    return ret;
-}
-
 /* journal replay early: */
 
 static int journal_replay_entry_early(struct bch_fs *c,
@@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c,
                    struct bch_sb_field_clean *clean,
                    struct list_head *journal)
 {
+    struct journal_replay *i;
     struct jset_entry *entry;
     int ret;
 
@@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c,
                 return ret;
         }
     } else {
-        struct journal_replay *i =
-            list_last_entry(journal, struct journal_replay, list);
+        list_for_each_entry(i, journal, list) {
+            if (i->ignore)
+                continue;
 
-        c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-        c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+            c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+            c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
 
-        list_for_each_entry(i, journal, list)
             vstruct_for_each(&i->j, entry) {
                 ret = journal_replay_entry_early(c, entry);
                 if (ret)
                     return ret;
             }
+        }
     }
 
     bch2_fs_usage_initialize(c);
@@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c,
     struct bch_sb_field_clean *clean = *cleanp;
     int ret = 0;
 
-    if (!c->sb.clean || !j)
-        return 0;
-
     if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
             "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
             le64_to_cpu(clean->journal_seq),
@@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 {
     const char *err = "cannot allocate memory";
     struct bch_sb_field_clean *clean = NULL;
-    u64 journal_seq;
+    struct jset *last_journal_entry = NULL;
+    u64 blacklist_seq, journal_seq;
     bool write_sb = false, need_write_alloc = false;
     int ret;
 
@@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c)
         set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
     }
 
+    ret = bch2_blacklist_table_initialize(c);
+    if (ret) {
+        bch_err(c, "error initializing blacklist table");
+        goto err;
+    }
+
     if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-        struct jset *j;
+        struct journal_replay *i;
 
-        ret = bch2_journal_read(c, &c->journal_entries);
+        ret = bch2_journal_read(c, &c->journal_entries,
+                    &blacklist_seq, &journal_seq);
         if (ret)
             goto err;
 
-        if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+        list_for_each_entry_reverse(i, &c->journal_entries, list)
+            if (!i->ignore) {
+                last_journal_entry = &i->j;
+                break;
+            }
+
+        if (mustfix_fsck_err_on(c->sb.clean &&
+                    last_journal_entry &&
+                    !journal_entry_empty(last_journal_entry), c,
                 "filesystem marked clean but journal not empty")) {
             c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
             SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
             c->sb.clean = false;
         }
 
-        if (!c->sb.clean && list_empty(&c->journal_entries)) {
-            bch_err(c, "no journal entries found");
-            ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
-            goto err;
+        if (!last_journal_entry) {
+            fsck_err_on(!c->sb.clean, c, "no journal entries found");
+            goto use_clean;
         }
 
         c->journal_keys = journal_keys_sort(&c->journal_entries);
@@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c)
             goto err;
         }
 
-        j = &list_last_entry(&c->journal_entries,
-                     struct journal_replay, list)->j;
-
-        ret = verify_superblock_clean(c, &clean, j);
-        if (ret)
+        if (c->sb.clean && last_journal_entry) {
+            ret = verify_superblock_clean(c, &clean,
+                              last_journal_entry);
+            if (ret)
+                goto err;
+        }
+    } else {
+use_clean:
+        if (!clean) {
+            bch_err(c, "no superblock clean section found");
+            ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
             goto err;
-
-        journal_seq = le64_to_cpu(j->seq) + 1;
-    } else {
-        journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+        }
+        blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
     }
 
     if (!c->sb.clean &&
@@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
     if (ret)
         goto err;
 
-    if (!c->sb.clean) {
+    /*
+     * After an unclean shutdown, skip the next few journal sequence
+     * numbers as they may have been referenced by btree writes that
+     * happened before their corresponding journal writes - those btree
+     * writes need to be ignored, by skipping and blacklisting the next few
+     * journal sequence numbers:
+     */
+    if (!c->sb.clean)
+        journal_seq += 8;
+
+    if (blacklist_seq != journal_seq) {
         ret = bch2_journal_seq_blacklist_add(c,
-                             journal_seq,
-                             journal_seq + 8);
+                    blacklist_seq, journal_seq);
         if (ret) {
             bch_err(c, "error creating new journal seq blacklist entry");
             goto err;
         }
-
-        journal_seq += 8;
-
-        /*
-         * The superblock needs to be written before we do any btree
-         * node writes: it will be in the read_write() path
-         */
-    }
-
-    ret = bch2_blacklist_table_initialize(c);
-
-    if (!list_empty(&c->journal_entries)) {
-        ret = verify_journal_entries_not_blacklisted_or_missing(c,
-                            &c->journal_entries);
-        if (ret)
-            goto err;
     }
 
     ret = bch2_fs_journal_start(&c->journal, journal_seq,
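
On the read side, the reworked bch2_journal_read() above boils down to: walk the entries newest to oldest, drop (and later blacklist) unflushed entries until the most recent flush entry is found, then discard everything older than that entry's last_seq. A minimal sketch of that rule over a plain array, assuming an entry_sketch type standing in for struct journal_replay and a no_flush flag standing in for JSET_NO_FLUSH() - not the actual kernel code:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct entry_sketch {
        uint64_t seq;       /* journal sequence number */
        uint64_t last_seq;  /* oldest seq this entry still pins */
        bool     no_flush;  /* JSET_NO_FLUSH was set on this entry */
        bool     ignore;    /* excluded from replay */
    };

    /*
     * Walk newest to oldest: entries newer than the most recent flush
     * entry were never made durable, so they are dropped (and later
     * blacklisted); everything older than that entry's last_seq is no
     * longer needed. Returns the seq replay starts from, 0 if none.
     */
    static uint64_t trim_for_replay(struct entry_sketch *e, size_t nr)
    {
        uint64_t last_seq = 0;
        size_t i = nr;

        while (i--) {
            if (!e[i].no_flush) {
                last_seq = e[i].last_seq;
                break;
            }
            e[i].ignore = true; /* unflushed tail: blacklist */
        }

        for (i = 0; i < nr; i++)
            if (e[i].seq < last_seq)
                e[i].ignore = true; /* older than the replay range */

        return last_seq;
    }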