-rw-r--r--  fs/bcachefs/bcachefs_format.h        |   7
-rw-r--r--  fs/bcachefs/journal.c                |  54
-rw-r--r--  fs/bcachefs/journal.h                |   2
-rw-r--r--  fs/bcachefs/journal_io.c             | 208
-rw-r--r--  fs/bcachefs/journal_io.h             |   3
-rw-r--r--  fs/bcachefs/journal_reclaim.c        |  10
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c  |   5
-rw-r--r--  fs/bcachefs/journal_types.h          |   8
-rw-r--r--  fs/bcachefs/recovery.c               | 166
9 files changed, 312 insertions(+), 151 deletions(-)
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f072e865e43f..7df2bc7ecd4f 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1336,14 +1336,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
- x(new_varint, 15)
+ x(new_varint, 15) \
+ x(journal_no_flush, 16)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
- (1ULL << BCH_FEATURE_new_varint))\
+ (1ULL << BCH_FEATURE_new_varint)| \
+ (1ULL << BCH_FEATURE_journal_no_flush))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@@ -1582,6 +1584,7 @@ struct jset {
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
#define BCH_JOURNAL_BUCKETS_MIN 8
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 3bbb23d7739a..31168754d6b8 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -79,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
struct journal_buf *buf = journal_cur_buf(j);
bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
memset(buf->has_inode, 0, sizeof(buf->has_inode));
@@ -574,7 +576,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
struct journal_buf *buf;
int ret = 0;
- if (seq <= j->seq_ondisk)
+ if (seq <= j->flushed_seq_ondisk)
return 1;
spin_lock(&j->lock);
@@ -585,16 +587,53 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
goto out;
}
- if (seq <= j->seq_ondisk) {
+ if (seq <= j->flushed_seq_ondisk) {
ret = 1;
goto out;
}
- if (parent &&
- (buf = journal_seq_to_buf(j, seq)))
- if (!closure_wait(&buf->wait, parent))
+	/* if seq was written but not flushed, flush a newer one instead */
+ seq = max(seq, last_unwritten_seq(j));
+
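+	/*
+	 * The sequence number we want might be for the current, not yet open
+	 * journal entry: open it with an empty reservation, so there's an
+	 * entry to mark must_flush and wait on:
+	 */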
+recheck_need_open:
+ if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+ struct journal_res res = { 0 };
+
+ spin_unlock(&j->lock);
+
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
+
+ seq = res.seq;
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ if (parent && !closure_wait(&buf->wait, parent))
BUG();
+ bch2_journal_res_put(j, &res);
+
+ spin_lock(&j->lock);
+ goto want_write;
+ }
+
+ /*
+ * if write was kicked off without a flush, flush the next sequence
+ * number instead
+ */
+ buf = journal_seq_to_buf(j, seq);
+ if (buf->noflush) {
+ seq++;
+ goto recheck_need_open;
+ }
+
+ buf->must_flush = true;
+
+ if (parent && !closure_wait(&buf->wait, parent))
+ BUG();
+want_write:
if (seq == journal_cur_seq(j))
journal_entry_want_write(j);
out:
@@ -979,6 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
+ j->last_flush_write = jiffies;
journal_pin_new_entry(j, 1);
@@ -1116,6 +1156,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
+ "nr flush writes:\t%llu\n"
+ "nr noflush writes:\t%llu\n"
"nr direct reclaim:\t%llu\n"
"nr background reclaim:\t%llu\n"
"current entry sectors:\t%u\n"
@@ -1127,6 +1169,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
+ j->nr_flush_writes,
+ j->nr_noflush_writes,
j->nr_direct_reclaim,
j->nr_background_reclaim,
j->cur_entry_sectors,
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 1b6175cd6f1b..2c0014c3c02f 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -136,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
static inline u64 journal_cur_seq(struct journal *j)
{
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+ EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
return j->pin.back - 1;
}
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 1aeeb58d3c2a..26556bb381b2 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -10,9 +10,26 @@
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "trace.h"
+static void __journal_replay_free(struct journal_replay *i)
+{
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+ i->ignore = true;
+
+ if (!c->opts.read_entire_journal)
+ __journal_replay_free(i);
+}
+
struct journal_list {
struct closure cl;
struct mutex lock;
@@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct bch_devs_list devs = { .nr = 0 };
struct list_head *where;
size_t bytes = vstruct_bytes(j);
- __le64 last_seq;
+ u64 last_seq = 0;
int ret;
- last_seq = !list_empty(jlist->head)
- ? list_last_entry(jlist->head, struct journal_replay,
- list)->j.last_seq
- : 0;
-
- if (!c->opts.read_entire_journal) {
- /* Is this entry older than the range we need? */
- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
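+	/*
+	 * Take last_seq from the most recent flush entry: a noflush entry
+	 * just repeats the last_seq of the most recent flush write.
+	 */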
+ list_for_each_entry_reverse(i, jlist->head, list) {
+ if (!JSET_NO_FLUSH(&i->j)) {
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
}
+ }
- /* Drop entries we don't need anymore */
+ /* Is this entry older than the range we need? */
+ if (!c->opts.read_entire_journal &&
+ le64_to_cpu(j->seq) < last_seq) {
+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+ goto out;
+ }
+
+ /* Drop entries we don't need anymore */
+ if (!JSET_NO_FLUSH(j)) {
list_for_each_entry_safe(i, pos, jlist->head, list) {
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ journal_replay_free(c, i);
}
}
@@ -80,9 +98,7 @@ add:
if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
if (i->bad) {
devs = i->devs;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ __journal_replay_free(i);
} else if (bad) {
goto found;
} else {
@@ -104,6 +120,7 @@ add:
list_add(&i->list, where);
i->devs = devs;
i->bad = bad;
+ i->ignore = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
found:
if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -698,14 +715,16 @@ err:
goto out;
}
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+ u64 *blacklist_seq, u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i;
+ struct journal_replay *i, *t;
struct bch_dev *ca;
unsigned iter;
size_t keys = 0, entries = 0;
bool degraded = false;
+ u64 seq, last_seq = 0;
int ret = 0;
closure_init_stack(&jlist.cl);
@@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (jlist.ret)
return jlist.ret;
+ if (list_empty(list)) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
+ i = list_last_entry(list, struct journal_replay, list);
+ *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+ /*
+	 * Find most recent flush entry, and ignore newer non-flush entries -
+ * those entries will be blacklisted:
+ */
+ list_for_each_entry_safe_reverse(i, t, list, list) {
+ if (i->ignore)
+ continue;
+
+ if (!JSET_NO_FLUSH(&i->j)) {
+ last_seq = le64_to_cpu(i->j.last_seq);
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+ break;
+ }
+
+ journal_replay_free(c, i);
+ }
+
+ if (!last_seq) {
+ fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+ return -1;
+ }
+
+ /* Drop blacklisted entries and entries older than last_seq: */
+ list_for_each_entry_safe(i, t, list, list) {
+ if (i->ignore)
+ continue;
+
+ seq = le64_to_cpu(i->j.seq);
+ if (seq < last_seq) {
+ journal_replay_free(c, i);
+ continue;
+ }
+
+ if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+ fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+ "found blacklisted journal entry %llu", seq);
+
+ journal_replay_free(c, i);
+ }
+ }
+
+ /* Check for missing entries: */
+ seq = last_seq;
+ list_for_each_entry(i, list, list) {
+ if (i->ignore)
+ continue;
+
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+ while (seq < le64_to_cpu(i->j.seq)) {
+ u64 missing_start, missing_end;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
+
+ missing_start = seq;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ missing_end = seq - 1;
+ fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ missing_start, missing_end,
+ last_seq, *blacklist_seq - 1);
+ }
+
+ seq++;
+ }
+
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas;
char buf[80];
+ if (i->ignore)
+ continue;
+
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
entries++;
}
-	if (!list_empty(list)) {
-		i = list_last_entry(list, struct journal_replay, list);
-		bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-			 keys, entries, le64_to_cpu(i->j.seq));
-	}
+	bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+		 keys, entries, *start_seq);
+ if (*start_seq != *blacklist_seq)
+ bch_info(c, "dropped unflushed entries %llu-%llu",
+ *blacklist_seq, *start_seq - 1);
fsck_err:
return ret;
}
@@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl)
j->seq_ondisk = seq;
if (err && (!j->err_seq || seq < j->err_seq))
j->err_seq = seq;
- j->last_seq_ondisk = last_seq;
- bch2_journal_space_available(j);
+
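+	/*
+	 * Only flush writes advance flushed_seq_ondisk and last_seq_ondisk:
+	 * a noflush write may still be sitting in the device's cache.
+	 */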
+ if (!w->noflush) {
+ j->flushed_seq_ondisk = seq;
+ j->last_seq_ondisk = last_seq;
+ bch2_journal_space_available(j);
+ }
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl)
j->write_start_time = local_clock();
+ spin_lock(&j->lock);
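+	/*
+	 * Skip the flush only if the feature is enabled, nothing has
+	 * explicitly asked for a flush, the previous flush write was recent
+	 * enough, and reclaim says flushes may currently be skipped:
+	 */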
+ if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+ !w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(jset, true);
+ jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+ j->nr_noflush_writes++;
+ } else {
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ }
+ spin_unlock(&j->lock);
+
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
@@ -1183,11 +1307,12 @@ retry_alloc:
sectors);
bio = ca->journal.bio;
- bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
+ if (!JSET_NO_FLUSH(jset))
+ bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
bch2_bio_map(bio, jset, sectors << 9);
trace_journal_write(bio);
@@ -1196,18 +1321,19 @@ retry_alloc:
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
}
- for_each_rw_member(ca, c, i)
- if (journal_flushes_device(ca) &&
- !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
- percpu_ref_get(&ca->io_ref);
-
- bio = ca->journal.bio;
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
-
+ if (!JSET_NO_FLUSH(jset)) {
+ for_each_rw_member(ca, c, i)
+ if (journal_flushes_device(ca) &&
+ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+ percpu_ref_get(&ca->io_ref);
+
+ bio = ca->journal.bio;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+ }
no_io:
bch2_bucket_seq_cleanup(c);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6958ee0f8cf2..6b4c80968f52 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -11,6 +11,7 @@ struct journal_replay {
struct bch_devs_list devs;
/* checksum error, but we may want to try using it anyways: */
bool bad;
+ bool ignore;
/* must be last: */
struct jset j;
};
@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index c6267284a028..a3d5405991b9 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
- unsigned clean;
+ unsigned clean, clean_ondisk, total;
unsigned overhead, u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
@@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j)
for (i = 0; i < journal_space_nr; i++)
j->space[i] = __journal_space_available(j, nr_devs_want, i);
+ clean_ondisk = j->space[journal_space_clean_ondisk].total;
clean = j->space[journal_space_clean].total;
+ total = j->space[journal_space_total].total;
if (!j->space[journal_space_discarded].next_entry)
ret = cur_entry_journal_full;
else if (!fifo_free(&j->pin))
ret = cur_entry_journal_pin_full;
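+	/*
+	 * Noflush writes are only allowed while most clean journal space is
+	 * also clean on disk: at most 1/8 of the journal clean but unflushed,
+	 * and over half of clean space already flushed:
+	 */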
+ if ((clean - clean_ondisk <= total / 8) &&
+	    (clean_ondisk * 2 > clean))
+ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ else
+ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
overhead = DIV_ROUND_UP(clean, max_entry_size) *
journal_entry_overhead(j);
u64s_remaining = clean << 6;
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index d0f1bbf8f6a7..e1b63f3879f4 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -118,7 +118,7 @@ out_write_sb:
out:
mutex_unlock(&c->sb_lock);
- return ret;
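+	/* the on disk blacklist changed - rebuild the in-memory table: */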
+ return ret ?: bch2_blacklist_table_initialize(c);
}
static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
struct journal_seq_blacklist_table *t;
unsigned i, nr = blacklist_nr_entries(bl);
- BUG_ON(c->journal_seq_blacklist_table);
-
if (!bl)
return 0;
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
journal_seq_blacklist_table_cmp,
NULL);
+ kfree(c->journal_seq_blacklist_table);
c->journal_seq_blacklist_table = t;
return 0;
}
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 6b525dc6ab7c..cf9675310f2b 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -29,6 +29,8 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
+ bool noflush; /* write has already been kicked off, and was noflush */
+ bool must_flush; /* something wants a flush */
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
@@ -146,6 +148,7 @@ enum {
JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_MAY_GET_UNRESERVED,
+ JOURNAL_MAY_SKIP_FLUSH,
};
/* Embedded in struct bch_fs */
@@ -203,6 +206,7 @@ struct journal {
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
+ u64 flushed_seq_ondisk;
u64 last_seq_ondisk;
u64 err_seq;
u64 last_empty_seq;
@@ -252,11 +256,15 @@ struct journal {
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
+ unsigned long last_flush_write;
u64 res_get_blocked_start;
u64 need_write_time;
u64 write_start_time;
+ u64 nr_flush_writes;
+ u64 nr_noflush_writes;
+
struct bch2_time_stats *write_time;
struct bch2_time_stats *delay_time;
struct bch2_time_stats *blocked_time;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 7ad5b8234747..ecd51d45743a 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
{
- struct journal_replay *p;
+ struct journal_replay *i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys keys = { NULL };
@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (list_empty(journal_entries))
return keys;
- keys.journal_seq_base =
- le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay, list)->j.last_seq);
-
- list_for_each_entry(p, journal_entries, list) {
- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+ list_for_each_entry(i, journal_entries, list) {
+ if (i->ignore)
continue;
- for_each_jset_key(k, _n, entry, &p->j)
+ if (!keys.journal_seq_base)
+ keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+ for_each_jset_key(k, _n, entry, &i->j)
nr_keys++;
}
-
keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
if (!keys.d)
goto err;
- list_for_each_entry(p, journal_entries, list) {
- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+ list_for_each_entry(i, journal_entries, list) {
+ if (i->ignore)
continue;
- for_each_jset_key(k, _n, entry, &p->j)
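+		/* journal_seq is stored as a 32 bit offset from journal_seq_base: */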
+ BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+ for_each_jset_key(k, _n, entry, &i->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
- .journal_seq = le64_to_cpu(p->j.seq) -
+ .journal_seq = le64_to_cpu(i->j.seq) -
keys.journal_seq_base,
- .journal_offset = k->_data - p->j._data,
+ .journal_offset = k->_data - i->j._data,
};
}
@@ -643,46 +643,6 @@ err:
return ret;
}
-static bool journal_empty(struct list_head *journal)
-{
- return list_empty(journal) ||
- journal_entry_empty(&list_last_entry(journal,
- struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
- struct list_head *journal)
-{
- struct journal_replay *i =
- list_last_entry(journal, struct journal_replay, list);
- u64 start_seq = le64_to_cpu(i->j.last_seq);
- u64 end_seq = le64_to_cpu(i->j.seq);
- u64 seq = start_seq;
- int ret = 0;
-
- list_for_each_entry(i, journal, list) {
- if (le64_to_cpu(i->j.seq) < start_seq)
- continue;
-
- fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- seq, le64_to_cpu(i->j.seq) - 1,
- start_seq, end_seq);
-
- seq = le64_to_cpu(i->j.seq);
-
- fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
- "found blacklisted journal entry %llu", seq);
-
- do {
- seq++;
- } while (bch2_journal_seq_is_blacklisted(c, seq, false));
- }
-fsck_err:
- return ret;
-}
-
/* journal replay early: */
static int journal_replay_entry_early(struct bch_fs *c,
@@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct list_head *journal)
{
+ struct journal_replay *i;
struct jset_entry *entry;
int ret;
@@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c,
return ret;
}
} else {
- struct journal_replay *i =
- list_last_entry(journal, struct journal_replay, list);
+ list_for_each_entry(i, journal, list) {
+ if (i->ignore)
+ continue;
- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
- list_for_each_entry(i, journal, list)
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
+ }
}
bch2_fs_usage_initialize(c);
@@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c,
struct bch_sb_field_clean *clean = *cleanp;
int ret = 0;
- if (!c->sb.clean || !j)
- return 0;
-
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
@@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
- u64 journal_seq;
+ struct jset *last_journal_entry = NULL;
+ u64 blacklist_seq, journal_seq;
bool write_sb = false, need_write_alloc = false;
int ret;
@@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
+ ret = bch2_blacklist_table_initialize(c);
+ if (ret) {
+ bch_err(c, "error initializing blacklist table");
+ goto err;
+ }
+
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
- struct jset *j;
+ struct journal_replay *i;
- ret = bch2_journal_read(c, &c->journal_entries);
+ ret = bch2_journal_read(c, &c->journal_entries,
+ &blacklist_seq, &journal_seq);
if (ret)
goto err;
- if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+ list_for_each_entry_reverse(i, &c->journal_entries, list)
+ if (!i->ignore) {
+ last_journal_entry = &i->j;
+ break;
+ }
+
+ if (mustfix_fsck_err_on(c->sb.clean &&
+ last_journal_entry &&
+ !journal_entry_empty(last_journal_entry), c,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
- if (!c->sb.clean && list_empty(&c->journal_entries)) {
- bch_err(c, "no journal entries found");
- ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
- goto err;
+ if (!last_journal_entry) {
+ fsck_err_on(!c->sb.clean, c, "no journal entries found");
+ goto use_clean;
}
c->journal_keys = journal_keys_sort(&c->journal_entries);
@@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- j = &list_last_entry(&c->journal_entries,
- struct journal_replay, list)->j;
-
- ret = verify_superblock_clean(c, &clean, j);
- if (ret)
+ if (c->sb.clean && last_journal_entry) {
+ ret = verify_superblock_clean(c, &clean,
+ last_journal_entry);
+ if (ret)
+ goto err;
+ }
+ } else {
+use_clean:
+ if (!clean) {
+ bch_err(c, "no superblock clean section found");
+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
- journal_seq = le64_to_cpu(j->seq) + 1;
- } else {
- journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+ }
+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
if (!c->sb.clean &&
@@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
if (ret)
goto err;
- if (!c->sb.clean) {
+	/*
+	 * After an unclean shutdown, skip and blacklist the next few journal
+	 * sequence numbers: they may have been referenced by btree writes
+	 * that happened before their corresponding journal writes, and those
+	 * btree writes need to be ignored on replay:
+	 */
+ if (!c->sb.clean)
+ journal_seq += 8;
+
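+	/*
+	 * The blacklist covers both the unflushed entries dropped by journal
+	 * read and the sequence numbers skipped above:
+	 */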
+ if (blacklist_seq != journal_seq) {
ret = bch2_journal_seq_blacklist_add(c,
- journal_seq,
- journal_seq + 8);
+ blacklist_seq, journal_seq);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
goto err;
}
-
- journal_seq += 8;
-
- /*
- * The superblock needs to be written before we do any btree
- * node writes: it will be in the read_write() path
- */
- }
-
- ret = bch2_blacklist_table_initialize(c);
-
- if (!list_empty(&c->journal_entries)) {
- ret = verify_journal_entries_not_blacklisted_or_missing(c,
- &c->journal_entries);
- if (ret)
- goto err;
}
ret = bch2_fs_journal_start(&c->journal, journal_seq,