author     Kent Overstreet <kent.overstreet@gmail.com>   2018-07-17 13:50:15 -0400
committer  Kent Overstreet <kent.overstreet@linux.dev>   2023-10-22 17:08:12 -0400
commit     3636ed489ac05e61d59be29b8e69111ef781d528 (patch)
tree       902e9108089d77777bf1dd002893f84ce24cd16c /fs
parent     eb8632657f79ee29941f4013b81cdd4aaeeca1a8 (diff)
bcachefs: Deferred btree updates
Will be used in the future for inode updates, which will be very helpful
for multithreaded workloads that have to update the inode with every
extent update (appends, or updates that change i_sectors).

Also will be used eventually for fully persistent alloc info.

However, we still need a mechanism for reserving space in the journal
prior to getting a journal reservation, so it's not technically safe to
make use of this just yet: we could deadlock with the journal full
(although that's not likely to be an issue in practice).

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
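For orientation, here is a minimal sketch of how a caller might eventually use
the new interface: allocate a deferred_update for a btree, journal keys through
it with BTREE_INSERT_DEFERRED, and flush/free it when done. The caller, its key
sizing, the use of BTREE_ID_INODES, and the bch2_btree_insert_at() wrapper
signature are assumptions for illustration only; as noted above, this patch
adds the mechanism without wiring up an in-tree user yet.

    #include "bcachefs.h"
    #include "btree_update.h"

    /* Hypothetical caller sketch -- not part of this patch. */
    static int update_inode_deferred(struct bch_fs *c, struct bkey_i *inode_k)
    {
            struct deferred_update *d;
            int ret;

            /* size the deferred update to hold this one key (assumed sizing) */
            d = bch2_deferred_update_alloc(c, BTREE_ID_INODES, inode_k->k.u64s);

            /*
             * Journal the key and (re)pin the deferred update; the btree
             * itself is only written when the journal pin is flushed.
             */
            ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_NOFAIL,
                                       BTREE_INSERT_DEFERRED(d, inode_k));

            /* flushes any still-pending key to the btree, then frees @d */
            bch2_deferred_update_free(c, d);
            return ret;
    }

The point of the design is visible in the flush path below: the key lives only
in the journal (pinned by deferred_update.journal) until deferred_update_flush()
writes it to the btree, so repeated updates to the same key avoid repeated
btree node writes.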
Diffstat (limited to 'fs')
-rw-r--r--  fs/bcachefs/btree_types.h        23
-rw-r--r--  fs/bcachefs/btree_update.h       12
-rw-r--r--  fs/bcachefs/btree_update_leaf.c  258
-rw-r--r--  fs/bcachefs/journal_reclaim.c    19
-rw-r--r--  fs/bcachefs/journal_reclaim.h     2
5 files changed, 259 insertions, 55 deletions
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index f34f340ff034..ce5127301cb2 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -245,9 +245,28 @@ struct btree_iter {
#define BTREE_ITER_MAX 8
+struct deferred_update {
+ struct journal_entry_pin journal;
+
+ spinlock_t lock;
+ unsigned gen;
+
+ u8 allocated_u64s;
+ enum btree_id btree_id;
+
+ /* must be last: */
+ struct bkey_i k;
+};
+
struct btree_insert_entry {
- struct btree_iter *iter;
- struct bkey_i *k;
+ struct bkey_i *k;
+
+ union {
+ struct btree_iter *iter;
+ struct deferred_update *d;
+ };
+
+ bool deferred;
};
struct btree_trans {
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index d1647f6eb476..824fb0d1b7f0 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -16,6 +16,11 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
struct bkey_i *);
+void bch2_deferred_update_free(struct bch_fs *,
+ struct deferred_update *);
+struct deferred_update *
+bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned);
+
/* Normal update interface: */
struct btree_insert {
@@ -38,6 +43,13 @@ int __bch2_btree_insert_at(struct btree_insert *);
.k = (_k), \
})
+#define BTREE_INSERT_DEFERRED(_d, _k) \
+ ((struct btree_insert_entry) { \
+ .k = (_k), \
+ .d = (_d), \
+ .deferred = true, \
+ })
+
/**
* bch_btree_insert_at - insert one or more keys at iterator positions
* @iter: btree iterator
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index fd27334cf2a4..12fd7fba3e9a 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -8,6 +8,7 @@
#include "btree_locking.h"
#include "buckets.h"
#include "debug.h"
+#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
@@ -126,6 +127,27 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
return __btree_node_flush(j, pin, 1, seq);
}
+static inline void __btree_journal_key(struct btree_insert *trans,
+ enum btree_id btree_id,
+ struct bkey_i *insert)
+{
+ struct journal *j = &trans->c->journal;
+ u64 seq = trans->journal_res.seq;
+ bool needs_whiteout = insert->k.needs_whiteout;
+
+ /* ick */
+ insert->k.needs_whiteout = false;
+ bch2_journal_add_keys(j, &trans->journal_res,
+ btree_id, insert);
+ insert->k.needs_whiteout = needs_whiteout;
+
+ bch2_journal_set_has_inode(j, &trans->journal_res,
+ insert->k.p.inode);
+
+ if (trans->journal_seq)
+ *trans->journal_seq = seq;
+}
+
void bch2_btree_journal_key(struct btree_insert *trans,
struct btree_iter *iter,
struct bkey_i *insert)
@@ -140,21 +162,9 @@ void bch2_btree_journal_key(struct btree_insert *trans,
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- u64 seq = trans->journal_res.seq;
- bool needs_whiteout = insert->k.needs_whiteout;
-
- /* ick */
- insert->k.needs_whiteout = false;
- bch2_journal_add_keys(j, &trans->journal_res,
- iter->btree_id, insert);
- insert->k.needs_whiteout = needs_whiteout;
-
- bch2_journal_set_has_inode(j, &trans->journal_res,
- insert->k.p.inode);
-
- if (trans->journal_seq)
- *trans->journal_seq = seq;
- btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
+ __btree_journal_key(trans, iter->btree_id, insert);
+ btree_bset_last(b)->journal_seq =
+ cpu_to_le64(trans->journal_res.seq);
}
if (unlikely(!journal_pin_active(&w->journal))) {
@@ -227,8 +237,109 @@ btree_insert_key_leaf(struct btree_insert *trans,
return ret;
}
-#define trans_for_each_entry(trans, i) \
- for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+/* Deferred btree updates: */
+
+static void deferred_update_flush(struct journal *j,
+ struct journal_entry_pin *pin,
+ u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct deferred_update *d =
+ container_of(pin, struct deferred_update, journal);
+ u64 tmp[32];
+ struct bkey_i *k = (void *) tmp;
+ unsigned gen;
+ int ret;
+
+ if (d->allocated_u64s > ARRAY_SIZE(tmp)) {
+ k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS);
+
+ BUG_ON(!k); /* XXX */
+ }
+
+ spin_lock(&d->lock);
+ gen = d->gen;
+
+ if (journal_pin_active(&d->journal)) {
+ BUG_ON(d->k.k.u64s > d->allocated_u64s);
+ bkey_copy(k, &d->k);
+
+ spin_unlock(&d->lock);
+
+ ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+ bch2_fs_fatal_err_on(ret && !bch2_journal_error(j),
+ c, "error flushing deferred btree update: %i", ret);
+
+ spin_lock(&d->lock);
+ }
+
+ if (gen == d->gen)
+ bch2_journal_pin_drop(j, &d->journal);
+ spin_unlock(&d->lock);
+
+ if (k != (void *) tmp)
+ kfree(k);
+}
+
+static enum btree_insert_ret
+btree_insert_key_deferred(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct deferred_update *d = insert->d;
+
+ BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY);
+ BUG_ON(insert->k->u64s > d->allocated_u64s);
+
+ __btree_journal_key(trans, d->btree_id, insert->k);
+
+ spin_lock(&d->lock);
+ d->gen++;
+ bkey_copy(&d->k, insert->k);
+ spin_unlock(&d->lock);
+
+ bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal,
+ deferred_update_flush);
+
+ return BTREE_INSERT_OK;
+}
+
+void bch2_deferred_update_free(struct bch_fs *c,
+ struct deferred_update *d)
+{
+ deferred_update_flush(&c->journal, &d->journal, 0);
+
+ BUG_ON(journal_pin_active(&d->journal));
+
+ bch2_journal_pin_flush(&c->journal, &d->journal);
+ kfree(d);
+}
+
+struct deferred_update *
+bch2_deferred_update_alloc(struct bch_fs *c,
+ enum btree_id btree_id,
+ unsigned u64s)
+{
+ struct deferred_update *d;
+
+ BUG_ON(u64s > U8_MAX);
+
+ d = kmalloc(offsetof(struct deferred_update, k) +
+ u64s * sizeof(u64), GFP_NOFS);
+ BUG_ON(!d);
+
+ memset(d, 0, offsetof(struct deferred_update, k));
+
+ spin_lock_init(&d->lock);
+ d->allocated_u64s = u64s;
+ d->btree_id = btree_id;
+
+ return d;
+}
+
+/* struct btree_insert operations: */
/*
* We sort transaction entries so that if multiple iterators point to the same
@@ -238,25 +349,32 @@ static bool same_leaf_as_prev(struct btree_insert *trans,
struct btree_insert_entry *i)
{
return i != trans->entries &&
+ !i->deferred &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
-static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
- struct btree_insert_entry *i)
-{
- struct btree *b = i->iter->l[0].b;
+#define __trans_next_entry(_trans, _i, _filter) \
+({ \
+ while ((_i) < (_trans)->entries + (_trans->nr) && !(_filter)) \
+ (_i)++; \
+ \
+ (_i) < (_trans)->entries + (_trans->nr); \
+})
- do {
- i++;
- } while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
+#define __trans_for_each_entry(_trans, _i, _filter) \
+ for ((_i) = (_trans)->entries; \
+ __trans_next_entry(_trans, _i, _filter); \
+ (_i)++)
- return i;
-}
+#define trans_for_each_entry(trans, i) \
+ __trans_for_each_entry(trans, i, true)
+
+#define trans_for_each_iter(trans, i) \
+ __trans_for_each_entry(trans, i, !(i)->deferred)
#define trans_for_each_leaf(trans, i) \
- for ((i) = (trans)->entries; \
- (i) < (trans)->entries + (trans)->nr; \
- (i) = trans_next_leaf(trans, i))
+ __trans_for_each_entry(trans, i, !(i)->deferred && \
+ !same_leaf_as_prev(trans, i))
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
@@ -294,7 +412,8 @@ static void multi_unlock_write(struct btree_insert *trans)
static inline int btree_trans_cmp(struct btree_insert_entry l,
struct btree_insert_entry r)
{
- return btree_iter_cmp(l.iter, r.iter);
+ return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?:
+ btree_iter_cmp(l.iter, r.iter);
}
/* Normal update interface: */
@@ -328,6 +447,15 @@ btree_key_can_insert(struct btree_insert *trans,
return BTREE_INSERT_OK;
}
+static inline enum btree_insert_ret
+do_btree_insert_one(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
+{
+ return likely(!insert->deferred)
+ ? btree_insert_key_leaf(trans, insert)
+ : btree_insert_key_deferred(trans, insert);
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@@ -340,9 +468,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
unsigned u64s;
int ret;
- trans_for_each_entry(trans, i)
+ trans_for_each_iter(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
+ /* reserve space for deferred updates */
+ __trans_for_each_entry(trans, i, i->deferred) {
+
+ }
+
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
@@ -353,9 +486,13 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
while ((ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
- struct btree_iter *iter = trans->entries[0].iter;
+ struct btree_iter *iter = NULL;
+
+ trans_for_each_iter(trans, i)
+ iter = i->iter;
- bch2_btree_iter_unlock(iter);
+ if (iter)
+ bch2_btree_iter_unlock(iter);
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
@@ -363,7 +500,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
if (ret)
return ret;
- if (!bch2_btree_iter_relock(iter)) {
+ if (iter && !bch2_btree_iter_relock(iter)) {
trans_restart(" (iter relock after journal res get blocked)");
return -EINTR;
}
@@ -387,7 +524,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
* amount of space available:
*/
u64s = 0;
- trans_for_each_entry(trans, i) {
+ trans_for_each_iter(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
@@ -415,14 +552,17 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
* have been traversed/locked, depending on what the caller was
* doing:
*/
- for_each_btree_iter(trans->entries[0].iter, linked)
- if (linked->uptodate < BTREE_ITER_NEED_RELOCK)
- linked->flags |= BTREE_ITER_NOUNLOCK;
+ trans_for_each_iter(trans, i) {
+ for_each_btree_iter(i->iter, linked)
+ if (linked->uptodate < BTREE_ITER_NEED_RELOCK)
+ linked->flags |= BTREE_ITER_NOUNLOCK;
+ break;
+ }
}
trans->did_work = true;
trans_for_each_entry(trans, i) {
- switch (btree_insert_key_leaf(trans, i)) {
+ switch (do_btree_insert_one(trans, i)) {
case BTREE_INSERT_OK:
break;
case BTREE_INSERT_NEED_TRAVERSE:
@@ -444,12 +584,20 @@ out:
static inline void btree_insert_entry_checks(struct bch_fs *c,
struct btree_insert_entry *i)
{
- BUG_ON(i->iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+ enum btree_id btree_id = !i->deferred
+ ? i->iter->btree_id
+ : i->d->btree_id;
+
+ if (!i->deferred) {
+ BUG_ON(i->iter->level);
+ BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+
+ bch2_btree_iter_verify_locks(i->iter);
+ }
+
BUG_ON(debug_check_bkeys(c) &&
!bkey_deleted(&i->k->k) &&
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
- i->iter->btree_id));
+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id));
}
/**
@@ -473,20 +621,18 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
BUG_ON(!trans->nr);
- bch2_btree_iter_verify_locks(trans->entries[0].iter);
-
/* for the sake of sanity: */
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
+ bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
+
trans_for_each_entry(trans, i)
btree_insert_entry_checks(c, i);
- bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
-
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
retry:
- trans_for_each_entry(trans, i) {
+ trans_for_each_iter(trans, i) {
unsigned old_locks_want = i->iter->locks_want;
unsigned old_uptodate = i->iter->uptodate;
@@ -510,16 +656,22 @@ retry:
trans_for_each_leaf(trans, i)
bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
- trans_for_each_entry(trans, i)
+ trans_for_each_iter(trans, i)
bch2_btree_iter_downgrade(i->iter);
out:
percpu_ref_put(&c->writes);
/* make sure we didn't drop or screw up locks: */
- bch2_btree_iter_verify_locks(trans->entries[0].iter);
+ trans_for_each_iter(trans, i) {
+ bch2_btree_iter_verify_locks(i->iter);
+ break;
+ }
- for_each_btree_iter(trans->entries[0].iter, linked)
- linked->flags &= ~BTREE_ITER_NOUNLOCK;
+ trans_for_each_iter(trans, i) {
+ for_each_btree_iter(i->iter, linked)
+ linked->flags &= ~BTREE_ITER_NOUNLOCK;
+ break;
+ }
BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
@@ -598,7 +750,7 @@ err:
goto out;
}
- trans_for_each_entry(trans, i) {
+ trans_for_each_iter(trans, i) {
int ret2 = bch2_btree_iter_traverse(i->iter);
if (ret2) {
ret = ret2;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 6ada63f1bb25..770a6e0c7d97 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -75,6 +75,25 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
+void bch2_journal_pin_update(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock(&j->lock);
+
+ if (pin->seq != seq) {
+ __journal_pin_drop(j, pin);
+ __journal_pin_add(j, seq, pin, flush_fn);
+ } else {
+ struct journal_entry_pin_list *pin_list =
+ journal_seq_pin(j, seq);
+
+ list_move(&pin->list, &pin_list->list);
+ }
+
+ spin_unlock(&j->lock);
+}
+
void bch2_journal_pin_add_if_older(struct journal *j,
struct journal_entry_pin *src_pin,
struct journal_entry_pin *pin,
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index f5af4252c88a..e06ac0492960 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -19,6 +19,8 @@ journal_seq_pin(struct journal *j, u64 seq)
void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
+void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,