-rw-r--r-- | fs/bcachefs/alloc_background.c     | 560
-rw-r--r-- | fs/bcachefs/alloc_background.h     |  33
-rw-r--r-- | fs/bcachefs/alloc_foreground.c     | 561
-rw-r--r-- | fs/bcachefs/alloc_foreground.h     |  11
-rw-r--r-- | fs/bcachefs/alloc_types.h          |  23
-rw-r--r-- | fs/bcachefs/bcachefs.h             |  23
-rw-r--r-- | fs/bcachefs/btree_gc.c             |  10
-rw-r--r-- | fs/bcachefs/btree_update_interior.c |  17
-rw-r--r-- | fs/bcachefs/buckets.c              |  72
-rw-r--r-- | fs/bcachefs/buckets.h              |  62
-rw-r--r-- | fs/bcachefs/buckets_types.h        |   2
-rw-r--r-- | fs/bcachefs/ec.c                   |  13
-rw-r--r-- | fs/bcachefs/journal.c              |   2
-rw-r--r-- | fs/bcachefs/journal_io.c           |   4
-rw-r--r-- | fs/bcachefs/movinggc.c             |  23
-rw-r--r-- | fs/bcachefs/recovery.c             |   2
-rw-r--r-- | fs/bcachefs/super.c                |  82
-rw-r--r-- | fs/bcachefs/sysfs.c                |  47
-rw-r--r-- | fs/bcachefs/trace.h                |  67
19 files changed, 615 insertions, 999 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 5d553d9b6151..3ba2b35fad53 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -27,13 +27,6 @@
 #include <linux/sched/task.h>
 #include <linux/sort.h>
 
-const char * const bch2_allocator_states[] = {
-#define x(n)	#n,
-	ALLOC_THREAD_STATES()
-#undef x
-	NULL
-};
-
 /* Persistent alloc info: */
 
 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -431,7 +424,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
 			g->_mark.gen		= a.gen;
 			g->io_time[READ]	= a.io_time[READ];
 			g->io_time[WRITE]	= a.io_time[WRITE];
-			g->oldest_gen		= !gc ? a.oldest_gen : a.gen;
 			g->gen_valid		= 1;
 
 			if (!gc ||
@@ -553,7 +545,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
 		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
 		new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
 		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
-		SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
 	}
 
 	if (old_a.data_type && !new_a->data_type &&
@@ -698,493 +689,6 @@ out:
 	return ret;
 }
 
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-static inline u8 bucket_gc_gen(struct bucket *g)
-{
-	return g->mark.gen - g->oldest_gen;
-}
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
-				       struct bucket_mark m)
-{
-	u8 gc_gen;
-
-	if (!is_available_bucket(m))
-		return false;
-
-	if (m.owned_by_allocator)
-		return false;
-
-	if (ca->buckets_nouse &&
-	    test_bit(b, ca->buckets_nouse))
-		return false;
-
-	if (ca->new_fs_bucket_idx) {
-		/*
-		 * Device or filesystem is still being initialized, and we
-		 * haven't fully marked superblocks & journal:
-		 */
-		if (is_superblock_bucket(ca, b))
-			return false;
-
-		if (b < ca->new_fs_bucket_idx)
-			return false;
-	}
-
-	gc_gen = bucket_gc_gen(bucket(ca, b));
-
-	ca->inc_gen_needs_gc		+= gc_gen >= BUCKET_GC_GEN_MAX / 2;
-	ca->inc_gen_really_needs_gc	+= gc_gen >= BUCKET_GC_GEN_MAX;
-
-	return gc_gen < BUCKET_GC_GEN_MAX;
-}
-
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
-
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
-				u64 now, u64 last_seq_ondisk)
-{
-	unsigned used = m.cached_sectors;
-
-	if (used) {
-		/*
-		 * Prefer to keep buckets that have been read more recently, and
-		 * buckets that have more data in them:
-		 */
-		u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
-		u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
-
-		return -last_read_scaled;
-	} else {
-		/*
-		 * Prefer to use buckets with smaller gc_gen so that we don't
-		 * have to walk the btree and recalculate oldest_gen - but shift
-		 * off the low bits so that buckets will still have equal sort
-		 * keys when there's only a small difference, so that we can
-		 * keep sequential buckets together:
-		 */
-		return bucket_gc_gen(g) >> 4;
-	}
-}
-
-static inline int bucket_alloc_cmp(alloc_heap *h,
-				   struct alloc_heap_entry l,
-				   struct alloc_heap_entry r)
-{
-	return cmp_int(l.key, r.key) ?:
-		cmp_int(r.nr, l.nr) ?:
-		cmp_int(l.bucket, r.bucket);
-}
-
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
-	const struct alloc_heap_entry *l = _l, *r = _r;
-
-	return cmp_int(l->bucket, r->bucket);
-}
-
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
-	struct bucket_array *buckets;
-	struct alloc_heap_entry e = { 0 };
-	u64 now, last_seq_ondisk;
-	size_t b, i, nr = 0;
-
-	down_read(&ca->bucket_lock);
-
-	buckets = bucket_array(ca);
-	ca->alloc_heap.used = 0;
-	now = atomic64_read(&c->io_clock[READ].now);
-	last_seq_ondisk = c->journal.flushed_seq_ondisk;
-
-	/*
-	 * Find buckets with lowest read priority, by building a maxheap sorted
-	 * by read priority and repeatedly replacing the maximum element until
-	 * all buckets have been visited.
-	 */
-	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-		struct bucket *g = &buckets->b[b];
-		struct bucket_mark m = READ_ONCE(g->mark);
-		unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
-
-		cond_resched();
-
-		if (!bch2_can_invalidate_bucket(ca, b, m))
-			continue;
-
-		if (!m.data_type &&
-		    bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-						     last_seq_ondisk,
-						     ca->dev_idx, b)) {
-			ca->buckets_waiting_on_journal++;
-			continue;
-		}
-
-		if (e.nr && e.bucket + e.nr == b && e.key == key) {
-			e.nr++;
-		} else {
-			if (e.nr)
-				heap_add_or_replace(&ca->alloc_heap, e,
-					-bucket_alloc_cmp, NULL);
-
-			e = (struct alloc_heap_entry) {
-				.bucket = b,
-				.nr	= 1,
-				.key	= key,
-			};
-		}
-	}
-
-	if (e.nr)
-		heap_add_or_replace(&ca->alloc_heap, e,
-				-bucket_alloc_cmp, NULL);
-
-	for (i = 0; i < ca->alloc_heap.used; i++)
-		nr += ca->alloc_heap.data[i].nr;
-
-	while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
-		nr -= ca->alloc_heap.data[0].nr;
-		heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
-	}
-
-	up_read(&ca->bucket_lock);
-}
-
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
-	size_t i, nr = 0;
-
-	ca->inc_gen_needs_gc		= 0;
-	ca->inc_gen_really_needs_gc	= 0;
-	ca->buckets_waiting_on_journal	= 0;
-
-	find_reclaimable_buckets_lru(c, ca);
-
-	heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
-
-	for (i = 0; i < ca->alloc_heap.used; i++)
-		nr += ca->alloc_heap.data[i].nr;
-
-	return nr;
-}
-
-static int bucket_invalidate_btree(struct btree_trans *trans,
-				   struct bch_dev *ca, u64 b,
-				   struct bkey_i_alloc_v4 *a)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-			     POS(ca->dev_idx, b),
-			     BTREE_ITER_CACHED|
-			     BTREE_ITER_INTENT);
-
-	k = bch2_btree_iter_peek_slot(&iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	bkey_alloc_v4_init(&a->k_i);
-	a->k.p			= iter.pos;
-	bch2_alloc_to_v4(k, &a->v);
-	a->v.gen++;
-	a->v.data_type		= 0;
-	a->v.dirty_sectors	= 0;
-	a->v.cached_sectors	= 0;
-	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
-	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);
-
-	ret = bch2_trans_update(trans, &iter, &a->k_i,
-				BTREE_TRIGGER_BUCKET_INVALIDATE|
-				BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
-				      u64 *journal_seq, unsigned flags)
-{
-	struct bkey_i_alloc_v4 a;
-	size_t b;
-	u64 commit_seq = 0;
-	int ret = 0;
-
-	/*
-	 * If the read-only path is trying to shut down, we can't be generating
-	 * new btree updates:
-	 */
-	if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
-		return 1;
-
-	BUG_ON(!ca->alloc_heap.used ||
-	       !ca->alloc_heap.data[0].nr);
-	b = ca->alloc_heap.data[0].bucket;
-
-	/* first, put on free_inc and mark as owned by allocator: */
-	percpu_down_read(&c->mark_lock);
-
-	bch2_mark_alloc_bucket(c, ca, b, true);
-
-	spin_lock(&c->freelist_lock);
-	verify_not_on_freelist(c, ca, b);
-	BUG_ON(!fifo_push(&ca->free_inc, b));
-	spin_unlock(&c->freelist_lock);
-
-	percpu_up_read(&c->mark_lock);
-
-	ret = bch2_trans_do(c, NULL, &commit_seq,
-			    BTREE_INSERT_NOCHECK_RW|
-			    BTREE_INSERT_NOFAIL|
-			    flags,
-			    bucket_invalidate_btree(&trans, ca, b, &a));
-
-	if (!ret) {
-		/* remove from alloc_heap: */
-		struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
-		top->bucket++;
-		top->nr--;
-
-		if (!top->nr)
-			heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-
-		/*
-		 * If we invalidating cached data then we need to wait on the
-		 * journal commit:
-		 */
-		if (a.v.data_type)
-			*journal_seq = max(*journal_seq, commit_seq);
-
-		/*
-		 * We already waiting on u.alloc_seq when we filtered out
-		 * buckets that need journal commit:
-		 */
-		BUG_ON(*journal_seq > a.v.journal_seq);
-	} else {
-		size_t b2;
-
-		/* remove from free_inc: */
-		percpu_down_read(&c->mark_lock);
-		spin_lock(&c->freelist_lock);
-
-		bch2_mark_alloc_bucket(c, ca, b, false);
-
-		BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
-		BUG_ON(b != b2);
-
-		spin_unlock(&c->freelist_lock);
-		percpu_up_read(&c->mark_lock);
-	}
-
-	return ret < 0 ? ret : 0;
-}
-
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
-	u64 journal_seq = 0;
-	int ret = 0;
-
-	/* Only use nowait if we've already invalidated at least one bucket: */
-	while (!ret &&
-	       !fifo_full(&ca->free_inc) &&
-	       ca->alloc_heap.used) {
-		if (kthread_should_stop()) {
-			ret = 1;
-			break;
-		}
-
-		ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
-				(!fifo_empty(&ca->free_inc)
-				 ? BTREE_INSERT_NOWAIT : 0));
-		/*
-		 * We only want to batch up invalidates when they're going to
-		 * require flushing the journal:
-		 */
-		if (!journal_seq)
-			break;
-	}
-
-	/* If we used NOWAIT, don't return the error: */
-	if (!fifo_empty(&ca->free_inc))
-		ret = 0;
-	if (ret < 0)
-		bch_err(ca, "error invalidating buckets: %i", ret);
-	if (ret)
-		return ret;
-
-	if (journal_seq)
-		ret = bch2_journal_flush_seq(&c->journal, journal_seq);
-	if (ret) {
-		bch_err(ca, "journal error: %i", ret);
-		return ret;
-	}
-
-	return 0;
-}
-
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
-	if (ca->allocator_state != new_state) {
-		ca->allocator_state = new_state;
-		closure_wake_up(&ca->fs->freelist_wait);
-	}
-}
-
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
-	unsigned i;
-	int ret = 0;
-
-	spin_lock(&c->freelist_lock);
-	for (i = 0; i < RESERVE_NR; i++) {
-		/*
-		 * Don't strand buckets on the copygc freelist until
-		 * after recovery is finished:
-		 */
-		if (i == RESERVE_movinggc &&
-		    !test_bit(BCH_FS_STARTED, &c->flags))
-			continue;
-
-		if (fifo_push(&ca->free[i], b)) {
-			fifo_pop(&ca->free_inc, b);
-			ret = 1;
-			break;
-		}
-	}
-	spin_unlock(&c->freelist_lock);
-
-	ca->allocator_state = ret
-		? ALLOCATOR_running
-		: ALLOCATOR_blocked_full;
-	closure_wake_up(&c->freelist_wait);
-	return ret;
-}
-
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
-	if (!c->opts.nochanges &&
-	    ca->mi.discard &&
-	    bdev_max_discard_sectors(ca->disk_sb.bdev))
-		blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
-				     ca->mi.bucket_size, GFP_NOFS);
-}
-
-static bool allocator_thread_running(struct bch_dev *ca)
-{
-	unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
-		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
-		? ALLOCATOR_running
-		: ALLOCATOR_stopped;
-	alloc_thread_set_state(ca, state);
-	return state == ALLOCATOR_running;
-}
-
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
-	s64 available = dev_buckets_reclaimable(ca) -
-		(gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
-	bool ret = available > 0;
-
-	alloc_thread_set_state(ca, ret
-			       ? ALLOCATOR_running
-			       : ALLOCATOR_blocked);
-	return ret;
-}
-
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
-{
-	struct bch_dev *ca = arg;
-	struct bch_fs *c = ca->fs;
-	unsigned long gc_count = c->gc_count;
-	size_t nr;
-	int ret;
-
-	set_freezable();
-
-	while (1) {
-		ret = kthread_wait_freezable(allocator_thread_running(ca));
-		if (ret)
-			goto stop;
-
-		while (!ca->alloc_heap.used) {
-			cond_resched();
-
-			ret = kthread_wait_freezable(buckets_available(ca, gc_count));
-			if (ret)
-				goto stop;
-
-			gc_count = c->gc_count;
-			nr = find_reclaimable_buckets(c, ca);
-
-			if (!nr && ca->buckets_waiting_on_journal) {
-				ret = bch2_journal_flush(&c->journal);
-				if (ret)
-					goto stop;
-			} else if (nr < (ca->mi.nbuckets >> 6) &&
-				   ca->buckets_waiting_on_journal >= nr / 2) {
-				bch2_journal_flush_async(&c->journal, NULL);
-			}
-
-			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
-			     ca->inc_gen_really_needs_gc) &&
-			    c->gc_thread) {
-				atomic_inc(&c->kick_gc);
-				wake_up_process(c->gc_thread);
-			}
-
-			trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
-					 ca->inc_gen_really_needs_gc);
-		}
-
-		ret = bch2_invalidate_buckets(c, ca);
-		if (ret)
-			goto stop;
-
-		while (!fifo_empty(&ca->free_inc)) {
-			u64 b = fifo_peek(&ca->free_inc);
-
-			discard_one_bucket(c, ca, b);
-
-			ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
-			if (ret)
-				goto stop;
-		}
-	}
-stop:
-	alloc_thread_set_state(ca, ALLOCATOR_stopped);
-	return 0;
-}
-
 /* Startup/shutdown (ro/rw): */
 
 void bch2_recalc_capacity(struct bch_fs *c)
@@ -1193,7 +697,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
 	unsigned bucket_size_max = 0;
 	unsigned long ra_pages = 0;
-	unsigned i, j;
+	unsigned i;
 
 	lockdep_assert_held(&c->state_lock);
 
@@ -1224,8 +728,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
 		 * allocations for foreground writes must wait -
 		 * not -ENOSPC calculations.
 		 */
-		for (j = 0; j < RESERVE_none; j++)
-			dev_reserve += ca->free[j].size;
+
+		dev_reserve += ca->nr_btree_reserve * 2;
+		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
 
 		dev_reserve += 1;	/* btree write point */
 		dev_reserve += 1;	/* copygc write point */
@@ -1281,8 +786,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 {
 	unsigned i;
 
-	BUG_ON(ca->alloc_thread);
-
 	/* First, remove device from allocation groups: */
 
 	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1356,61 +859,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
 			set_bit(ca->dev_idx, c->rw_devs[i].d);
 }
 
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
-	if (ca->alloc_thread)
-		closure_wait_event(&c->freelist_wait,
-				   ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
-	struct task_struct *p;
-
-	p = rcu_dereference_protected(ca->alloc_thread, 1);
-	ca->alloc_thread = NULL;
-
-	/*
-	 * We need an rcu barrier between setting ca->alloc_thread = NULL and
-	 * the thread shutting down to avoid bch2_wake_allocator() racing:
-	 *
-	 * XXX: it would be better to have the rcu barrier be asynchronous
-	 * instead of blocking us here
-	 */
-	synchronize_rcu();
-
-	if (p) {
-		kthread_stop(p);
-		put_task_struct(p);
-	}
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
-	struct task_struct *p;
-
-	/*
-	 * allocator thread already started?
-	 */
-	if (ca->alloc_thread)
-		return 0;
-
-	p = kthread_create(bch2_allocator_thread, ca,
-			   "bch-alloc/%s", ca->name);
-	if (IS_ERR(p)) {
-		bch_err(ca->fs, "error creating allocator thread: %li",
-			PTR_ERR(p));
-		return PTR_ERR(p);
-	}
-
-	get_task_struct(p);
-	rcu_assign_pointer(ca->alloc_thread, p);
-	wake_up_process(p);
-	return 0;
-}
-
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
 	spin_lock_init(&c->freelist_lock);
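Editorial note: a quick worked example of the fixed reserve that replaces the per-freelist sizes in bch2_recalc_capacity() above. The numbers are hypothetical, not from the patch:

	/*
	 * e.g. nbuckets = 65536, nr_btree_reserve = 128:
	 *
	 *   dev_reserve  = 128 * 2         btree reserve
	 *                + 65536 >> 6      copygc reserve = 1024
	 *                + 1 + 1 + ...     one bucket per write point, as in the hunk
	 *               ~= 1282 buckets, later scaled by bucket_size_max
	 */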
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 8de109674f0f..74b23f9b1bd3 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,8 +8,6 @@
 #include "debug.h"
 #include "super.h"
 
-extern const char * const bch2_allocator_states[];
-
 /* How out of date a pointer gen is allowed to be: */
 #define BUCKET_GC_GEN_MAX	96U
 
@@ -117,42 +115,11 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
 			  struct bkey_i *, unsigned);
 int bch2_fs_freespace_init(struct bch_fs *);
 
-static inline void bch2_wake_allocator(struct bch_dev *ca)
-{
-	struct task_struct *p;
-
-	rcu_read_lock();
-	p = rcu_dereference(ca->alloc_thread);
-	if (p)
-		wake_up_process(p);
-	rcu_read_unlock();
-}
-
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
-					  size_t bucket)
-{
-	if (bch2_expensive_debug_checks) {
-		size_t iter;
-		long i;
-		unsigned j;
-
-		for (j = 0; j < RESERVE_NR; j++)
-			fifo_for_each_entry(i, &ca->free[j], iter)
-				BUG_ON(i == bucket);
-		fifo_for_each_entry(i, &ca->free_inc, iter)
-			BUG_ON(i == bucket);
-	}
-}
-
 void bch2_recalc_capacity(struct bch_fs *);
 
 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index c4b4689fdd0f..01abcf43341f 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -14,13 +14,18 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "ec.h"
+#include "error.h"
 #include "io.h"
+#include "journal.h"
 #include "trace.h"
 
 #include <linux/math64.h>
@@ -50,6 +55,17 @@ const char * const bch2_alloc_reserves[] = {
  * reference _after_ doing the index update that makes its allocation reachable.
  */
 
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL)
+		ca->alloc_cursor = 0;
+	rcu_read_unlock();
+}
+
 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
 {
 	open_bucket_idx_t idx = ob - c->open_buckets;
@@ -85,7 +101,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 	percpu_down_read(&c->mark_lock);
 	spin_lock(&ob->lock);
 
-	bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
 	ob->valid = false;
 	ob->data_type = 0;
 
@@ -185,39 +200,35 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
 	}
 }
 
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
-				      enum alloc_reserve reserve,
-				      bool may_alloc_partial,
-				      struct closure *cl)
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+					      u64 bucket,
+					      enum alloc_reserve reserve,
+					      struct bch_alloc_v4 *a,
+					      u64 *skipped_open,
+					      u64 *skipped_need_journal_commit,
+					      u64 *skipped_nouse,
+					      struct closure *cl)
 {
 	struct open_bucket *ob;
-	long b = 0;
 
-	spin_lock(&c->freelist_lock);
+	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+		(*skipped_nouse)++;
+		return NULL;
+	}
 
-	if (may_alloc_partial) {
-		int i;
-
-		for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
-			ob = c->open_buckets + ca->open_buckets_partial[i];
-
-			if (reserve <= ob->alloc_reserve) {
-				array_remove_item(ca->open_buckets_partial,
-						  ca->open_buckets_partial_nr,
-						  i);
-				ob->on_partial_list = false;
-				ob->alloc_reserve = reserve;
-				spin_unlock(&c->freelist_lock);
-				return ob;
-			}
-		}
+	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+		(*skipped_open)++;
+		return NULL;
+	}
+
+	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+		(*skipped_need_journal_commit)++;
+		return NULL;
 	}
 
+	spin_lock(&c->freelist_lock);
+
 	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
 		if (cl)
 			closure_wait(&c->open_buckets_wait, cl);
@@ -226,36 +237,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 			c->blocked_allocate_open_bucket = local_clock();
 		spin_unlock(&c->freelist_lock);
-		trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
 		return ERR_PTR(-OPEN_BUCKETS_EMPTY);
 	}
 
-	if (likely(fifo_pop(&ca->free[RESERVE_none], b)))
-		goto out;
-
-	switch (reserve) {
-	case RESERVE_btree_movinggc:
-	case RESERVE_movinggc:
-		if (fifo_pop(&ca->free[RESERVE_movinggc], b))
-			goto out;
-		break;
-	default:
-		break;
+	/* Recheck under lock: */
+	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+		spin_unlock(&c->freelist_lock);
+		(*skipped_open)++;
+		return NULL;
 	}
 
-	if (cl)
-		closure_wait(&c->freelist_wait, cl);
-
-	if (!c->blocked_allocate)
-		c->blocked_allocate = local_clock();
-
-	spin_unlock(&c->freelist_lock);
-
-	trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
-	return ERR_PTR(-FREELIST_EMPTY);
-out:
-	verify_not_on_freelist(c, ca, b);
-
 	ob = bch2_open_bucket_alloc(c);
 
 	spin_lock(&ob->lock);
@@ -264,8 +255,8 @@ out:
 	ob->sectors_free = ca->mi.bucket_size;
 	ob->alloc_reserve = reserve;
 	ob->dev		= ca->dev_idx;
-	ob->gen		= *bucket_gen(ca, b);
-	ob->bucket	= b;
+	ob->gen		= a->gen;
+	ob->bucket	= bucket;
 	spin_unlock(&ob->lock);
 
 	ca->nr_open_buckets++;
@@ -286,10 +277,326 @@ out:
 	}
 
 	spin_unlock(&c->freelist_lock);
 
+	return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+					    enum alloc_reserve reserve, u64 free_entry,
+					    u64 *skipped_open,
+					    u64 *skipped_need_journal_commit,
+					    u64 *skipped_nouse,
+					    struct bkey_s_c freespace_k,
+					    struct closure *cl)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c k;
+	struct open_bucket *ob;
+	struct bch_alloc_v4 a;
+	u64 b = free_entry & ~(~0ULL << 56);
+	unsigned genbits = free_entry >> 56;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+		pr_buf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+		       "  freespace key ",
+			ca->mi.first_bucket, ca->mi.nbuckets);
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+	}
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
+	if (ret) {
+		ob = ERR_PTR(ret);
+		goto err;
+	}
+
+	bch2_alloc_to_v4(k, &a);
+
+	if (genbits != (alloc_freespace_genbits(a) >> 56)) {
+		pr_buf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+		       "  freespace key ",
+		       genbits, alloc_freespace_genbits(a) >> 56);
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		pr_buf(&buf, "\n  ");
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+
+	}
+
+	if (a.data_type != BUCKET_free) {
+		pr_buf(&buf, "non free bucket in freespace btree\n"
+		       "  freespace key ");
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		pr_buf(&buf, "\n  ");
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+	}
+
+	ob = __try_alloc_bucket(c, ca, b, reserve, &a,
+				skipped_open,
+				skipped_need_journal_commit,
+				skipped_nouse,
+				cl);
+	if (!ob)
+		iter.path->preserve = false;
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+	return ob;
+}
+
+static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
+						    enum alloc_reserve reserve)
+{
+	struct open_bucket *ob;
+	int i;
+
+	spin_lock(&c->freelist_lock);
+
+	for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+		ob = c->open_buckets + ca->open_buckets_partial[i];
+
+		if (reserve <= ob->alloc_reserve) {
+			array_remove_item(ca->open_buckets_partial,
+					  ca->open_buckets_partial_nr,
+					  i);
+			ob->on_partial_list = false;
+			ob->alloc_reserve = reserve;
+			spin_unlock(&c->freelist_lock);
+			return ob;
+		}
+	}
+
+	spin_unlock(&c->freelist_lock);
+	return NULL;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+			struct bch_dev *ca,
+			enum alloc_reserve reserve,
+			u64 *buckets_seen,
+			u64 *skipped_open,
+			u64 *skipped_need_journal_commit,
+			u64 *skipped_nouse,
+			struct closure *cl)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct open_bucket *ob = NULL;
+	u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+	u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+	int ret;
+again:
+	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
+			   BTREE_ITER_SLOTS, k, ret) {
+		struct bch_alloc_v4 a;
+
+		if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+			break;
+
+		if (ca->new_fs_bucket_idx &&
+		    is_superblock_bucket(ca, k.k->p.offset))
+			continue;
+
+		bch2_alloc_to_v4(k, &a);
+
+		if (bucket_state(a) != BUCKET_free)
+			continue;
+
+		(*buckets_seen)++;
+
+		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
+					skipped_open,
+					skipped_need_journal_commit,
+					skipped_nouse,
+					cl);
+		if (ob)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	ca->alloc_cursor = alloc_cursor;
+
+	if (!ob && alloc_cursor > alloc_start) {
+		alloc_cursor = alloc_start;
+		goto again;
+	}
+
+	return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+						      struct bch_dev *ca,
+						      enum alloc_reserve reserve,
+						      u64 *buckets_seen,
+						      u64 *skipped_open,
+						      u64 *skipped_need_journal_commit,
+						      u64 *skipped_nouse,
+						      struct closure *cl)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct open_bucket *ob = NULL;
+	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+	u64 alloc_cursor = alloc_start;
+	int ret;
+
+	BUG_ON(ca->new_fs_bucket_idx);
+again:
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
+		if (k.k->p.inode != ca->dev_idx)
+			break;
+
+		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
+		     alloc_cursor < k.k->p.offset;
+		     alloc_cursor++) {
+			if (btree_trans_too_many_iters(trans)) {
+				ob = ERR_PTR(-EINTR);
+				break;
+			}
+
+			(*buckets_seen)++;
+
+			ob = try_alloc_bucket(trans, ca, reserve,
+					      alloc_cursor,
+					      skipped_open,
+					      skipped_need_journal_commit,
+					      skipped_nouse,
+					      k, cl);
+			if (ob) {
+				iter.path->preserve = false;
+				break;
+			}
+		}
+		if (ob)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	ca->alloc_cursor = alloc_cursor;
+
+	if (!ob && ret)
+		ob = ERR_PTR(ret);
+
+	if (!ob && alloc_start > ca->mi.first_bucket) {
+		alloc_cursor = alloc_start = ca->mi.first_bucket;
+		goto again;
+	}
+
+	return ob;
+}
+
+/**
+ * bch_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns index of bucket on success, 0 on failure
+ * */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+				      struct bch_dev *ca,
+				      enum alloc_reserve reserve,
+				      bool may_alloc_partial,
+				      struct closure *cl)
+{
+	struct bch_fs *c = trans->c;
+	struct open_bucket *ob = NULL;
+	u64 avail = dev_buckets_available(ca, reserve);
+	u64 buckets_seen = 0;
+	u64 skipped_open = 0;
+	u64 skipped_need_journal_commit = 0;
+	u64 skipped_nouse = 0;
+
+	if (may_alloc_partial) {
+		ob = try_alloc_partial_bucket(c, ca, reserve);
+		if (ob)
+			return ob;
+	}
+again:
+	if (!avail) {
+		if (cl) {
+			closure_wait(&c->freelist_wait, cl);
+			/* recheck after putting ourself on waitlist */
+			avail = dev_buckets_available(ca, reserve);
+			if (avail) {
+				closure_wake_up(&c->freelist_wait);
+				goto again;
+			}
+		}
+
+		if (!c->blocked_allocate)
+			c->blocked_allocate = local_clock();
+
+		ob = ERR_PTR(-FREELIST_EMPTY);
+		goto err;
+	}
+
+	ob = likely(ca->mi.freespace_initialized)
+		? bch2_bucket_alloc_freelist(trans, ca, reserve,
+					&buckets_seen,
+					&skipped_open,
+					&skipped_need_journal_commit,
+					&skipped_nouse,
+					cl)
+		: bch2_bucket_alloc_early(trans, ca, reserve,
+					&buckets_seen,
+					&skipped_open,
+					&skipped_need_journal_commit,
+					&skipped_nouse,
+					cl);
+
+	if (skipped_need_journal_commit * 2 > avail)
+		bch2_journal_flush_async(&c->journal, NULL);
+err:
+	if (!ob)
+		ob = ERR_PTR(-FREELIST_EMPTY);
+
+	if (!IS_ERR(ob)) {
+		trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], avail,
+				   buckets_seen,
+				   skipped_open,
+				   skipped_need_journal_commit,
+				   skipped_nouse,
+				   cl == NULL, PTR_ERR_OR_ZERO(ob));
+	} else {
+		trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail,
+					buckets_seen,
+					skipped_open,
+					skipped_need_journal_commit,
+					skipped_nouse,
+					cl == NULL, PTR_ERR_OR_ZERO(ob));
+		atomic_long_inc(&c->bucket_alloc_fail);
+	}
+
+	return ob;
+}
 
-	bch2_wake_allocator(ca);
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+				      enum alloc_reserve reserve,
+				      bool may_alloc_partial,
+				      struct closure *cl)
+{
+	struct open_bucket *ob;
 
-	trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]);
+	bch2_trans_do(c, NULL, NULL, 0,
+		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
+								   may_alloc_partial, cl)));
 	return ob;
 }
 
@@ -320,7 +627,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
 			       struct dev_stripe_state *stripe)
 {
 	u64 *v = stripe->next_alloc + ca->dev_idx;
-	u64 free_space = dev_buckets_available(ca);
+	u64 free_space = dev_buckets_available(ca, RESERVE_none);
 	u64 free_space_inv = free_space
 		? div64_u64(1ULL << 48, free_space)
 		: 1ULL << 48;
@@ -358,7 +665,7 @@ static void add_new_bucket(struct bch_fs *c,
 	ob_push(c, ptrs, ob);
 }
 
-int bch2_bucket_alloc_set(struct bch_fs *c,
+static int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 		      struct open_buckets *ptrs,
 		      struct dev_stripe_state *stripe,
 		      struct bch_devs_mask *devs_may_alloc,
@@ -369,10 +676,12 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
 		      unsigned flags,
 		      struct closure *cl)
 {
+	struct bch_fs *c = trans->c;
 	struct dev_alloc_list devs_sorted =
 		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+	unsigned dev;
 	struct bch_dev *ca;
-	int ret = -INSUFFICIENT_DEVICES;
+	int ret = 0;
 	unsigned i;
 
 	BUG_ON(*nr_effective >= nr_replicas);
@@ -380,35 +689,68 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
 	for (i = 0; i < devs_sorted.nr; i++) {
 		struct open_bucket *ob;
 
-		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+		dev = devs_sorted.devs[i];
+
+		rcu_read_lock();
+		ca = rcu_dereference(c->devs[dev]);
+		if (ca)
+			percpu_ref_get(&ca->ref);
+		rcu_read_unlock();
+
 		if (!ca)
 			continue;
 
-		if (!ca->mi.durability && *have_cache)
+		if (!ca->mi.durability && *have_cache) {
+			percpu_ref_put(&ca->ref);
 			continue;
+		}
 
-		ob = bch2_bucket_alloc(c, ca, reserve,
+		ob = bch2_bucket_alloc_trans(trans, ca, reserve,
 				flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
-		if (IS_ERR(ob)) {
-			ret = PTR_ERR(ob);
-
-			if (cl)
-				return ret;
+		if (!IS_ERR(ob))
+			bch2_dev_stripe_increment(ca, stripe);
+		percpu_ref_put(&ca->ref);
+
+		ret = PTR_ERR_OR_ZERO(ob);
+		if (ret) {
+			if (ret == -EINTR || cl)
+				break;
 			continue;
 		}
 
 		add_new_bucket(c, ptrs, devs_may_alloc,
 			       nr_effective, have_cache, flags, ob);
 
-		bch2_dev_stripe_increment(ca, stripe);
-
 		if (*nr_effective >= nr_replicas)
-			return 0;
+			break;
 	}
 
+	if (*nr_effective >= nr_replicas)
+		ret = 0;
+	else if (!ret)
+		ret = -INSUFFICIENT_DEVICES;
+
 	return ret;
 }
 
+int bch2_bucket_alloc_set(struct bch_fs *c,
+		      struct open_buckets *ptrs,
+		      struct dev_stripe_state *stripe,
+		      struct bch_devs_mask *devs_may_alloc,
+		      unsigned nr_replicas,
+		      unsigned *nr_effective,
+		      bool *have_cache,
+		      enum alloc_reserve reserve,
+		      unsigned flags,
+		      struct closure *cl)
+{
+	return bch2_trans_do(c, NULL, NULL, 0,
+		      bch2_bucket_alloc_set_trans(&trans, ptrs, stripe,
+						  devs_may_alloc, nr_replicas,
						  nr_effective, have_cache, reserve,
+						  flags, cl));
+}
+
 /* Allocate from stripes: */
 
 /*
@@ -513,7 +855,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
 	wp->ptrs = ptrs_skip;
 }
 
-static int open_bucket_add_buckets(struct bch_fs *c,
+static int open_bucket_add_buckets(struct btree_trans *trans,
 			struct open_buckets *ptrs,
 			struct write_point *wp,
 			struct bch_devs_list *devs_have,
@@ -526,6 +868,7 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 			unsigned flags,
 			struct closure *_cl)
 {
+	struct bch_fs *c = trans->c;
 	struct bch_devs_mask devs;
 	struct open_bucket *ob;
 	struct closure *cl = NULL;
@@ -557,7 +900,8 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 				target, erasure_code,
 				nr_replicas, nr_effective,
 				have_cache, flags, _cl);
-		if (ret == -FREELIST_EMPTY ||
+		if (ret == -EINTR ||
+		    ret == -FREELIST_EMPTY ||
 		    ret == -OPEN_BUCKETS_EMPTY)
 			return ret;
 		if (*nr_effective >= nr_replicas)
@@ -571,25 +915,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 	if (*nr_effective >= nr_replicas)
 		return 0;
 
-	percpu_down_read(&c->mark_lock);
-	rcu_read_lock();
-
 retry_blocking:
 	/*
 	 * Try nonblocking first, so that if one device is full we'll try from
 	 * other devices:
 	 */
-	ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
+	ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
 				nr_replicas, nr_effective, have_cache,
 				reserve, flags, cl);
-	if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
+	if (ret &&
+	    ret != -EINTR &&
+	    ret != -INSUFFICIENT_DEVICES &&
+	    !cl && _cl) {
 		cl = _cl;
 		goto retry_blocking;
 	}
 
-	rcu_read_unlock();
-	percpu_up_read(&c->mark_lock);
-
 	return ret;
 }
 
@@ -703,15 +1044,25 @@ static bool try_decrease_writepoints(struct bch_fs *c,
 	return true;
 }
 
-static struct write_point *writepoint_find(struct bch_fs *c,
+static void bch2_trans_mutex_lock(struct btree_trans *trans,
+				  struct mutex *lock)
+{
+	if (!mutex_trylock(lock)) {
+		bch2_trans_unlock(trans);
+		mutex_lock(lock);
+	}
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
 					   unsigned long write_point)
 {
+	struct bch_fs *c = trans->c;
 	struct write_point *wp, *oldest;
 	struct hlist_head *head;
 
 	if (!(write_point & 1UL)) {
 		wp = (struct write_point *) write_point;
-		mutex_lock(&wp->lock);
+		bch2_trans_mutex_lock(trans, &wp->lock);
 		return wp;
 	}
 
@@ -720,7 +1071,7 @@ restart_find:
 	wp = __writepoint_find(head, write_point);
 	if (wp) {
 lock_wp:
-		mutex_lock(&wp->lock);
+		bch2_trans_mutex_lock(trans, &wp->lock);
 		if (wp->write_point == write_point)
 			goto out;
 		mutex_unlock(&wp->lock);
@@ -733,8 +1084,8 @@ restart_find_oldest:
 		if (!oldest || time_before64(wp->last_used, oldest->last_used))
 			oldest = wp;
 
-	mutex_lock(&oldest->lock);
-	mutex_lock(&c->write_points_hash_lock);
+	bch2_trans_mutex_lock(trans, &oldest->lock);
+	bch2_trans_mutex_lock(trans, &c->write_points_hash_lock);
 	if (oldest >= c->write_points + c->write_points_nr ||
 	    try_increase_writepoints(c)) {
 		mutex_unlock(&c->write_points_hash_lock);
@@ -762,7 +1113,7 @@ out:
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
-int bch2_alloc_sectors_start(struct bch_fs *c,
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 			     unsigned target,
 			     unsigned erasure_code,
 			     struct write_point_specifier write_point,
@@ -774,6 +1125,7 @@ int bch2_alloc_sectors_start(struct bch_fs *c,
 			     struct closure *cl,
 			     struct write_point **wp_ret)
 {
+	struct bch_fs *c = trans->c;
 	struct write_point *wp;
 	struct open_bucket *ob;
 	struct open_buckets ptrs;
@@ -793,7 +1145,7 @@ retry:
 	write_points_nr = c->write_points_nr;
 	have_cache	= false;
 
-	*wp_ret = wp = writepoint_find(c, write_point.v);
+	*wp_ret = wp = writepoint_find(trans, write_point.v);
 
 	if (wp->data_type == BCH_DATA_user)
 		ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@@ -803,21 +1155,21 @@ retry:
 		have_cache = true;
 
 	if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve,
 					      ob_flags, cl);
 	} else {
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve,
 					      ob_flags, NULL);
-		if (!ret)
+		if (!ret || ret == -EINTR)
 			goto alloc_done;
 
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      0, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve,
@@ -871,10 +1223,33 @@ err:
 	case -INSUFFICIENT_DEVICES:
 		return -EROFS;
 	default:
-		BUG();
+		return ret;
 	}
 }
 
+int bch2_alloc_sectors_start(struct bch_fs *c,
+			     unsigned target,
+			     unsigned erasure_code,
+			     struct write_point_specifier write_point,
+			     struct bch_devs_list *devs_have,
+			     unsigned nr_replicas,
+			     unsigned nr_replicas_required,
+			     enum alloc_reserve reserve,
+			     unsigned flags,
+			     struct closure *cl,
+			     struct write_point **wp_ret)
+{
+	return bch2_trans_do(c, NULL, NULL, 0,
+			     bch2_alloc_sectors_start_trans(&trans, target,
+							erasure_code,
+							write_point,
+							devs_have,
+							nr_replicas,
+							nr_replicas_required,
+							reserve,
+							flags, cl, wp_ret));
+}
+
 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
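Editorial note: try_alloc_bucket() above unpacks a freespace-btree key offset with `free_entry & ~(~0ULL << 56)` (bucket number) and `free_entry >> 56` (generation bits). A minimal sketch of the corresponding packing, assuming only the 56/8 bit split visible in the hunk; the helper name is hypothetical, not part of the patch:

	static inline u64 freespace_entry(u64 bucket, unsigned genbits)
	{
		/* low 56 bits: bucket number; high 8 bits: generation bits */
		return (bucket & ~(~0ULL << 56)) | ((u64) genbits << 56);
	}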
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index efb6d155bd00..12583a7e7aa3 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -14,6 +14,8 @@ struct bch_devs_List;
 
 extern const char * const bch2_alloc_reserves[];
 
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
 struct dev_alloc_list {
 	unsigned	nr;
 	u8		devs[BCH_SB_MEMBERS_MAX];
@@ -136,6 +138,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
 		      unsigned, unsigned *, bool *, enum alloc_reserve,
 		      unsigned, struct closure *);
 
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+				   unsigned, unsigned,
+				   struct write_point_specifier,
+				   struct bch_devs_list *,
+				   unsigned, unsigned,
+				   enum alloc_reserve,
+				   unsigned,
+				   struct closure *,
+				   struct write_point **);
 int bch2_alloc_sectors_start(struct bch_fs *,
 			     unsigned, unsigned,
 			     struct write_point_specifier,
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 9e00afb17559..b3bef7074511 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -10,18 +10,6 @@
 
 struct ec_bucket_buf;
 
-#define ALLOC_THREAD_STATES()		\
-	x(stopped)			\
-	x(running)			\
-	x(blocked)			\
-	x(blocked_full)
-
-enum allocator_states {
-#define x(n)	ALLOCATOR_##n,
-	ALLOC_THREAD_STATES()
-#undef x
-};
-
 #define BCH_ALLOC_RESERVES()		\
 	x(btree_movinggc)		\
 	x(btree)			\
@@ -32,11 +20,8 @@ enum alloc_reserve {
 #define x(name)	RESERVE_##name,
 	BCH_ALLOC_RESERVES()
 #undef x
-	RESERVE_NR
 };
 
-typedef FIFO(long)	alloc_fifo;
-
 #define OPEN_BUCKETS_COUNT	1024
 
 #define WRITE_POINT_HASH_NR	32
@@ -127,12 +112,4 @@ struct write_point_specifier {
 	unsigned long		v;
 };
 
-struct alloc_heap_entry {
-	size_t			bucket;
-	size_t			nr;
-	unsigned long		key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
 #endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 6c11ebee73a9..879b2adc8b42 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -462,34 +462,18 @@ struct bch_dev {
 
 	/* Allocator: */
 	u64			new_fs_bucket_idx;
-	struct task_struct __rcu *alloc_thread;
+	u64			alloc_cursor;
 
-	/*
-	 * free: Buckets that are ready to be used
-	 *
-	 * free_inc: Incoming buckets - these are buckets that currently have
-	 * cached data in them, and we can't reuse them until after we write
-	 * their new gen to disk. After prio_write() finishes writing the new
-	 * gens/prios, they'll be moved to the free list (and possibly discarded
-	 * in the process)
-	 */
-	alloc_fifo		free[RESERVE_NR];
-	alloc_fifo		free_inc;
 	unsigned		nr_open_buckets;
+	unsigned		nr_btree_reserve;
 
 	open_bucket_idx_t	open_buckets_partial[OPEN_BUCKETS_COUNT];
 	open_bucket_idx_t	open_buckets_partial_nr;
 
-	size_t			fifo_last_bucket;
-
 	size_t			inc_gen_needs_gc;
 	size_t			inc_gen_really_needs_gc;
 	size_t			buckets_waiting_on_journal;
 
-	enum allocator_states	allocator_state;
-
-	alloc_heap		alloc_heap;
-
 	atomic64_t		rebalance_work;
 
 	struct journal_device	journal;
@@ -511,8 +495,6 @@ struct bch_dev {
 enum {
 	/* startup: */
 	BCH_FS_ALLOC_CLEAN,
-	BCH_FS_ALLOCATOR_RUNNING,
-	BCH_FS_ALLOCATOR_STOPPING,
 	BCH_FS_INITIAL_GC_DONE,
 	BCH_FS_INITIAL_GC_UNFIXED,
 	BCH_FS_TOPOLOGY_REPAIR_DONE,
@@ -914,6 +896,7 @@ mempool_t		bio_bounce_pages;
 	atomic_long_t		read_realloc_races;
 	atomic_long_t		extent_migrate_done;
 	atomic_long_t		extent_migrate_raced;
+	atomic_long_t		bucket_alloc_fail;
 
 	unsigned		btree_gc_periodic:1;
 	unsigned		copy_gc_enabled:1;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index c3d6c62ef062..7078b277e23b 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1684,9 +1684,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
  */
 int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
 {
-	struct bch_dev *ca;
 	u64 start_time = local_clock();
-	unsigned i, iter = 0;
+	unsigned iter = 0;
 	int ret;
 
 	lockdep_assert_held(&c->state_lock);
@@ -1788,13 +1787,6 @@ out:
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 
 	/*
-	 * Wake up allocator in case it was waiting for buckets
-	 * because of not being able to inc gens
-	 */
-	for_each_member_device(ca, c, i)
-		bch2_wake_allocator(ca);
-
-	/*
 	 * At startup, allocations can happen directly instead of via the
 	 * allocator thread - issue wakeup in case they blocked on gc_lock:
 	 */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index d1e3e2c76e30..2e958f88777b 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -178,12 +178,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
 	six_unlock_intent(&b->c.lock);
 }
 
-static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
 					     struct disk_reservation *res,
 					     struct closure *cl,
 					     bool interior_node,
 					     unsigned flags)
 {
+	struct bch_fs *c = trans->c;
 	struct write_point *wp;
 	struct btree *b;
 	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
@@ -214,7 +215,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 	mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-	ret = bch2_alloc_sectors_start(c,
+	ret = bch2_alloc_sectors_start_trans(trans,
 				      c->opts.metadata_target ?:
 				      c->opts.foreground_target,
 				      0,
@@ -414,7 +415,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
 	mutex_unlock(&c->btree_reserve_cache_lock);
 }
 
-static int bch2_btree_reserve_get(struct btree_update *as,
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+				  struct btree_update *as,
 				  unsigned nr_nodes[2],
 				  unsigned flags,
 				  struct closure *cl)
@@ -441,7 +443,7 @@ static int bch2_btree_reserve_get(struct btree_update *as,
 		struct prealloc_nodes *p = as->prealloc_nodes + interior;
 
 		while (p->nr < nr_nodes[interior]) {
-			b = __bch2_btree_node_alloc(c, &as->disk_res,
+			b = __bch2_btree_node_alloc(trans, &as->disk_res,
 					flags & BTREE_INSERT_NOWAIT ? NULL : cl,
 					interior, flags);
 			if (IS_ERR(b)) {
@@ -1066,8 +1068,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	if (ret)
 		goto err;
 
-	ret = bch2_btree_reserve_get(as, nr_nodes, flags, NULL);
-	if (ret) {
+	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+	if (ret == -EAGAIN ||
+	    ret == -ENOMEM) {
 		struct closure cl;
 
 		closure_init_stack(&cl);
@@ -1075,7 +1078,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 		bch2_trans_unlock(trans);
 
 		do {
-			ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
+			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
 			closure_sync(&cl);
 		} while (ret == -EAGAIN);
 	}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 5e247235ab69..2c6fdf385ba3 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -296,11 +296,6 @@ static inline int bucket_sectors_fragmented(struct bch_dev *ca,
 		: 0;
 }
 
-static inline int is_stripe_data_bucket(struct bucket_mark m)
-{
-	return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
 static inline enum bch_data_type bucket_type(struct bucket_mark m)
 {
 	return m.cached_sectors && !m.dirty_sectors
@@ -350,9 +345,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 	u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
 
 	preempt_enable();
-
-	if (!is_available_bucket(old) && is_available_bucket(new))
-		bch2_wake_allocator(ca);
 }
 
 static inline int __update_replicas(struct bch_fs *c,
@@ -488,19 +480,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
 	update_replicas_list(trans, &r.e, sectors);
 }
 
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-			    size_t b, bool owned_by_allocator)
-{
-	struct bucket *g = bucket(ca, b);
-	struct bucket_mark old, new;
-
-	old = bucket_cmpxchg(g, new, ({
-		new.owned_by_allocator	= owned_by_allocator;
-	}));
-
-	BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
 int bch2_mark_alloc(struct btree_trans *trans,
 		    struct bkey_s_c old, struct bkey_s_c new,
 		    unsigned flags)
@@ -560,6 +539,10 @@ int bch2_mark_alloc(struct btree_trans *trans,
 		}
 	}
 
+	if (!new_a.data_type &&
+	    (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
+		closure_wake_up(&c->freelist_wait);
+
 	if (bucket_state(new_a) == BUCKET_need_gc_gens) {
 		atomic_inc(&c->kick_gc);
 		wake_up_process(c->gc_thread);
@@ -583,7 +566,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
 		g->io_time[READ]	= new_a.io_time[READ];
 		g->io_time[WRITE]	= new_a.io_time[WRITE];
-		g->oldest_gen		= new_a.oldest_gen;
 		g->gen_valid		= 1;
 		g->stripe		= new_a.stripe;
 		g->stripe_redundancy	= new_a.stripe_redundancy;
@@ -1861,8 +1843,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
 	a->v.data_type		= type;
 	a->v.dirty_sectors	= sectors;
 
-	ret = bch2_trans_update(trans, &iter, &a->k_i,
-				BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
+	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
 	if (ret)
 		goto out;
 out:
@@ -2048,24 +2029,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	struct bucket_array *buckets = NULL, *old_buckets = NULL;
 	struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
 	unsigned long *buckets_nouse = NULL;
-	alloc_fifo	free[RESERVE_NR];
-	alloc_fifo	free_inc;
-	alloc_heap	alloc_heap;
-
-	size_t btree_reserve	= DIV_ROUND_UP(BTREE_NODE_RESERVE,
-			     ca->mi.bucket_size / btree_sectors(c));
-	/* XXX: these should be tunable */
-	size_t reserve_none	= max_t(size_t, 1, nbuckets >> 9);
-	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 6);
-	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
-			      btree_reserve * 2);
 	bool resize = ca->buckets[0] != NULL;
 	int ret = -ENOMEM;
-	unsigned i;
-
-	memset(&free,		0, sizeof(free));
-	memset(&free_inc,	0, sizeof(free_inc));
-	memset(&alloc_heap,	0, sizeof(alloc_heap));
 
 	if (!(buckets		= kvpmalloc(sizeof(struct bucket_array) +
 					    nbuckets * sizeof(struct bucket),
@@ -2075,12 +2040,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	    (c->opts.buckets_nouse &&
 	     !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
 					    sizeof(unsigned long),
-					    GFP_KERNEL|__GFP_ZERO))) ||
-	    !init_fifo(&free[RESERVE_movinggc],
-		       copygc_reserve, GFP_KERNEL) ||
-	    !init_fifo(&free[RESERVE_none], reserve_none, GFP_KERNEL) ||
-	    !init_fifo(&free_inc,	free_inc_nr, GFP_KERNEL) ||
-	    !init_heap(&alloc_heap,	ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+					    GFP_KERNEL|__GFP_ZERO))))
 		goto err;
 
 	buckets->first_bucket	= ca->mi.first_bucket;
@@ -2126,18 +2086,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 		up_write(&c->gc_lock);
 	}
 
-	spin_lock(&c->freelist_lock);
-	for (i = 0; i < RESERVE_NR; i++) {
-		fifo_move(&free[i], &ca->free[i]);
-		swap(ca->free[i], free[i]);
-	}
-	fifo_move(&free_inc, &ca->free_inc);
-	swap(ca->free_inc, free_inc);
-	spin_unlock(&c->freelist_lock);
-
-	/* with gc lock held, alloc_heap can't be in use: */
-	swap(ca->alloc_heap, alloc_heap);
-
 	nbuckets = ca->mi.nbuckets;
 
 	if (resize)
@@ -2145,10 +2093,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	ret = 0;
 err:
-	free_heap(&alloc_heap);
-	free_fifo(&free_inc);
-	for (i = 0; i < RESERVE_NR; i++)
-		free_fifo(&free[i]);
 	kvpfree(buckets_nouse,
 		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
 	if (bucket_gens)
@@ -2163,10 +2107,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
 {
 	unsigned i;
 
-	free_heap(&ca->alloc_heap);
-	free_fifo(&ca->free_inc);
-	for (i = 0; i < RESERVE_NR; i++)
-		free_fifo(&ca->free[i]);
 	kvpfree(ca->buckets_nouse,
 		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
 	kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 757919d5e20f..bcb40f15f82e 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -58,11 +58,6 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
 	return __bucket(ca, b, true);
 }
 
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
-	return __bucket(ca, b, false);
-}
-
 static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
 {
 	return rcu_dereference_check(ca->bucket_gens,
@@ -151,50 +146,50 @@ static inline bool is_available_bucket(struct bucket_mark mark)
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
 
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
-					  struct bch_dev_usage stats)
+					  struct bch_dev_usage stats,
+					  enum alloc_reserve reserve)
 {
-	u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+	s64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+	s64 reserved = 0;
+
+	switch (reserve) {
+	case RESERVE_none:
+		reserved += ca->mi.nbuckets >> 6;
+		fallthrough;
+	case RESERVE_movinggc:
+		reserved += ca->nr_btree_reserve;
+		fallthrough;
+	case RESERVE_btree:
+		reserved += ca->nr_btree_reserve;
+		fallthrough;
+	case RESERVE_btree_movinggc:
+		break;
+	default:
+		BUG();
+	}
 
 	if (WARN_ONCE(stats.buckets_unavailable > total,
 		      "buckets_unavailable overflow (%llu > %llu)\n",
 		      stats.buckets_unavailable, total))
 		return 0;
 
-	return total - stats.buckets_unavailable;
+	return max_t(s64, 0,
+		     total -
+		     stats.buckets_unavailable -
+		     ca->nr_open_buckets -
+		     reserved);
 }
 
-static inline u64 dev_buckets_available(struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+					enum alloc_reserve reserve)
 {
-	return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
-}
-
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
-					    struct bch_dev_usage stats)
-{
-	struct bch_fs *c = ca->fs;
-	s64 available = __dev_buckets_available(ca, stats);
-	unsigned i;
-
-	spin_lock(&c->freelist_lock);
-	for (i = 0; i < RESERVE_NR; i++)
-		available -= fifo_used(&ca->free[i]);
-	available -= fifo_used(&ca->free_inc);
-	available -= ca->nr_open_buckets;
-	spin_unlock(&c->freelist_lock);
-
-	return max(available, 0LL);
-}
-
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
-{
-	return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+	return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
 }
 
 /* Filesystem usage: */
 
 static inline unsigned fs_usage_u64s(struct bch_fs *c)
 {
-
 	return sizeof(struct bch_fs_usage) / sizeof(u64) +
 		READ_ONCE(c->replicas.nr);
 }
@@ -222,7 +217,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
 
 void bch2_fs_usage_initialize(struct bch_fs *);
 
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
 void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
 			       size_t, enum bch_data_type, unsigned,
 			       struct gc_pos, unsigned);
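Editorial note: the fallthrough ladder in __dev_buckets_available() above accumulates reserves, so each reserve class sees progressively fewer free buckets. Worked through with hypothetical numbers (nbuckets = 65536, nr_btree_reserve = 128), not taken from the patch:

	/*
	 * RESERVE_btree_movinggc: reserved = 0
	 * RESERVE_btree:          reserved = 128               (nr_btree_reserve)
	 * RESERVE_movinggc:       reserved = 128 + 128  = 256
	 * RESERVE_none:           reserved = 256 + (65536 >> 6) = 1280
	 */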
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 4f7018398385..6ddbea4da7d1 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -14,7 +14,6 @@ struct bucket_mark {
 	struct {
 	u8		gen;
 	u8		data_type:3,
-			owned_by_allocator:1,
 			stripe:1;
 	u16		dirty_sectors;
 	u16		cached_sectors;
@@ -29,7 +28,6 @@ struct bucket {
 	};
 
 	u64			io_time[2];
-	u8			oldest_gen;
 	unsigned		gen_valid:1;
 	u8			stripe_redundancy;
 	u32			stripe;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 9dc2f9f822c8..5030a5b831af 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1295,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
 	BUG_ON(nr_have_data	> h->s->nr_data);
 	BUG_ON(nr_have_parity	> h->s->nr_parity);
 
-	percpu_down_read(&c->mark_lock);
-	rcu_read_lock();
-
 	buckets.nr = 0;
 	if (nr_have_parity < h->s->nr_parity) {
 		ret = bch2_bucket_alloc_set(c, &buckets,
@@ -1324,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
 		}
 
 		if (ret)
-			goto err;
+			return ret;
 	}
 
 	buckets.nr = 0;
@@ -1352,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
 		}
 
 		if (ret)
-			goto err;
+			return ret;
 	}
 
-err:
-	rcu_read_unlock();
-	percpu_up_read(&c->mark_lock);
-	return ret;
+
+	return 0;
 }
 
 /* XXX: doesn't obey target: */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index cb15d1c8a135..f87f76553bf4 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -812,10 +812,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 				break;
 			}
 		} else {
-			rcu_read_lock();
 			ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
 						       false, cl);
-			rcu_read_unlock();
 			if (IS_ERR(ob[nr_got])) {
 				ret = cl ? -EAGAIN : -ENOSPC;
 				break;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index b2c3ee336c1f..3e418342ee67 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1398,6 +1398,10 @@ static void journal_write_done(struct closure *cl)
 		if (!JSET_NO_FLUSH(w->data)) {
 			j->flushed_seq_ondisk = seq;
 			j->last_seq_ondisk = w->last_seq;
+
+			closure_wake_up(&c->freelist_wait);
+
+			bch2_reset_alloc_cursors(c);
 		}
 	} else if (!j->err_seq || seq < j->err_seq)
 		j->err_seq	= seq;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index aecec55eb421..b9e1bd7b1d05 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -104,18 +104,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
 	return DATA_SKIP;
 }
 
-static bool have_copygc_reserve(struct bch_dev *ca)
-{
-	bool ret;
-
-	spin_lock(&ca->fs->freelist_lock);
-	ret = fifo_full(&ca->free[RESERVE_movinggc]) ||
-		ca->allocator_state != ALLOCATOR_running;
-	spin_unlock(&ca->fs->freelist_lock);
-
-	return ret;
-}
-
 static inline int fragmentation_cmp(copygc_heap *heap,
 				   struct copygc_heap_entry l,
 				   struct copygc_heap_entry r)
@@ -247,11 +235,10 @@ static int bch2_copygc(struct bch_fs *c)
 	}
 
 	for_each_rw_member(ca, c, dev_idx) {
-		closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
+		s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc),
+				ca->mi.nbuckets >> 6);
 
-		spin_lock(&ca->fs->freelist_lock);
-		sectors_reserved += fifo_used(&ca->free[RESERVE_movinggc]) * ca->mi.bucket_size;
-		spin_unlock(&ca->fs->freelist_lock);
+		sectors_reserved += avail * ca->mi.bucket_size;
 	}
 
 	ret = walk_buckets_to_copygc(c);
@@ -352,8 +339,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
 	for_each_rw_member(ca, c, dev_idx) {
 		struct bch_dev_usage usage = bch2_dev_usage_read(ca);
 
-		fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
-				       ca->mi.bucket_size) >> 1);
+		fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+				       ca->mi.bucket_size) >> 1);
 		fragmented = usage.d[BCH_DATA_user].fragmented;
 
 		wait = min(wait, max(0LL, fragmented_allowed - fragmented));
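Editorial note: with the freelists gone, bch2_copygc() above estimates its reserve from live accounting instead of a FIFO. A sketch of the arithmetic with hypothetical numbers (nbuckets = 65536, bucket_size = 512 sectors, 2000 buckets available at RESERVE_movinggc), not taken from the patch:

	/*
	 *   avail             = min(2000, 65536 >> 6) = min(2000, 1024) = 1024
	 *   sectors_reserved += 1024 * 512            = 524288
	 */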
bch2_journal_flush_all_pins(&c->journal); - /* - * If the allocator threads didn't all start up, the btree updates to - * write out alloc info aren't going to work: - */ - if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) - goto nowrote_alloc; - bch_verbose(c, "flushing journal and stopping allocators"); bch2_journal_flush_all_pins(&c->journal); - set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); do { clean_passes++; @@ -241,17 +233,11 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete"); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -nowrote_alloc: + closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); flush_work(&c->btree_interior_update_work); - for_each_member_device(ca, c, i) - bch2_dev_allocator_stop(ca); - - clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); - bch2_fs_journal_stop(&c->journal); /* @@ -287,10 +273,6 @@ void bch2_fs_read_only(struct bch_fs *c) /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch2_dev_allocator_stop()). */ percpu_ref_kill(&c->writes); @@ -419,20 +401,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - bch_err(c, "error starting allocator threads"); - percpu_ref_put(&ca->io_ref); - goto err; - } - } - - set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - - for_each_rw_member(ca, c, i) - bch2_wake_allocator(ca); - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -946,20 +914,6 @@ int bch2_fs_start(struct bch_fs *c) set_bit(BCH_FS_STARTED, &c->flags); - /* - * Allocator threads don't start filling copygc reserve until after we - * set BCH_FS_STARTED - wake them now: - * - * XXX ugly hack: - * Need to set ca->allocator_state here instead of relying on the - * allocator threads to do it to avoid racing with the copygc threads - * checking it and thinking they have no alloc reserve: - */ - for_each_online_member(ca, c, i) { - ca->allocator_state = ALLOCATOR_running; - bch2_wake_allocator(ca); - } - if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { @@ -1051,8 +1005,6 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - bch2_dev_allocator_stop(ca); - cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1167,6 +1119,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / btree_sectors(c)); + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, @@ -1216,12 +1171,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; - if (ca->mi.state == BCH_MEMBER_STATE_rw && - bch2_dev_allocator_start(ca)) { - bch2_dev_free(ca); - goto err; - } - bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1405,14 +1354,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) /* * The allocator thread itself allocates btree nodes, so stop it first: */ - bch2_dev_allocator_stop(ca); 
bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); bch2_copygc_start(c); } -static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1420,8 +1368,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - - return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1448,7 +1394,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); if (new_state == BCH_MEMBER_STATE_rw) - ret = __bch2_dev_read_write(c, ca); + __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1710,13 +1656,8 @@ have_slot: ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) { - bch_err(c, "device add error: error going RW on new device: %i", ret); - goto err_late; - } - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return 0; @@ -1776,11 +1717,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) - goto err; - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ec672134cb18..e995b84b6172 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -170,7 +170,6 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); @@ -186,11 +185,11 @@ read_attribute(internal_uuid); read_attribute(has_data); read_attribute(alloc_debug); -write_attribute(wake_allocator); read_attribute(read_realloc_races); read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); +read_attribute(bucket_alloc_fail); rw_attribute(discard); rw_attribute(label); @@ -377,6 +376,8 @@ SHOW(bch2_fs) atomic_long_read(&c->extent_migrate_done)); sysfs_print(extent_migrate_raced, atomic_long_read(&c->extent_migrate_raced)); + sysfs_print(bucket_alloc_fail, + atomic_long_read(&c->bucket_alloc_fail)); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); @@ -577,6 +578,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_read_realloc_races, &sysfs_extent_migrate_done, &sysfs_extent_migrate_raced, + &sysfs_bucket_alloc_fail, &sysfs_gc_gens_pos, @@ -705,24 +707,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -{ - enum alloc_reserve i; - - spin_lock(&ca->fs->freelist_lock); - - pr_buf(out, "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - for (i = 0; i < RESERVE_NR; i++) - pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - ca->free[i].size); - - spin_unlock(&ca->fs->freelist_lock); -} - static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; @@ -748,9 +732,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "ec\t%16llu\n" "available%15llu\n" "\n" - "free_inc\t\t%zu/%zu\n" - "free[RESERVE_MOVINGGC]\t%zu/%zu\n" - "free[RESERVE_NONE]\t%zu/%zu\n" "freelist_wait\t\t%s\n" "open buckets 
allocated\t%u\n" "open buckets this dev\t%u\n" @@ -758,13 +739,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" - "btree reserve cache\t%u\n" - "thread state:\t\t%s\n", + "btree reserve cache\t%u\n", stats.buckets_ec, - __dev_buckets_available(ca, stats), - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_movinggc]), ca->free[RESERVE_movinggc].size, - fifo_used(&ca->free[RESERVE_none]), ca->free[RESERVE_none].size, + __dev_buckets_available(ca, stats, RESERVE_none), c->freelist_wait.list.first ? "waiting" : "empty", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ca->nr_open_buckets, @@ -772,8 +749,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) c->open_buckets_wait.list.first ? "waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], - c->btree_reserve_cache_nr, - bch2_allocator_states[ca->allocator_state]); + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { @@ -848,9 +824,6 @@ SHOW(bch2_dev) clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_reserve_stats) - reserve_stats_to_text(out, ca); - if (attr == &sysfs_alloc_debug) dev_alloc_debug_to_text(out, ca); @@ -890,9 +863,6 @@ STORE(bch2_dev) return ret; } - if (attr == &sysfs_wake_allocator) - bch2_wake_allocator(ca); - return size; } SYSFS_OPS(bch2_dev); @@ -918,11 +888,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - &sysfs_reserve_stats, - /* debug: */ &sysfs_alloc_debug, - &sysfs_wake_allocator, NULL }; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 89207fd7b617..caf59b977e2f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -471,37 +471,74 @@ TRACE_EVENT(invalidate, ); DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve), + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 avail, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, + int ret), + TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, reserve, 16 ) + __field(dev_t, dev ) + __array(char, reserve, 16 ) + __field(u64, avail ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, nouse ) + __field(bool, nonblocking ) + __field(int, ret ) ), TP_fast_assign( __entry->dev = ca->dev; strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->avail = avail; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->nouse = nouse; + __entry->nonblocking = nonblocking; + __entry->ret = ret; ), - TP_printk("%d,%d reserve %s", + TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->reserve) + __entry->reserve, + __entry->avail, + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->nouse, + __entry->nonblocking, + __entry->ret) ); DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve) + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 avail, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, + int 
ret),
+	TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret)
 );
 
 DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
-	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
-	TP_ARGS(ca, alloc_reserve)
-);
-
-DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
-	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
-	TP_ARGS(ca, alloc_reserve)
+	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+		 u64 avail,
+		 u64 seen,
+		 u64 open,
+		 u64 need_journal_commit,
+		 u64 nouse,
+		 bool nonblocking,
+		 int ret),
+	TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret)
 );
 
 /* Moving IO */
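
Editor's note: the buckets.h hunk above replaces the old freelist accounting with a reserve-aware availability calculation, where each less-strict reserve level sets aside progressively more buckets via fallthrough. Below is a standalone userspace sketch of that cascade; the enum, function shape, and arithmetic mirror the patch, but the enumerator values and all sample numbers are invented for illustration (the kernel version also BUG()s on an unknown reserve, omitted here).

#include <stdio.h>
#include <stdint.h>

enum alloc_reserve {
	RESERVE_btree_movinggc,	/* strictest: sees every free bucket */
	RESERVE_btree,
	RESERVE_movinggc,
	RESERVE_none,		/* normal foreground writes */
};

static int64_t buckets_available(int64_t nbuckets, int64_t first_bucket,
				 int64_t unavailable, int64_t nr_open,
				 int64_t nr_btree_reserve,
				 enum alloc_reserve reserve)
{
	int64_t total = nbuckets - first_bucket;
	int64_t reserved = 0;

	switch (reserve) {
	case RESERVE_none:		/* keep ~1/64th of the device back */
		reserved += nbuckets >> 6;
		/* fall through */
	case RESERVE_movinggc:		/* copygc may dip into that headroom */
		reserved += nr_btree_reserve;
		/* fall through */
	case RESERVE_btree:		/* btree still keeps one reserve back */
		reserved += nr_btree_reserve;
		/* fall through */
	case RESERVE_btree_movinggc:	/* nothing held back */
		break;
	}

	int64_t avail = total - unavailable - nr_open - reserved;
	return avail > 0 ? avail : 0;	/* clamp, as max_t(s64, 0, ...) does */
}

int main(void)
{
	for (enum alloc_reserve r = RESERVE_btree_movinggc; r <= RESERVE_none; r++)
		printf("reserve %d: %lld buckets\n", (int) r,
		       (long long) buckets_available(1 << 20, 64, 1 << 18,
						     32, 512, r));
	return 0;
}

Each reserve level thus sees a strictly smaller pool than the one below it, which is what lets the patch delete dev_buckets_reclaimable() and the per-reserve freelist fifos outright.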
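
The movinggc.c hunks change copygc from waiting on a filled RESERVE_movinggc fifo to budgeting directly off bucket availability. This is a userspace sketch of the two calculations as the patch leaves them: the per-pass reserve budget in bch2_copygc(), clamped to 1/64th of the device, and the wait amount from bch2_copygc_wait_amount(), which lets fragmentation grow to half of what is still allocatable. All input values are made up.

#include <stdio.h>
#include <stdint.h>

static int64_t min64(int64_t a, int64_t b) { return a < b ? a : b; }
static int64_t max64(int64_t a, int64_t b) { return a > b ? a : b; }

int main(void)
{
	int64_t nbuckets	= 1 << 20;  /* device size in buckets */
	int64_t bucket_size	= 1024;     /* sectors per bucket */
	int64_t avail_movinggc	= 70000;    /* dev_buckets_available(ca, RESERVE_movinggc) */
	int64_t avail_none	= 52000;    /* __dev_buckets_available(ca, usage, RESERVE_none) */
	int64_t fragmented	= 9 << 20;  /* usage.d[BCH_DATA_user].fragmented, in sectors */

	/* bch2_copygc(): how much copygc may move in one pass */
	int64_t sectors_reserved =
		min64(avail_movinggc, nbuckets >> 6) * bucket_size;

	/* bch2_copygc_wait_amount(): how much more may be written first */
	int64_t fragmented_allowed = (avail_none * bucket_size) >> 1;
	int64_t wait = max64(0, fragmented_allowed - fragmented);

	printf("sectors_reserved %lld, wait %lld sectors\n",
	       (long long) sectors_reserved, (long long) wait);
	return 0;
}

Note the design shift: since there is no allocator thread left to keep a fifo topped up, copygc no longer blocks on have_copygc_reserve(); it simply reads availability at the RESERVE_movinggc level, which the cascade above guarantees is at least as large as what foreground writes see.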
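
Finally, the trace.h hunk widens the bucket_alloc event class so a bucket_alloc_fail record carries the allocator's whole decision state rather than just the reserve name. The mock below reproduces the TP_printk() format from the patch in plain userspace C, to show what a trace line will look like; the field glosses in the comments are inferred from the counter names, and every sample value (including the error code) is hypothetical.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct bucket_alloc_event {
	unsigned	major, minor;	/* MAJOR()/MINOR() of ca->dev */
	const char	*reserve;
	uint64_t	avail;		/* buckets usable at this reserve */
	uint64_t	seen;		/* buckets examined this attempt */
	uint64_t	open;		/* skipped: bucket already open */
	uint64_t	need_journal_commit; /* skipped: awaiting journal flush */
	uint64_t	nouse;		/* skipped: marked nouse */
	bool		nonblocking;
	int		ret;
};

static void print_bucket_alloc(const struct bucket_alloc_event *e)
{
	/* same format string as the patched TP_printk() */
	printf("%u,%u reserve %s avail %llu seen %llu open %llu "
	       "need_journal_commit %llu nouse %llu nonblocking %u ret %i\n",
	       e->major, e->minor, e->reserve,
	       (unsigned long long) e->avail,
	       (unsigned long long) e->seen,
	       (unsigned long long) e->open,
	       (unsigned long long) e->need_journal_commit,
	       (unsigned long long) e->nouse,
	       e->nonblocking, e->ret);
}

int main(void)
{
	struct bucket_alloc_event e = {
		.major = 254, .minor = 0, .reserve = "none",
		.avail = 0, .seen = 4096, .open = 7,
		.need_journal_commit = 4089, .nouse = 0,
		.nonblocking = true, .ret = -28, /* sample: -ENOSPC */
	};
	print_bucket_alloc(&e);
	return 0;
}

A record like this one, where nearly every bucket seen is waiting on a journal commit, is exactly the situation the journal_io.c hunk addresses: journal_write_done() now wakes freelist_wait and resets the allocation cursors once the flush reaches disk, rather than relying on an allocator thread to notice.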