-rw-r--r-- | fs/bcachefs/alloc_background.c     | 560
-rw-r--r-- | fs/bcachefs/alloc_background.h     |  33
-rw-r--r-- | fs/bcachefs/alloc_foreground.c     | 561
-rw-r--r-- | fs/bcachefs/alloc_foreground.h     |  11
-rw-r--r-- | fs/bcachefs/alloc_types.h          |  23
-rw-r--r-- | fs/bcachefs/bcachefs.h             |  23
-rw-r--r-- | fs/bcachefs/btree_gc.c             |  10
-rw-r--r-- | fs/bcachefs/btree_update_interior.c |  17
-rw-r--r-- | fs/bcachefs/buckets.c              |  72
-rw-r--r-- | fs/bcachefs/buckets.h              |  62
-rw-r--r-- | fs/bcachefs/buckets_types.h        |   2
-rw-r--r-- | fs/bcachefs/ec.c                   |  13
-rw-r--r-- | fs/bcachefs/journal.c              |   2
-rw-r--r-- | fs/bcachefs/journal_io.c           |   4
-rw-r--r-- | fs/bcachefs/movinggc.c             |  23
-rw-r--r-- | fs/bcachefs/recovery.c             |   2
-rw-r--r-- | fs/bcachefs/super.c                |  82
-rw-r--r-- | fs/bcachefs/sysfs.c                |  47
-rw-r--r-- | fs/bcachefs/trace.h                |  67
19 files changed, 615 insertions, 999 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 5d553d9b6151..3ba2b35fad53 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -27,13 +27,6 @@
 #include <linux/sched/task.h>
 #include <linux/sort.h>
 
-const char * const bch2_allocator_states[] = {
-#define x(n)	#n,
-	ALLOC_THREAD_STATES()
-#undef x
-	NULL
-};
-
 /* Persistent alloc info: */
 
 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -431,7 +424,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
 			g->_mark.gen		= a.gen;
 			g->io_time[READ]	= a.io_time[READ];
 			g->io_time[WRITE]	= a.io_time[WRITE];
-			g->oldest_gen		= !gc ? a.oldest_gen : a.gen;
 			g->gen_valid		= 1;
 
 			if (!gc ||
@@ -553,7 +545,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
 		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
 		new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
 		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
-		SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
 	}
 
 	if (old_a.data_type && !new_a->data_type &&
@@ -698,493 +689,6 @@ out:
 	return ret;
 }
 
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-static inline u8 bucket_gc_gen(struct bucket *g)
-{
-	return g->mark.gen - g->oldest_gen;
-}
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
-				       struct bucket_mark m)
-{
-	u8 gc_gen;
-
-	if (!is_available_bucket(m))
-		return false;
-
-	if (m.owned_by_allocator)
-		return false;
-
-	if (ca->buckets_nouse &&
-	    test_bit(b, ca->buckets_nouse))
-		return false;
-
-	if (ca->new_fs_bucket_idx) {
-		/*
-		 * Device or filesystem is still being initialized, and we
-		 * haven't fully marked superblocks & journal:
-		 */
-		if (is_superblock_bucket(ca, b))
-			return false;
-
-		if (b < ca->new_fs_bucket_idx)
-			return false;
-	}
-
-	gc_gen = bucket_gc_gen(bucket(ca, b));
-
-	ca->inc_gen_needs_gc		+= gc_gen >= BUCKET_GC_GEN_MAX / 2;
-	ca->inc_gen_really_needs_gc	+= gc_gen >= BUCKET_GC_GEN_MAX;
-
-	return gc_gen < BUCKET_GC_GEN_MAX;
-}
-
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
-
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
-				u64 now, u64 last_seq_ondisk)
-{
-	unsigned used = m.cached_sectors;
-
-	if (used) {
-		/*
-		 * Prefer to keep buckets that have been read more recently, and
-		 * buckets that have more data in them:
-		 */
-		u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
-		u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
-
-		return -last_read_scaled;
-	} else {
-		/*
-		 * Prefer to use buckets with smaller gc_gen so that we don't
-		 * have to walk the btree and recalculate oldest_gen - but shift
-		 * off the low bits so that buckets will still have equal sort
-		 * keys when there's only a small difference, so that we can
-		 * keep sequential buckets together:
-		 */
-		return bucket_gc_gen(g) >> 4;
-	}
-}
-
-static inline int bucket_alloc_cmp(alloc_heap *h,
-				   struct alloc_heap_entry l,
-				   struct alloc_heap_entry r)
-{
-	return cmp_int(l.key, r.key) ?:
-		cmp_int(r.nr, l.nr) ?:
-		cmp_int(l.bucket, r.bucket);
-}
-
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
-	const struct alloc_heap_entry *l = _l, *r = _r;
-
-	return cmp_int(l->bucket, r->bucket);
-}
-
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
-	struct bucket_array *buckets;
-	struct alloc_heap_entry e = { 0 };
-	u64 now, last_seq_ondisk;
-	size_t b, i, nr = 0;
-
-	down_read(&ca->bucket_lock);
-
-	buckets = bucket_array(ca);
-	ca->alloc_heap.used = 0;
-	now = atomic64_read(&c->io_clock[READ].now);
-	last_seq_ondisk = c->journal.flushed_seq_ondisk;
-
-	/*
-	 * Find buckets with lowest read priority, by building a maxheap sorted
-	 * by read priority and repeatedly replacing the maximum element until
-	 * all buckets have been visited.
-	 */
-	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-		struct bucket *g = &buckets->b[b];
-		struct bucket_mark m = READ_ONCE(g->mark);
-		unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
-
-		cond_resched();
-
-		if (!bch2_can_invalidate_bucket(ca, b, m))
-			continue;
-
-		if (!m.data_type &&
-		    bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-						     last_seq_ondisk,
-						     ca->dev_idx, b)) {
-			ca->buckets_waiting_on_journal++;
-			continue;
-		}
-
-		if (e.nr && e.bucket + e.nr == b && e.key == key) {
-			e.nr++;
-		} else {
-			if (e.nr)
-				heap_add_or_replace(&ca->alloc_heap, e,
-					-bucket_alloc_cmp, NULL);
-
-			e = (struct alloc_heap_entry) {
-				.bucket = b,
-				.nr	= 1,
-				.key	= key,
-			};
-		}
-	}
-
-	if (e.nr)
-		heap_add_or_replace(&ca->alloc_heap, e,
-				-bucket_alloc_cmp, NULL);
-
-	for (i = 0; i < ca->alloc_heap.used; i++)
-		nr += ca->alloc_heap.data[i].nr;
-
-	while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
-		nr -= ca->alloc_heap.data[0].nr;
-		heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
-	}
-
-	up_read(&ca->bucket_lock);
-}
-
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
-	size_t i, nr = 0;
-
-	ca->inc_gen_needs_gc		= 0;
-	ca->inc_gen_really_needs_gc	= 0;
-	ca->buckets_waiting_on_journal	= 0;
-
-	find_reclaimable_buckets_lru(c, ca);
-
-	heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
-
-	for (i = 0; i < ca->alloc_heap.used; i++)
-		nr += ca->alloc_heap.data[i].nr;
-
-	return nr;
-}
-
-static int bucket_invalidate_btree(struct btree_trans *trans,
-				   struct bch_dev *ca, u64 b,
-				   struct bkey_i_alloc_v4 *a)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	int ret;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-			     POS(ca->dev_idx, b),
-			     BTREE_ITER_CACHED|
-			     BTREE_ITER_INTENT);
-
-	k = bch2_btree_iter_peek_slot(&iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err;
-
-	bkey_alloc_v4_init(&a->k_i);
-	a->k.p			= iter.pos;
-	bch2_alloc_to_v4(k, &a->v);
-	a->v.gen++;
-	a->v.data_type		= 0;
-	a->v.dirty_sectors	= 0;
-	a->v.cached_sectors	= 0;
-	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
-	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);
-
-	ret = bch2_trans_update(trans, &iter, &a->k_i,
-				BTREE_TRIGGER_BUCKET_INVALIDATE|
-				BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
-				      u64 *journal_seq, unsigned flags)
-{
-	struct bkey_i_alloc_v4 a;
-	size_t b;
-	u64 commit_seq = 0;
-	int ret = 0;
-
-	/*
-	 * If the read-only path is trying to shut down, we can't be generating
-	 * new btree updates:
-	 */
-	if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
-		return 1;
-
-	BUG_ON(!ca->alloc_heap.used ||
-	       !ca->alloc_heap.data[0].nr);
-	b = ca->alloc_heap.data[0].bucket;
-
-	/* first, put on free_inc and mark as owned by allocator: */
-	percpu_down_read(&c->mark_lock);
-
-	bch2_mark_alloc_bucket(c, ca, b, true);
-
-	spin_lock(&c->freelist_lock);
-	verify_not_on_freelist(c, ca, b);
-	BUG_ON(!fifo_push(&ca->free_inc, b));
-	spin_unlock(&c->freelist_lock);
-
-	percpu_up_read(&c->mark_lock);
-
-	ret = bch2_trans_do(c, NULL, &commit_seq,
-			    BTREE_INSERT_NOCHECK_RW|
-			    BTREE_INSERT_NOFAIL|
-			    flags,
-			    bucket_invalidate_btree(&trans, ca, b, &a));
-
-	if (!ret) {
-		/* remove from alloc_heap: */
-		struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
-		top->bucket++;
-		top->nr--;
-
-		if (!top->nr)
-			heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-
-		/*
-		 * If we invalidating cached data then we need to wait on the
-		 * journal commit:
-		 */
-		if (a.v.data_type)
-			*journal_seq = max(*journal_seq, commit_seq);
-
-		/*
-		 * We already waiting on u.alloc_seq when we filtered out
-		 * buckets that need journal commit:
-		 */
-		BUG_ON(*journal_seq > a.v.journal_seq);
-	} else {
-		size_t b2;
-
-		/* remove from free_inc: */
-		percpu_down_read(&c->mark_lock);
-		spin_lock(&c->freelist_lock);
-
-		bch2_mark_alloc_bucket(c, ca, b, false);
-
-		BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
-		BUG_ON(b != b2);
-
-		spin_unlock(&c->freelist_lock);
-		percpu_up_read(&c->mark_lock);
-	}
-
-	return ret < 0 ? ret : 0;
-}
-
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
-	u64 journal_seq = 0;
-	int ret = 0;
-
-	/* Only use nowait if we've already invalidated at least one bucket: */
-	while (!ret &&
-	       !fifo_full(&ca->free_inc) &&
-	       ca->alloc_heap.used) {
-		if (kthread_should_stop()) {
-			ret = 1;
-			break;
-		}
-
-		ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
-				(!fifo_empty(&ca->free_inc)
-				 ? BTREE_INSERT_NOWAIT : 0));
-		/*
-		 * We only want to batch up invalidates when they're going to
-		 * require flushing the journal:
-		 */
-		if (!journal_seq)
-			break;
-	}
-
-	/* If we used NOWAIT, don't return the error: */
-	if (!fifo_empty(&ca->free_inc))
-		ret = 0;
-	if (ret < 0)
-		bch_err(ca, "error invalidating buckets: %i", ret);
-	if (ret)
-		return ret;
-
-	if (journal_seq)
-		ret = bch2_journal_flush_seq(&c->journal, journal_seq);
-	if (ret) {
-		bch_err(ca, "journal error: %i", ret);
-		return ret;
-	}
-
-	return 0;
-}
-
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
-	if (ca->allocator_state != new_state) {
-		ca->allocator_state = new_state;
-		closure_wake_up(&ca->fs->freelist_wait);
-	}
-}
-
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
-	unsigned i;
-	int ret = 0;
-
-	spin_lock(&c->freelist_lock);
-	for (i = 0; i < RESERVE_NR; i++) {
-		/*
-		 * Don't strand buckets on the copygc freelist until
-		 * after recovery is finished:
-		 */
-		if (i == RESERVE_movinggc &&
-		    !test_bit(BCH_FS_STARTED, &c->flags))
-			continue;
-
-		if (fifo_push(&ca->free[i], b)) {
-			fifo_pop(&ca->free_inc, b);
-			ret = 1;
-			break;
-		}
-	}
-	spin_unlock(&c->freelist_lock);
-
-	ca->allocator_state = ret
-		? ALLOCATOR_running
-		: ALLOCATOR_blocked_full;
-	closure_wake_up(&c->freelist_wait);
-	return ret;
-}
-
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
-	if (!c->opts.nochanges &&
-	    ca->mi.discard &&
-	    bdev_max_discard_sectors(ca->disk_sb.bdev))
-		blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
-				     ca->mi.bucket_size, GFP_NOFS);
-}
-
-static bool allocator_thread_running(struct bch_dev *ca)
-{
-	unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
-		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
-		? ALLOCATOR_running
-		: ALLOCATOR_stopped;
-	alloc_thread_set_state(ca, state);
-	return state == ALLOCATOR_running;
-}
-
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
-	s64 available = dev_buckets_reclaimable(ca) -
-		(gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
-	bool ret = available > 0;
-
-	alloc_thread_set_state(ca, ret
-			       ? ALLOCATOR_running
-			       : ALLOCATOR_blocked);
-	return ret;
-}
-
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
-{
-	struct bch_dev *ca = arg;
-	struct bch_fs *c = ca->fs;
-	unsigned long gc_count = c->gc_count;
-	size_t nr;
-	int ret;
-
-	set_freezable();
-
-	while (1) {
-		ret = kthread_wait_freezable(allocator_thread_running(ca));
-		if (ret)
-			goto stop;
-
-		while (!ca->alloc_heap.used) {
-			cond_resched();
-
-			ret = kthread_wait_freezable(buckets_available(ca, gc_count));
-			if (ret)
-				goto stop;
-
-			gc_count = c->gc_count;
-			nr = find_reclaimable_buckets(c, ca);
-
-			if (!nr && ca->buckets_waiting_on_journal) {
-				ret = bch2_journal_flush(&c->journal);
-				if (ret)
-					goto stop;
-			} else if (nr < (ca->mi.nbuckets >> 6) &&
-				   ca->buckets_waiting_on_journal >= nr / 2) {
-				bch2_journal_flush_async(&c->journal, NULL);
-			}
-
-			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
-			     ca->inc_gen_really_needs_gc) &&
-			    c->gc_thread) {
-				atomic_inc(&c->kick_gc);
-				wake_up_process(c->gc_thread);
-			}
-
-			trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
-					 ca->inc_gen_really_needs_gc);
-		}
-
-		ret = bch2_invalidate_buckets(c, ca);
-		if (ret)
-			goto stop;
-
-		while (!fifo_empty(&ca->free_inc)) {
-			u64 b = fifo_peek(&ca->free_inc);
-
-			discard_one_bucket(c, ca, b);
-
-			ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
-			if (ret)
-				goto stop;
-		}
-	}
-stop:
-	alloc_thread_set_state(ca, ALLOCATOR_stopped);
-	return 0;
-}
-
 /* Startup/shutdown (ro/rw): */
 
 void bch2_recalc_capacity(struct bch_fs *c)
@@ -1193,7 +697,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
 	unsigned bucket_size_max = 0;
 	unsigned long ra_pages = 0;
-	unsigned i, j;
+	unsigned i;
 
 	lockdep_assert_held(&c->state_lock);
 
@@ -1224,8 +728,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
 		 * allocations for foreground writes must wait -
 		 * not -ENOSPC calculations.
 		 */
-		for (j = 0; j < RESERVE_none; j++)
-			dev_reserve += ca->free[j].size;
+
+		dev_reserve += ca->nr_btree_reserve * 2;
+		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
 
 		dev_reserve += 1;	/* btree write point */
 		dev_reserve += 1;	/* copygc write point */
@@ -1281,8 +786,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 {
 	unsigned i;
 
-	BUG_ON(ca->alloc_thread);
-
 	/* First, remove device from allocation groups: */
 
 	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1356,61 +859,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
 			set_bit(ca->dev_idx, c->rw_devs[i].d);
 }
 
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
-	if (ca->alloc_thread)
-		closure_wait_event(&c->freelist_wait,
-				   ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
-	struct task_struct *p;
-
-	p = rcu_dereference_protected(ca->alloc_thread, 1);
-	ca->alloc_thread = NULL;
-
-	/*
-	 * We need an rcu barrier between setting ca->alloc_thread = NULL and
-	 * the thread shutting down to avoid bch2_wake_allocator() racing:
-	 *
-	 * XXX: it would be better to have the rcu barrier be asynchronous
-	 * instead of blocking us here
-	 */
-	synchronize_rcu();
-
-	if (p) {
-		kthread_stop(p);
-		put_task_struct(p);
-	}
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
-	struct task_struct *p;
-
-	/*
-	 * allocator thread already started?
-	 */
-	if (ca->alloc_thread)
-		return 0;
-
-	p = kthread_create(bch2_allocator_thread, ca,
-			   "bch-alloc/%s", ca->name);
-	if (IS_ERR(p)) {
-		bch_err(ca->fs, "error creating allocator thread: %li",
-			PTR_ERR(p));
-		return PTR_ERR(p);
-	}
-
-	get_task_struct(p);
-	rcu_assign_pointer(ca->alloc_thread, p);
-	wake_up_process(p);
-	return 0;
-}
-
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
 	spin_lock_init(&c->freelist_lock);
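Editorial note: a quick worked example of the fixed reserve that replaces the per-freelist sizes in bch2_recalc_capacity() above. The numbers are hypothetical, not from the patch:

	/*
	 * e.g. nbuckets = 65536, nr_btree_reserve = 128:
	 *
	 *   dev_reserve  = 128 * 2         btree reserve
	 *                + 65536 >> 6      copygc reserve = 1024
	 *                + 1 + 1 + ...     one bucket per write point, as in the hunk
	 *               ~= 1282 buckets, later scaled by bucket_size_max
	 */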
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 8de109674f0f..74b23f9b1bd3 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,8 +8,6 @@
 #include "debug.h"
 #include "super.h"
 
-extern const char * const bch2_allocator_states[];
-
 /* How out of date a pointer gen is allowed to be: */
 #define BUCKET_GC_GEN_MAX	96U
 
@@ -117,42 +115,11 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
 			  struct bkey_i *, unsigned);
 int bch2_fs_freespace_init(struct bch_fs *);
 
-static inline void bch2_wake_allocator(struct bch_dev *ca)
-{
-	struct task_struct *p;
-
-	rcu_read_lock();
-	p = rcu_dereference(ca->alloc_thread);
-	if (p)
-		wake_up_process(p);
-	rcu_read_unlock();
-}
-
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
-					  size_t bucket)
-{
-	if (bch2_expensive_debug_checks) {
-		size_t iter;
-		long i;
-		unsigned j;
-
-		for (j = 0; j < RESERVE_NR; j++)
-			fifo_for_each_entry(i, &ca->free[j], iter)
-				BUG_ON(i == bucket);
-		fifo_for_each_entry(i, &ca->free_inc, iter)
-			BUG_ON(i == bucket);
-	}
-}
-
 void bch2_recalc_capacity(struct bch_fs *);
 
 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index c4b4689fdd0f..01abcf43341f 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -14,13 +14,18 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "ec.h"
+#include "error.h"
 #include "io.h"
+#include "journal.h"
 #include "trace.h"
 
 #include <linux/math64.h>
@@ -50,6 +55,17 @@ const char * const bch2_alloc_reserves[] = {
  * reference _after_ doing the index update that makes its allocation reachable.
  */
 
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL)
+		ca->alloc_cursor = 0;
+	rcu_read_unlock();
+}
+
 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
 {
 	open_bucket_idx_t idx = ob - c->open_buckets;
@@ -85,7 +101,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 	percpu_down_read(&c->mark_lock);
 	spin_lock(&ob->lock);
 
-	bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
 	ob->valid = false;
 	ob->data_type = 0;
 
@@ -185,39 +200,35 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
 	}
 }
 
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
-				      enum alloc_reserve reserve,
-				      bool may_alloc_partial,
-				      struct closure *cl)
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+					      u64 bucket,
+					      enum alloc_reserve reserve,
+					      struct bch_alloc_v4 *a,
+					      u64 *skipped_open,
+					      u64 *skipped_need_journal_commit,
+					      u64 *skipped_nouse,
+					      struct closure *cl)
 {
 	struct open_bucket *ob;
-	long b = 0;
 
-	spin_lock(&c->freelist_lock);
+	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+		(*skipped_nouse)++;
+		return NULL;
+	}
 
-	if (may_alloc_partial) {
-		int i;
-
-		for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
-			ob = c->open_buckets + ca->open_buckets_partial[i];
-
-			if (reserve <= ob->alloc_reserve) {
-				array_remove_item(ca->open_buckets_partial,
-						  ca->open_buckets_partial_nr,
-						  i);
-				ob->on_partial_list = false;
-				ob->alloc_reserve = reserve;
-				spin_unlock(&c->freelist_lock);
-				return ob;
-			}
-		}
+	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+		(*skipped_open)++;
+		return NULL;
+	}
+
+	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+		(*skipped_need_journal_commit)++;
+		return NULL;
 	}
 
+	spin_lock(&c->freelist_lock);
+
 	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
 		if (cl)
 			closure_wait(&c->open_buckets_wait, cl);
@@ -226,36 +237,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 			c->blocked_allocate_open_bucket = local_clock();
 		spin_unlock(&c->freelist_lock);
-		trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
 		return ERR_PTR(-OPEN_BUCKETS_EMPTY);
 	}
 
-	if (likely(fifo_pop(&ca->free[RESERVE_none], b)))
-		goto out;
-
-	switch (reserve) {
-	case RESERVE_btree_movinggc:
-	case RESERVE_movinggc:
-		if (fifo_pop(&ca->free[RESERVE_movinggc], b))
-			goto out;
-		break;
-	default:
-		break;
+	/* Recheck under lock: */
+	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+		spin_unlock(&c->freelist_lock);
+		(*skipped_open)++;
+		return NULL;
 	}
 
-	if (cl)
-		closure_wait(&c->freelist_wait, cl);
-
-	if (!c->blocked_allocate)
-		c->blocked_allocate = local_clock();
-
-	spin_unlock(&c->freelist_lock);
-
-	trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
-	return ERR_PTR(-FREELIST_EMPTY);
-out:
-	verify_not_on_freelist(c, ca, b);
-
 	ob = bch2_open_bucket_alloc(c);
 
 	spin_lock(&ob->lock);
@@ -264,8 +255,8 @@ out:
 	ob->sectors_free = ca->mi.bucket_size;
 	ob->alloc_reserve = reserve;
 	ob->dev		= ca->dev_idx;
-	ob->gen		= *bucket_gen(ca, b);
-	ob->bucket	= b;
+	ob->gen		= a->gen;
+	ob->bucket	= bucket;
 	spin_unlock(&ob->lock);
 
 	ca->nr_open_buckets++;
@@ -286,10 +277,326 @@ out:
 	}
 
 	spin_unlock(&c->freelist_lock);
 
+	return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+					    enum alloc_reserve reserve, u64 free_entry,
+					    u64 *skipped_open,
+					    u64 *skipped_need_journal_commit,
+					    u64 *skipped_nouse,
+					    struct bkey_s_c freespace_k,
+					    struct closure *cl)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c k;
+	struct open_bucket *ob;
+	struct bch_alloc_v4 a;
+	u64 b = free_entry & ~(~0ULL << 56);
+	unsigned genbits = free_entry >> 56;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+		pr_buf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+		       "  freespace key ",
+			ca->mi.first_bucket, ca->mi.nbuckets);
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+	}
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
+	if (ret) {
+		ob = ERR_PTR(ret);
+		goto err;
+	}
+
+	bch2_alloc_to_v4(k, &a);
+
+	if (genbits != (alloc_freespace_genbits(a) >> 56)) {
+		pr_buf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+		       "  freespace key ",
+		       genbits, alloc_freespace_genbits(a) >> 56);
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		pr_buf(&buf, "\n  ");
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+
+	}
+
+	if (a.data_type != BUCKET_free) {
+		pr_buf(&buf, "non free bucket in freespace btree\n"
+		       "  freespace key ");
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		pr_buf(&buf, "\n  ");
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+	}
+
+	ob = __try_alloc_bucket(c, ca, b, reserve, &a,
+				skipped_open,
+				skipped_need_journal_commit,
+				skipped_nouse,
+				cl);
+	if (!ob)
+		iter.path->preserve = false;
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+	return ob;
+}
+
+static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
+						    enum alloc_reserve reserve)
+{
+	struct open_bucket *ob;
+	int i;
+
+	spin_lock(&c->freelist_lock);
+
+	for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+		ob = c->open_buckets + ca->open_buckets_partial[i];
+
+		if (reserve <= ob->alloc_reserve) {
+			array_remove_item(ca->open_buckets_partial,
+					  ca->open_buckets_partial_nr,
+					  i);
+			ob->on_partial_list = false;
+			ob->alloc_reserve = reserve;
+			spin_unlock(&c->freelist_lock);
+			return ob;
+		}
+	}
+
+	spin_unlock(&c->freelist_lock);
+	return NULL;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+			struct bch_dev *ca,
+			enum alloc_reserve reserve,
+			u64 *buckets_seen,
+			u64 *skipped_open,
+			u64 *skipped_need_journal_commit,
+			u64 *skipped_nouse,
+			struct closure *cl)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct open_bucket *ob = NULL;
+	u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+	u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+	int ret;
+again:
+	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
+			   BTREE_ITER_SLOTS, k, ret) {
+		struct bch_alloc_v4 a;
+
+		if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+			break;
+
+		if (ca->new_fs_bucket_idx &&
+		    is_superblock_bucket(ca, k.k->p.offset))
+			continue;
+
+		bch2_alloc_to_v4(k, &a);
+
+		if (bucket_state(a) != BUCKET_free)
+			continue;
+
+		(*buckets_seen)++;
+
+		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
+					skipped_open,
+					skipped_need_journal_commit,
+					skipped_nouse,
+					cl);
+		if (ob)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	ca->alloc_cursor = alloc_cursor;
+
+	if (!ob && alloc_cursor > alloc_start) {
+		alloc_cursor = alloc_start;
+		goto again;
+	}
+
+	return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+						      struct bch_dev *ca,
+						      enum alloc_reserve reserve,
+						      u64 *buckets_seen,
+						      u64 *skipped_open,
+						      u64 *skipped_need_journal_commit,
+						      u64 *skipped_nouse,
+						      struct closure *cl)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct open_bucket *ob = NULL;
+	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+	u64 alloc_cursor = alloc_start;
+	int ret;
+
+	BUG_ON(ca->new_fs_bucket_idx);
+again:
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
+		if (k.k->p.inode != ca->dev_idx)
+			break;
+
+		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
+		     alloc_cursor < k.k->p.offset;
+		     alloc_cursor++) {
+			if (btree_trans_too_many_iters(trans)) {
+				ob = ERR_PTR(-EINTR);
+				break;
+			}
+
+			(*buckets_seen)++;
+
+			ob = try_alloc_bucket(trans, ca, reserve,
+					      alloc_cursor,
+					      skipped_open,
+					      skipped_need_journal_commit,
+					      skipped_nouse,
+					      k, cl);
+			if (ob) {
+				iter.path->preserve = false;
+				break;
+			}
+		}
+		if (ob)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	ca->alloc_cursor = alloc_cursor;
+
+	if (!ob && ret)
+		ob = ERR_PTR(ret);
+
+	if (!ob && alloc_start > ca->mi.first_bucket) {
+		alloc_cursor = alloc_start = ca->mi.first_bucket;
+		goto again;
+	}
+
+	return ob;
+}
+
+/**
+ * bch_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns index of bucket on success, 0 on failure
+ * */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+				      struct bch_dev *ca,
+				      enum alloc_reserve reserve,
+				      bool may_alloc_partial,
+				      struct closure *cl)
+{
+	struct bch_fs *c = trans->c;
+	struct open_bucket *ob = NULL;
+	u64 avail = dev_buckets_available(ca, reserve);
+	u64 buckets_seen = 0;
+	u64 skipped_open = 0;
+	u64 skipped_need_journal_commit = 0;
+	u64 skipped_nouse = 0;
+
+	if (may_alloc_partial) {
+		ob = try_alloc_partial_bucket(c, ca, reserve);
+		if (ob)
+			return ob;
+	}
+again:
+	if (!avail) {
+		if (cl) {
+			closure_wait(&c->freelist_wait, cl);
+			/* recheck after putting ourself on waitlist */
+			avail = dev_buckets_available(ca, reserve);
+			if (avail) {
+				closure_wake_up(&c->freelist_wait);
+				goto again;
+			}
+		}
+
+		if (!c->blocked_allocate)
+			c->blocked_allocate = local_clock();
+
+		ob = ERR_PTR(-FREELIST_EMPTY);
+		goto err;
+	}
+
+	ob = likely(ca->mi.freespace_initialized)
+		? bch2_bucket_alloc_freelist(trans, ca, reserve,
+					&buckets_seen,
+					&skipped_open,
+					&skipped_need_journal_commit,
+					&skipped_nouse,
+					cl)
+		: bch2_bucket_alloc_early(trans, ca, reserve,
+					&buckets_seen,
+					&skipped_open,
+					&skipped_need_journal_commit,
+					&skipped_nouse,
+					cl);
+
+	if (skipped_need_journal_commit * 2 > avail)
+		bch2_journal_flush_async(&c->journal, NULL);
+err:
+	if (!ob)
+		ob = ERR_PTR(-FREELIST_EMPTY);
+
+	if (!IS_ERR(ob)) {
+		trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], avail,
+				   buckets_seen,
+				   skipped_open,
+				   skipped_need_journal_commit,
+				   skipped_nouse,
+				   cl == NULL, PTR_ERR_OR_ZERO(ob));
+	} else {
+		trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail,
+					buckets_seen,
+					skipped_open,
+					skipped_need_journal_commit,
+					skipped_nouse,
+					cl == NULL, PTR_ERR_OR_ZERO(ob));
+		atomic_long_inc(&c->bucket_alloc_fail);
+	}
+
+	return ob;
+}
 
-	bch2_wake_allocator(ca);
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+				      enum alloc_reserve reserve,
+				      bool may_alloc_partial,
+				      struct closure *cl)
+{
+	struct open_bucket *ob;
 
-	trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]);
+	bch2_trans_do(c, NULL, NULL, 0,
+		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
+								   may_alloc_partial, cl)));
 	return ob;
 }
 
@@ -320,7 +627,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
 			       struct dev_stripe_state *stripe)
 {
 	u64 *v = stripe->next_alloc + ca->dev_idx;
-	u64 free_space = dev_buckets_available(ca);
+	u64 free_space = dev_buckets_available(ca, RESERVE_none);
 	u64 free_space_inv = free_space
 		? div64_u64(1ULL << 48, free_space)
 		: 1ULL << 48;
@@ -358,7 +665,7 @@ static void add_new_bucket(struct bch_fs *c,
 	ob_push(c, ptrs, ob);
 }
 
-int bch2_bucket_alloc_set(struct bch_fs *c,
+static int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 		      struct open_buckets *ptrs,
 		      struct dev_stripe_state *stripe,
 		      struct bch_devs_mask *devs_may_alloc,
@@ -369,10 +676,12 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
 		      unsigned flags,
 		      struct closure *cl)
 {
+	struct bch_fs *c = trans->c;
 	struct dev_alloc_list devs_sorted =
 		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+	unsigned dev;
 	struct bch_dev *ca;
-	int ret = -INSUFFICIENT_DEVICES;
+	int ret = 0;
 	unsigned i;
 
 	BUG_ON(*nr_effective >= nr_replicas);
@@ -380,35 +689,68 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
 	for (i = 0; i < devs_sorted.nr; i++) {
 		struct open_bucket *ob;
 
-		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+		dev = devs_sorted.devs[i];
+
+		rcu_read_lock();
+		ca = rcu_dereference(c->devs[dev]);
+		if (ca)
+			percpu_ref_get(&ca->ref);
+		rcu_read_unlock();
+
 		if (!ca)
 			continue;
 
-		if (!ca->mi.durability && *have_cache)
+		if (!ca->mi.durability && *have_cache) {
+			percpu_ref_put(&ca->ref);
 			continue;
+		}
 
-		ob = bch2_bucket_alloc(c, ca, reserve,
+		ob = bch2_bucket_alloc_trans(trans, ca, reserve,
 				flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
-		if (IS_ERR(ob)) {
-			ret = PTR_ERR(ob);
-
-			if (cl)
-				return ret;
+		if (!IS_ERR(ob))
+			bch2_dev_stripe_increment(ca, stripe);
+		percpu_ref_put(&ca->ref);
+
+		ret = PTR_ERR_OR_ZERO(ob);
+		if (ret) {
+			if (ret == -EINTR || cl)
+				break;
 			continue;
 		}
 
 		add_new_bucket(c, ptrs, devs_may_alloc,
 			       nr_effective, have_cache, flags, ob);
 
-		bch2_dev_stripe_increment(ca, stripe);
-
 		if (*nr_effective >= nr_replicas)
-			return 0;
+			break;
 	}
 
+	if (*nr_effective >= nr_replicas)
+		ret = 0;
+	else if (!ret)
+		ret = -INSUFFICIENT_DEVICES;
+
 	return ret;
 }
 
+int bch2_bucket_alloc_set(struct bch_fs *c,
+		      struct open_buckets *ptrs,
+		      struct dev_stripe_state *stripe,
+		      struct bch_devs_mask *devs_may_alloc,
+		      unsigned nr_replicas,
+		      unsigned *nr_effective,
+		      bool *have_cache,
+		      enum alloc_reserve reserve,
+		      unsigned flags,
+		      struct closure *cl)
+{
+	return bch2_trans_do(c, NULL, NULL, 0,
+		      bch2_bucket_alloc_set_trans(&trans, ptrs, stripe,
+						  devs_may_alloc, nr_replicas,
						  nr_effective, have_cache, reserve,
+						  flags, cl));
+}
+
 /* Allocate from stripes: */
 
 /*
@@ -513,7 +855,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
 	wp->ptrs = ptrs_skip;
 }
 
-static int open_bucket_add_buckets(struct bch_fs *c,
+static int open_bucket_add_buckets(struct btree_trans *trans,
 			struct open_buckets *ptrs,
 			struct write_point *wp,
 			struct bch_devs_list *devs_have,
@@ -526,6 +868,7 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 			unsigned flags,
 			struct closure *_cl)
 {
+	struct bch_fs *c = trans->c;
 	struct bch_devs_mask devs;
 	struct open_bucket *ob;
 	struct closure *cl = NULL;
@@ -557,7 +900,8 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 				target, erasure_code,
 				nr_replicas, nr_effective,
 				have_cache, flags, _cl);
-		if (ret == -FREELIST_EMPTY ||
+		if (ret == -EINTR ||
+		    ret == -FREELIST_EMPTY ||
 		    ret == -OPEN_BUCKETS_EMPTY)
 			return ret;
 		if (*nr_effective >= nr_replicas)
@@ -571,25 +915,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 	if (*nr_effective >= nr_replicas)
 		return 0;
 
-	percpu_down_read(&c->mark_lock);
-	rcu_read_lock();
-
 retry_blocking:
 	/*
 	 * Try nonblocking first, so that if one device is full we'll try from
 	 * other devices:
 	 */
-	ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
+	ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
 				nr_replicas, nr_effective, have_cache,
 				reserve, flags, cl);
-	if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
+	if (ret &&
+	    ret != -EINTR &&
+	    ret != -INSUFFICIENT_DEVICES &&
+	    !cl && _cl) {
 		cl = _cl;
 		goto retry_blocking;
 	}
 
-	rcu_read_unlock();
-	percpu_up_read(&c->mark_lock);
-
 	return ret;
 }
 
@@ -703,15 +1044,25 @@ static bool try_decrease_writepoints(struct bch_fs *c,
 	return true;
 }
 
-static struct write_point *writepoint_find(struct bch_fs *c,
+static void bch2_trans_mutex_lock(struct btree_trans *trans,
+				  struct mutex *lock)
+{
+	if (!mutex_trylock(lock)) {
+		bch2_trans_unlock(trans);
+		mutex_lock(lock);
+	}
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
 					   unsigned long write_point)
 {
+	struct bch_fs *c = trans->c;
 	struct write_point *wp, *oldest;
 	struct hlist_head *head;
 
 	if (!(write_point & 1UL)) {
 		wp = (struct write_point *) write_point;
-		mutex_lock(&wp->lock);
+		bch2_trans_mutex_lock(trans, &wp->lock);
 		return wp;
 	}
 
@@ -720,7 +1071,7 @@ restart_find:
 	wp = __writepoint_find(head, write_point);
 	if (wp) {
 lock_wp:
-		mutex_lock(&wp->lock);
+		bch2_trans_mutex_lock(trans, &wp->lock);
 		if (wp->write_point == write_point)
 			goto out;
 		mutex_unlock(&wp->lock);
@@ -733,8 +1084,8 @@ restart_find_oldest:
 		if (!oldest || time_before64(wp->last_used, oldest->last_used))
 			oldest = wp;
 
-	mutex_lock(&oldest->lock);
-	mutex_lock(&c->write_points_hash_lock);
+	bch2_trans_mutex_lock(trans, &oldest->lock);
+	bch2_trans_mutex_lock(trans, &c->write_points_hash_lock);
 	if (oldest >= c->write_points + c->write_points_nr ||
 	    try_increase_writepoints(c)) {
 		mutex_unlock(&c->write_points_hash_lock);
@@ -762,7 +1113,7 @@ out:
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
-int bch2_alloc_sectors_start(struct bch_fs *c,
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 			     unsigned target,
 			     unsigned erasure_code,
 			     struct write_point_specifier write_point,
@@ -774,6 +1125,7 @@ int bch2_alloc_sectors_start(struct bch_fs *c,
 			     struct closure *cl,
 			     struct write_point **wp_ret)
 {
+	struct bch_fs *c = trans->c;
 	struct write_point *wp;
 	struct open_bucket *ob;
 	struct open_buckets ptrs;
@@ -793,7 +1145,7 @@ retry:
 	write_points_nr = c->write_points_nr;
 	have_cache	= false;
 
-	*wp_ret = wp = writepoint_find(c, write_point.v);
+	*wp_ret = wp = writepoint_find(trans, write_point.v);
 
 	if (wp->data_type == BCH_DATA_user)
 		ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@@ -803,21 +1155,21 @@ retry:
 		have_cache = true;
 
 	if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve,
 					      ob_flags, cl);
 	} else {
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve,
 					      ob_flags, NULL);
-		if (!ret)
+		if (!ret || ret == -EINTR)
 			goto alloc_done;
 
-		ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      0, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, reserve,
@@ -871,10 +1223,33 @@ err:
 	case -INSUFFICIENT_DEVICES:
 		return -EROFS;
 	default:
-		BUG();
+		return ret;
 	}
 }
 
+int bch2_alloc_sectors_start(struct bch_fs *c,
+			     unsigned target,
+			     unsigned erasure_code,
+			     struct write_point_specifier write_point,
+			     struct bch_devs_list *devs_have,
+			     unsigned nr_replicas,
+			     unsigned nr_replicas_required,
+			     enum alloc_reserve reserve,
+			     unsigned flags,
+			     struct closure *cl,
+			     struct write_point **wp_ret)
+{
+	return bch2_trans_do(c, NULL, NULL, 0,
+			     bch2_alloc_sectors_start_trans(&trans, target,
+							erasure_code,
+							write_point,
+							devs_have,
+							nr_replicas,
+							nr_replicas_required,
+							reserve,
+							flags, cl, wp_ret));
+}
+
 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
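Editorial note: try_alloc_bucket() above unpacks a freespace-btree key offset with `free_entry & ~(~0ULL << 56)` (bucket number) and `free_entry >> 56` (generation bits). A minimal sketch of the corresponding packing, assuming only the 56/8 bit split visible in the hunk; the helper name is hypothetical, not part of the patch:

	static inline u64 freespace_entry(u64 bucket, unsigned genbits)
	{
		/* low 56 bits: bucket number; high 8 bits: generation bits */
		return (bucket & ~(~0ULL << 56)) | ((u64) genbits << 56);
	}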
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index efb6d155bd00..12583a7e7aa3 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -14,6 +14,8 @@ struct bch_devs_List;
 
 extern const char * const bch2_alloc_reserves[];
 
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
 struct dev_alloc_list {
 	unsigned	nr;
 	u8		devs[BCH_SB_MEMBERS_MAX];
@@ -136,6 +138,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
 		      unsigned, unsigned *, bool *, enum alloc_reserve,
 		      unsigned, struct closure *);
 
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+				   unsigned, unsigned,
+				   struct write_point_specifier,
+				   struct bch_devs_list *,
+				   unsigned, unsigned,
+				   enum alloc_reserve,
+				   unsigned,
+				   struct closure *,
+				   struct write_point **);
 int bch2_alloc_sectors_start(struct bch_fs *,
 			     unsigned, unsigned,
 			     struct write_point_specifier,
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 9e00afb17559..b3bef7074511 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -10,18 +10,6 @@
 
 struct ec_bucket_buf;
 
-#define ALLOC_THREAD_STATES()		\
-	x(stopped)			\
-	x(running)			\
-	x(blocked)			\
-	x(blocked_full)
-
-enum allocator_states {
-#define x(n)	ALLOCATOR_##n,
-	ALLOC_THREAD_STATES()
-#undef x
-};
-
 #define BCH_ALLOC_RESERVES()		\
 	x(btree_movinggc)		\
 	x(btree)			\
@@ -32,11 +20,8 @@ enum alloc_reserve {
 #define x(name)	RESERVE_##name,
 	BCH_ALLOC_RESERVES()
 #undef x
-	RESERVE_NR
 };
 
-typedef FIFO(long)	alloc_fifo;
-
 #define OPEN_BUCKETS_COUNT	1024
 
 #define WRITE_POINT_HASH_NR	32
@@ -127,12 +112,4 @@ struct write_point_specifier {
 	unsigned long		v;
 };
 
-struct alloc_heap_entry {
-	size_t			bucket;
-	size_t			nr;
-	unsigned long		key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
 #endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 6c11ebee73a9..879b2adc8b42 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -462,34 +462,18 @@ struct bch_dev {
 
 	/* Allocator: */
 	u64			new_fs_bucket_idx;
-	struct task_struct __rcu *alloc_thread;
+	u64			alloc_cursor;
 
-	/*
-	 * free: Buckets that are ready to be used
-	 *
-	 * free_inc: Incoming buckets - these are buckets that currently have
-	 * cached data in them, and we can't reuse them until after we write
-	 * their new gen to disk. After prio_write() finishes writing the new
-	 * gens/prios, they'll be moved to the free list (and possibly discarded
-	 * in the process)
-	 */
-	alloc_fifo		free[RESERVE_NR];
-	alloc_fifo		free_inc;
 	unsigned		nr_open_buckets;
+	unsigned		nr_btree_reserve;
 
 	open_bucket_idx_t	open_buckets_partial[OPEN_BUCKETS_COUNT];
 	open_bucket_idx_t	open_buckets_partial_nr;
 
-	size_t			fifo_last_bucket;
-
 	size_t			inc_gen_needs_gc;
 	size_t			inc_gen_really_needs_gc;
 	size_t			buckets_waiting_on_journal;
 
-	enum allocator_states	allocator_state;
-
-	alloc_heap		alloc_heap;
-
 	atomic64_t		rebalance_work;
 
 	struct journal_device	journal;
@@ -511,8 +495,6 @@ struct bch_dev {
 enum {
 	/* startup: */
 	BCH_FS_ALLOC_CLEAN,
-	BCH_FS_ALLOCATOR_RUNNING,
-	BCH_FS_ALLOCATOR_STOPPING,
 	BCH_FS_INITIAL_GC_DONE,
 	BCH_FS_INITIAL_GC_UNFIXED,
 	BCH_FS_TOPOLOGY_REPAIR_DONE,
@@ -914,6 +896,7 @@ mempool_t		bio_bounce_pages;
 	atomic_long_t		read_realloc_races;
 	atomic_long_t		extent_migrate_done;
 	atomic_long_t		extent_migrate_raced;
+	atomic_long_t		bucket_alloc_fail;
 
 	unsigned		btree_gc_periodic:1;
 	unsigned		copy_gc_enabled:1;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index c3d6c62ef062..7078b277e23b 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1684,9 +1684,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
  */
 int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
 {
-	struct bch_dev *ca;
 	u64 start_time = local_clock();
-	unsigned i, iter = 0;
+	unsigned iter = 0;
 	int ret;
 
 	lockdep_assert_held(&c->state_lock);
@@ -1788,13 +1787,6 @@ out:
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 
 	/*
-	 * Wake up allocator in case it was waiting for buckets
-	 * because of not being able to inc gens
-	 */
-	for_each_member_device(ca, c, i)
-		bch2_wake_allocator(ca);
-
-	/*
 	 * At startup, allocations can happen directly instead of via the
 	 * allocator thread - issue wakeup in case they blocked on gc_lock:
 	 */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index d1e3e2c76e30..2e958f88777b 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -178,12 +178,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
 	six_unlock_intent(&b->c.lock);
 }
 
-static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
 					     struct disk_reservation *res,
 					     struct closure *cl,
 					     bool interior_node,
 					     unsigned flags)
 {
+	struct bch_fs *c = trans->c;
 	struct write_point *wp;
 	struct btree *b;
 	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
@@ -214,7 +215,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 	mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-	ret = bch2_alloc_sectors_start(c,
+	ret = bch2_alloc_sectors_start_trans(trans,
 				      c->opts.metadata_target ?:
 				      c->opts.foreground_target,
 				      0,
@@ -414,7 +415,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
 	mutex_unlock(&c->btree_reserve_cache_lock);
 }
 
-static int bch2_btree_reserve_get(struct btree_update *as,
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+				  struct btree_update *as,
 				  unsigned nr_nodes[2],
 				  unsigned flags,
 				  struct closure *cl)
@@ -441,7 +443,7 @@ static int bch2_btree_reserve_get(struct btree_update *as,
 		struct prealloc_nodes *p = as->prealloc_nodes + interior;
 
 		while (p->nr < nr_nodes[interior]) {
-			b = __bch2_btree_node_alloc(c, &as->disk_res,
+			b = __bch2_btree_node_alloc(trans, &as->disk_res,
 					flags & BTREE_INSERT_NOWAIT ? NULL : cl,
 					interior, flags);
 			if (IS_ERR(b)) {
@@ -1066,8 +1068,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	if (ret)
 		goto err;
 
-	ret = bch2_btree_reserve_get(as, nr_nodes, flags, NULL);
-	if (ret) {
+	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+	if (ret == -EAGAIN ||
+	    ret == -ENOMEM) {
 		struct closure cl;
 
 		closure_init_stack(&cl);
@@ -1075,7 +1078,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 		bch2_trans_unlock(trans);
 
 		do {
-			ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
+			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
 			closure_sync(&cl);
 		} while (ret == -EAGAIN);
 	}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 5e247235ab69..2c6fdf385ba3 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -296,11 +296,6 @@ static inline int bucket_sectors_fragmented(struct bch_dev *ca,
 		: 0;
 }
 
-static inline int is_stripe_data_bucket(struct bucket_mark m)
-{
-	return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
 static inline enum bch_data_type bucket_type(struct bucket_mark m)
 {
 	return m.cached_sectors && !m.dirty_sectors
@@ -350,9 +345,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 	u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
 
 	preempt_enable();
-
-	if (!is_available_bucket(old) && is_available_bucket(new))
-		bch2_wake_allocator(ca);
 }
 
 static inline int __update_replicas(struct bch_fs *c,
@@ -488,19 +480,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
 	update_replicas_list(trans, &r.e, sectors);
 }
 
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-			    size_t b, bool owned_by_allocator)
-{
-	struct bucket *g = bucket(ca, b);
-	struct bucket_mark old, new;
-
-	old = bucket_cmpxchg(g, new, ({
-		new.owned_by_allocator	= owned_by_allocator;
-	}));
-
-	BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
 int bch2_mark_alloc(struct btree_trans *trans,
 		    struct bkey_s_c old, struct bkey_s_c new,
 		    unsigned flags)
@@ -560,6 +539,10 @@ int bch2_mark_alloc(struct btree_trans *trans,
 		}
 	}
 
+	if (!new_a.data_type &&
+	    (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
+		closure_wake_up(&c->freelist_wait);
+
 	if (bucket_state(new_a) == BUCKET_need_gc_gens) {
 		atomic_inc(&c->kick_gc);
 		wake_up_process(c->gc_thread);
@@ -583,7 +566,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
 		g->io_time[READ]	= new_a.io_time[READ];
 		g->io_time[WRITE]	= new_a.io_time[WRITE];
-		g->oldest_gen		= new_a.oldest_gen;
 		g->gen_valid		= 1;
 		g->stripe		= new_a.stripe;
 		g->stripe_redundancy	= new_a.stripe_redundancy;
@@ -1861,8 +1843,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
 	a->v.data_type		= type;
 	a->v.dirty_sectors	= sectors;
 
-	ret = bch2_trans_update(trans, &iter, &a->k_i,
-				BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
+	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
 	if (ret)
 		goto out;
 out:
@@ -2048,24 +2029,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	struct bucket_array *buckets = NULL, *old_buckets = NULL;
 	struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
 	unsigned long *buckets_nouse = NULL;
-	alloc_fifo	free[RESERVE_NR];
-	alloc_fifo	free_inc;
-	alloc_heap	alloc_heap;
-
-	size_t btree_reserve	= DIV_ROUND_UP(BTREE_NODE_RESERVE,
-			     ca->mi.bucket_size / btree_sectors(c));
-	/* XXX: these should be tunable */
-	size_t reserve_none	= max_t(size_t, 1, nbuckets >> 9);
-	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 6);
-	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
-			      btree_reserve * 2);
 	bool resize = ca->buckets[0] != NULL;
 	int ret = -ENOMEM;
-	unsigned i;
-
-	memset(&free,		0, sizeof(free));
-	memset(&free_inc,	0, sizeof(free_inc));
-	memset(&alloc_heap,	0, sizeof(alloc_heap));
 
 	if (!(buckets		= kvpmalloc(sizeof(struct bucket_array) +
 					    nbuckets * sizeof(struct bucket),
@@ -2075,12 +2040,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	    (c->opts.buckets_nouse &&
 	     !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
 					    sizeof(unsigned long),
-					    GFP_KERNEL|__GFP_ZERO))) ||
-	    !init_fifo(&free[RESERVE_movinggc],
-		       copygc_reserve, GFP_KERNEL) ||
-	    !init_fifo(&free[RESERVE_none], reserve_none, GFP_KERNEL) ||
-	    !init_fifo(&free_inc,	free_inc_nr, GFP_KERNEL) ||
-	    !init_heap(&alloc_heap,	ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+					    GFP_KERNEL|__GFP_ZERO))))
 		goto err;
 
 	buckets->first_bucket	= ca->mi.first_bucket;
@@ -2126,18 +2086,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 		up_write(&c->gc_lock);
 	}
 
-	spin_lock(&c->freelist_lock);
-	for (i = 0; i < RESERVE_NR; i++) {
-		fifo_move(&free[i], &ca->free[i]);
-		swap(ca->free[i], free[i]);
-	}
-	fifo_move(&free_inc, &ca->free_inc);
-	swap(ca->free_inc, free_inc);
-	spin_unlock(&c->freelist_lock);
-
-	/* with gc lock held, alloc_heap can't be in use: */
-	swap(ca->alloc_heap, alloc_heap);
-
 	nbuckets = ca->mi.nbuckets;
 
 	if (resize)
@@ -2145,10 +2093,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	ret = 0;
 err:
-	free_heap(&alloc_heap);
-	free_fifo(&free_inc);
-	for (i = 0; i < RESERVE_NR; i++)
-		free_fifo(&free[i]);
 	kvpfree(buckets_nouse,
 		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
 	if (bucket_gens)
@@ -2163,10 +2107,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
 {
 	unsigned i;
 
-	free_heap(&ca->alloc_heap);
-	free_fifo(&ca->free_inc);
-	for (i = 0; i < RESERVE_NR; i++)
-		free_fifo(&ca->free[i]);
 	kvpfree(ca->buckets_nouse,
 		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
 	kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 757919d5e20f..bcb40f15f82e 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -58,11 +58,6 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
 	return __bucket(ca, b, true);
 }
 
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
-	return __bucket(ca, b, false);
-}
-
 static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
 {
 	return rcu_dereference_check(ca->bucket_gens,
@@ -151,50 +146,50 @@ static inline bool is_available_bucket(struct bucket_mark mark)
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
 
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
-					  struct bch_dev_usage stats)
+					  struct bch_dev_usage stats,
+					  enum alloc_reserve reserve)
 {
-	u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+	s64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+	s64 reserved = 0;
+
+	switch (reserve) {
+	case RESERVE_none:
+		reserved += ca->mi.nbuckets >> 6;
+		fallthrough;
+	case RESERVE_movinggc:
+		reserved += ca->nr_btree_reserve;
+		fallthrough;
+	case RESERVE_btree:
+		reserved += ca->nr_btree_reserve;
+		fallthrough;
+	case RESERVE_btree_movinggc:
+		break;
+	default:
+		BUG();
+	}
 
 	if (WARN_ONCE(stats.buckets_unavailable > total,
 		      "buckets_unavailable overflow (%llu > %llu)\n",
 		      stats.buckets_unavailable, total))
 		return 0;
 
-	return total - stats.buckets_unavailable;
+	return max_t(s64, 0,
+		     total -
+		     stats.buckets_unavailable -
+		     ca->nr_open_buckets -
+		     reserved);
 }
 
-static inline u64 dev_buckets_available(struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+					enum alloc_reserve reserve)
 {
-	return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
-}
-
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
-					    struct bch_dev_usage stats)
-{
-	struct bch_fs *c = ca->fs;
-	s64 available = __dev_buckets_available(ca, stats);
-	unsigned i;
-
-	spin_lock(&c->freelist_lock);
-	for (i = 0; i < RESERVE_NR; i++)
-		available -= fifo_used(&ca->free[i]);
-	available -= fifo_used(&ca->free_inc);
-	available -= ca->nr_open_buckets;
-	spin_unlock(&c->freelist_lock);
-
-	return max(available, 0LL);
-}
-
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
-{
-	return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+	return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
 }
 
 /* Filesystem usage: */
 
 static inline unsigned fs_usage_u64s(struct bch_fs *c)
 {
-
 	return sizeof(struct bch_fs_usage) / sizeof(u64) +
 		READ_ONCE(c->replicas.nr);
 }
@@ -222,7 +217,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
 
 void bch2_fs_usage_initialize(struct bch_fs *);
 
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
 void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
 			       size_t, enum bch_data_type, unsigned,
 			       struct gc_pos, unsigned);
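Editorial note: the fallthrough ladder in __dev_buckets_available() above accumulates reserves, so each reserve class sees progressively fewer free buckets. Worked through with hypothetical numbers (nbuckets = 65536, nr_btree_reserve = 128), not taken from the patch:

	/*
	 * RESERVE_btree_movinggc: reserved = 0
	 * RESERVE_btree:          reserved = 128               (nr_btree_reserve)
	 * RESERVE_movinggc:       reserved = 128 + 128  = 256
	 * RESERVE_none:           reserved = 256 + (65536 >> 6) = 1280
	 */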
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 4f7018398385..6ddbea4da7d1 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -14,7 +14,6 @@ struct bucket_mark {
 	struct {
 	u8		gen;
 	u8		data_type:3,
-			owned_by_allocator:1,
 			stripe:1;
 	u16		dirty_sectors;
 	u16		cached_sectors;
@@ -29,7 +28,6 @@ struct bucket {
 	};
 
 	u64			io_time[2];
-	u8			oldest_gen;
 	unsigned		gen_valid:1;
 	u8			stripe_redundancy;
 	u32			stripe;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 9dc2f9f822c8..5030a5b831af 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1295,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
 	BUG_ON(nr_have_data	> h->s->nr_data);
 	BUG_ON(nr_have_parity	> h->s->nr_parity);
 
-	percpu_down_read(&c->mark_lock);
-	rcu_read_lock();
-
 	buckets.nr = 0;
 	if (nr_have_parity < h->s->nr_parity) {
 		ret = bch2_bucket_alloc_set(c, &buckets,
@@ -1324,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
 		}
 
 		if (ret)
-			goto err;
+			return ret;
 	}
 
 	buckets.nr = 0;
@@ -1352,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
 		}
 
 		if (ret)
-			goto err;
+			return ret;
 	}
 
-err:
-	rcu_read_unlock();
-	percpu_up_read(&c->mark_lock);
-	return ret;
+
+	return 0;
 }
 
 /* XXX: doesn't obey target: */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index cb15d1c8a135..f87f76553bf4 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -812,10 +812,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 				break;
 			}
 		} else {
-			rcu_read_lock();
 			ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
 						       false, cl);
-			rcu_read_unlock();
 			if (IS_ERR(ob[nr_got])) {
 				ret = cl ? -EAGAIN : -ENOSPC;
 				break;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index b2c3ee336c1f..3e418342ee67 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1398,6 +1398,10 @@ static void journal_write_done(struct closure *cl)
 		if (!JSET_NO_FLUSH(w->data)) {
 			j->flushed_seq_ondisk = seq;
 			j->last_seq_ondisk = w->last_seq;
+
+			closure_wake_up(&c->freelist_wait);
+
+			bch2_reset_alloc_cursors(c);
 		}
 	} else if (!j->err_seq || seq < j->err_seq)
 		j->err_seq	= seq;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index aecec55eb421..b9e1bd7b1d05 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -104,18 +104,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
 	return DATA_SKIP;
 }
 
-static bool have_copygc_reserve(struct bch_dev *ca)
-{
-	bool ret;
-
-	spin_lock(&ca->fs->freelist_lock);
-	ret = fifo_full(&ca->free[RESERVE_movinggc]) ||
-		ca->allocator_state != ALLOCATOR_running;
-	spin_unlock(&ca->fs->freelist_lock);
-
-	return ret;
-}
-
 static inline int fragmentation_cmp(copygc_heap *heap,
 				   struct copygc_heap_entry l,
 				   struct copygc_heap_entry r)
@@ -247,11 +235,10 @@ static int bch2_copygc(struct bch_fs *c)
 	}
 
 	for_each_rw_member(ca, c, dev_idx) {
-		closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
+		s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc),
+				ca->mi.nbuckets >> 6);
 
-		spin_lock(&ca->fs->freelist_lock);
-		sectors_reserved += fifo_used(&ca->free[RESERVE_movinggc]) * ca->mi.bucket_size;
-		spin_unlock(&ca->fs->freelist_lock);
+		sectors_reserved += avail * ca->mi.bucket_size;
 	}
 
 	ret = walk_buckets_to_copygc(c);
@@ -352,8 +339,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
 	for_each_rw_member(ca, c, dev_idx) {
 		struct bch_dev_usage usage = bch2_dev_usage_read(ca);
 
-		fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
-				       ca->mi.bucket_size) >> 1);
+		fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+				       ca->mi.bucket_size) >> 1);
 		fragmented = usage.d[BCH_DATA_user].fragmented;
 
 		wait = min(wait, max(0LL, fragmented_allowed - fragmented));
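Editorial note: with the freelists gone, bch2_copygc() above estimates its reserve from live accounting instead of a FIFO. A sketch of the arithmetic with hypothetical numbers (nbuckets = 65536, bucket_size = 512 sectors, 2000 buckets available at RESERVE_movinggc), not taken from the patch:

	/*
	 *   avail             = min(2000, 65536 >> 6) = min(2000, 1024) = 1024
	 *   sectors_reserved += 1024 * 512            = 524288
	 */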
bch2_journal_flush_all_pins(&c->journal); - /* - * If the allocator threads didn't all start up, the btree updates to - * write out alloc info aren't going to work: - */ - if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) - goto nowrote_alloc; - bch_verbose(c, "flushing journal and stopping allocators"); bch2_journal_flush_all_pins(&c->journal); - set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); do { clean_passes++; @@ -241,17 +233,11 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete"); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -nowrote_alloc: + closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); flush_work(&c->btree_interior_update_work); - for_each_member_device(ca, c, i) - bch2_dev_allocator_stop(ca); - - clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); - bch2_fs_journal_stop(&c->journal); /* @@ -287,10 +273,6 @@ void bch2_fs_read_only(struct bch_fs *c) /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch2_dev_allocator_stop()). */ percpu_ref_kill(&c->writes); @@ -419,20 +401,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - bch_err(c, "error starting allocator threads"); - percpu_ref_put(&ca->io_ref); - goto err; - } - } - - set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - - for_each_rw_member(ca, c, i) - bch2_wake_allocator(ca); - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -946,20 +914,6 @@ int bch2_fs_start(struct bch_fs *c) set_bit(BCH_FS_STARTED, &c->flags); - /* - * Allocator threads don't start filling copygc reserve until after we - * set BCH_FS_STARTED - wake them now: - * - * XXX ugly hack: - * Need to set ca->allocator_state here instead of relying on the - * allocator threads to do it to avoid racing with the copygc threads - * checking it and thinking they have no alloc reserve: - */ - for_each_online_member(ca, c, i) { - ca->allocator_state = ALLOCATOR_running; - bch2_wake_allocator(ca); - } - if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { @@ -1051,8 +1005,6 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - bch2_dev_allocator_stop(ca); - cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1167,6 +1119,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / btree_sectors(c)); + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, @@ -1216,12 +1171,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; - if (ca->mi.state == BCH_MEMBER_STATE_rw && - bch2_dev_allocator_start(ca)) { - bch2_dev_free(ca); - goto err; - } - bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1405,14 +1354,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) /* * The allocator thread itself allocates btree nodes, so stop it first: */ - bch2_dev_allocator_stop(ca); 
bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); bch2_copygc_start(c); } -static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1420,8 +1368,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - - return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1448,7 +1394,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); if (new_state == BCH_MEMBER_STATE_rw) - ret = __bch2_dev_read_write(c, ca); + __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1710,13 +1656,8 @@ have_slot: ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) { - bch_err(c, "device add error: error going RW on new device: %i", ret); - goto err_late; - } - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return 0; @@ -1776,11 +1717,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) - goto err; - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ec672134cb18..e995b84b6172 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -170,7 +170,6 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); @@ -186,11 +185,11 @@ read_attribute(internal_uuid); read_attribute(has_data); read_attribute(alloc_debug); -write_attribute(wake_allocator); read_attribute(read_realloc_races); read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); +read_attribute(bucket_alloc_fail); rw_attribute(discard); rw_attribute(label); @@ -377,6 +376,8 @@ SHOW(bch2_fs) atomic_long_read(&c->extent_migrate_done)); sysfs_print(extent_migrate_raced, atomic_long_read(&c->extent_migrate_raced)); + sysfs_print(bucket_alloc_fail, + atomic_long_read(&c->bucket_alloc_fail)); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); @@ -577,6 +578,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_read_realloc_races, &sysfs_extent_migrate_done, &sysfs_extent_migrate_raced, + &sysfs_bucket_alloc_fail, &sysfs_gc_gens_pos, @@ -705,24 +707,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -{ - enum alloc_reserve i; - - spin_lock(&ca->fs->freelist_lock); - - pr_buf(out, "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - for (i = 0; i < RESERVE_NR; i++) - pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - ca->free[i].size); - - spin_unlock(&ca->fs->freelist_lock); -} - static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; @@ -748,9 +732,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "ec\t%16llu\n" "available%15llu\n" "\n" - "free_inc\t\t%zu/%zu\n" - "free[RESERVE_MOVINGGC]\t%zu/%zu\n" - "free[RESERVE_NONE]\t%zu/%zu\n" "freelist_wait\t\t%s\n" "open buckets 
allocated\t%u\n" "open buckets this dev\t%u\n" @@ -758,13 +739,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" - "btree reserve cache\t%u\n" - "thread state:\t\t%s\n", + "btree reserve cache\t%u\n", stats.buckets_ec, - __dev_buckets_available(ca, stats), - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_movinggc]), ca->free[RESERVE_movinggc].size, - fifo_used(&ca->free[RESERVE_none]), ca->free[RESERVE_none].size, + __dev_buckets_available(ca, stats, RESERVE_none), c->freelist_wait.list.first ? "waiting" : "empty", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ca->nr_open_buckets, @@ -772,8 +749,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) c->open_buckets_wait.list.first ? "waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], - c->btree_reserve_cache_nr, - bch2_allocator_states[ca->allocator_state]); + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { @@ -848,9 +824,6 @@ SHOW(bch2_dev) clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_reserve_stats) - reserve_stats_to_text(out, ca); - if (attr == &sysfs_alloc_debug) dev_alloc_debug_to_text(out, ca); @@ -890,9 +863,6 @@ STORE(bch2_dev) return ret; } - if (attr == &sysfs_wake_allocator) - bch2_wake_allocator(ca); - return size; } SYSFS_OPS(bch2_dev); @@ -918,11 +888,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - &sysfs_reserve_stats, - /* debug: */ &sysfs_alloc_debug, - &sysfs_wake_allocator, NULL }; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 89207fd7b617..caf59b977e2f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -471,37 +471,74 @@ TRACE_EVENT(invalidate, ); DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve), + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 avail, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, + int ret), + TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, reserve, 16 ) + __field(dev_t, dev ) + __array(char, reserve, 16 ) + __field(u64, avail ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, nouse ) + __field(bool, nonblocking ) + __field(int, ret ) ), TP_fast_assign( __entry->dev = ca->dev; strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->avail = avail; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->nouse = nouse; + __entry->nonblocking = nonblocking; + __entry->ret = ret; ), - TP_printk("%d,%d reserve %s", + TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->reserve) + __entry->reserve, + __entry->avail, + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->nouse, + __entry->nonblocking, + __entry->ret) ); DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve) + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 avail, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, + int 
ret),
+	TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret)
 );
 
 DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
-	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
-	TP_ARGS(ca, alloc_reserve)
-);
-
-DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
-	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
-	TP_ARGS(ca, alloc_reserve)
+	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+		 u64 avail,
+		 u64 seen,
+		 u64 open,
+		 u64 need_journal_commit,
+		 u64 nouse,
+		 bool nonblocking,
+		 int ret),
+	TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret)
 );
 
 /* Moving IO */
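
Editor's note: the buckets.h hunk above replaces the old freelist accounting with a reserve-aware availability calculation, where each less-strict reserve level sets aside progressively more buckets via fallthrough. Below is a standalone userspace sketch of that cascade; the enum, function shape, and arithmetic mirror the patch, but the enumerator values and all sample numbers are invented for illustration (the kernel version also BUG()s on an unknown reserve, omitted here).

#include <stdio.h>
#include <stdint.h>

enum alloc_reserve {
	RESERVE_btree_movinggc,	/* strictest: sees every free bucket */
	RESERVE_btree,
	RESERVE_movinggc,
	RESERVE_none,		/* normal foreground writes */
};

static int64_t buckets_available(int64_t nbuckets, int64_t first_bucket,
				 int64_t unavailable, int64_t nr_open,
				 int64_t nr_btree_reserve,
				 enum alloc_reserve reserve)
{
	int64_t total = nbuckets - first_bucket;
	int64_t reserved = 0;

	switch (reserve) {
	case RESERVE_none:		/* keep ~1/64th of the device back */
		reserved += nbuckets >> 6;
		/* fall through */
	case RESERVE_movinggc:		/* copygc may dip into that headroom */
		reserved += nr_btree_reserve;
		/* fall through */
	case RESERVE_btree:		/* btree still keeps one reserve back */
		reserved += nr_btree_reserve;
		/* fall through */
	case RESERVE_btree_movinggc:	/* nothing held back */
		break;
	}

	int64_t avail = total - unavailable - nr_open - reserved;
	return avail > 0 ? avail : 0;	/* clamp, as max_t(s64, 0, ...) does */
}

int main(void)
{
	for (enum alloc_reserve r = RESERVE_btree_movinggc; r <= RESERVE_none; r++)
		printf("reserve %d: %lld buckets\n", (int) r,
		       (long long) buckets_available(1 << 20, 64, 1 << 18,
						     32, 512, r));
	return 0;
}

Each reserve level thus sees a strictly smaller pool than the one below it, which is what lets the patch delete dev_buckets_reclaimable() and the per-reserve freelist fifos outright.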
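
The movinggc.c hunks change copygc from waiting on a filled RESERVE_movinggc fifo to budgeting directly off bucket availability. This is a userspace sketch of the two calculations as the patch leaves them: the per-pass reserve budget in bch2_copygc(), clamped to 1/64th of the device, and the wait amount from bch2_copygc_wait_amount(), which lets fragmentation grow to half of what is still allocatable. All input values are made up.

#include <stdio.h>
#include <stdint.h>

static int64_t min64(int64_t a, int64_t b) { return a < b ? a : b; }
static int64_t max64(int64_t a, int64_t b) { return a > b ? a : b; }

int main(void)
{
	int64_t nbuckets	= 1 << 20;  /* device size in buckets */
	int64_t bucket_size	= 1024;     /* sectors per bucket */
	int64_t avail_movinggc	= 70000;    /* dev_buckets_available(ca, RESERVE_movinggc) */
	int64_t avail_none	= 52000;    /* __dev_buckets_available(ca, usage, RESERVE_none) */
	int64_t fragmented	= 9 << 20;  /* usage.d[BCH_DATA_user].fragmented, in sectors */

	/* bch2_copygc(): how much copygc may move in one pass */
	int64_t sectors_reserved =
		min64(avail_movinggc, nbuckets >> 6) * bucket_size;

	/* bch2_copygc_wait_amount(): how much more may be written first */
	int64_t fragmented_allowed = (avail_none * bucket_size) >> 1;
	int64_t wait = max64(0, fragmented_allowed - fragmented);

	printf("sectors_reserved %lld, wait %lld sectors\n",
	       (long long) sectors_reserved, (long long) wait);
	return 0;
}

Note the design shift: since there is no allocator thread left to keep a fifo topped up, copygc no longer blocks on have_copygc_reserve(); it simply reads availability at the RESERVE_movinggc level, which the cascade above guarantees is at least as large as what foreground writes see.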
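
Finally, the trace.h hunk widens the bucket_alloc event class so a bucket_alloc_fail record carries the allocator's whole decision state rather than just the reserve name. The mock below reproduces the TP_printk() format from the patch in plain userspace C, to show what a trace line will look like; the field glosses in the comments are inferred from the counter names, and every sample value (including the error code) is hypothetical.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct bucket_alloc_event {
	unsigned	major, minor;	/* MAJOR()/MINOR() of ca->dev */
	const char	*reserve;
	uint64_t	avail;		/* buckets usable at this reserve */
	uint64_t	seen;		/* buckets examined this attempt */
	uint64_t	open;		/* skipped: bucket already open */
	uint64_t	need_journal_commit; /* skipped: awaiting journal flush */
	uint64_t	nouse;		/* skipped: marked nouse */
	bool		nonblocking;
	int		ret;
};

static void print_bucket_alloc(const struct bucket_alloc_event *e)
{
	/* same format string as the patched TP_printk() */
	printf("%u,%u reserve %s avail %llu seen %llu open %llu "
	       "need_journal_commit %llu nouse %llu nonblocking %u ret %i\n",
	       e->major, e->minor, e->reserve,
	       (unsigned long long) e->avail,
	       (unsigned long long) e->seen,
	       (unsigned long long) e->open,
	       (unsigned long long) e->need_journal_commit,
	       (unsigned long long) e->nouse,
	       e->nonblocking, e->ret);
}

int main(void)
{
	struct bucket_alloc_event e = {
		.major = 254, .minor = 0, .reserve = "none",
		.avail = 0, .seen = 4096, .open = 7,
		.need_journal_commit = 4089, .nouse = 0,
		.nonblocking = true, .ret = -28, /* sample: -ENOSPC */
	};
	print_bucket_alloc(&e);
	return 0;
}

A record like this one, where nearly every bucket seen is waiting on a journal commit, is exactly the situation the journal_io.c hunk addresses: journal_write_done() now wakes freelist_wait and resets the allocation cursors once the flush reaches disk, rather than relying on an allocator thread to notice.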