bcachefs: Run btree updates after write out of write_point

In the write path, after the write to the block device(s) completes, we have to punt to process context to do the btree update.

Instead of using the work item embedded in op->cl, this patch switches to a per-write-point work item. This helps with two different issues:

- lock contention: btree updates to the same write point will (usually) be updating the same alloc keys
- context switch overhead: when we're bottlenecked on btree updates, having a thread (running out of a work item) check the write point for completed ops is cheaper than queueing up a new work item and waking up a kworker.

In an arbitrary benchmark (4k random writes with fio, running inside a VM), this patch resulted in a 10% improvement in total IOPS.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
author: Kent Overstreet <kent.overstreet@linux.dev> 2022-10-31 16:13:05 -0400
committer: Kent Overstreet <kent.overstreet@linux.dev> 2023-10-22 17:09:29 -0400
commit: b17d3cec14b487924df709dbeffb900f124a2607 (patch)
tree: e3a5ce061e8e294d8d0570a7916996cebefb9d8d /fs/bcachefs
parent: 5f417394033a0d8bfb31d02b3becf7381dc13867 (diff)
download: linux-stable-b17d3cec14b487924df709dbeffb900f124a2607.tar.gz
linux-stable-b17d3cec14b487924df709dbeffb900f124a2607.tar.bz2
linux-stable-b17d3cec14b487924df709dbeffb900f124a2607.zip
11 files changed, 246 insertions, 137 deletions
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 76a4b8029bdf..c4b4689fdd0f 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -762,16 +762,17 @@ out:
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
-				unsigned target,
-				unsigned erasure_code,
-				struct write_point_specifier write_point,
-				struct bch_devs_list *devs_have,
-				unsigned nr_replicas,
-				unsigned nr_replicas_required,
-				enum alloc_reserve reserve,
-				unsigned flags,
-				struct closure *cl)
+int bch2_alloc_sectors_start(struct bch_fs *c,
+			     unsigned target,
+			     unsigned erasure_code,
+			     struct write_point_specifier write_point,
+			     struct bch_devs_list *devs_have,
+			     unsigned nr_replicas,
+			     unsigned nr_replicas_required,
+			     enum alloc_reserve reserve,
+			     unsigned flags,
+			     struct closure *cl,
+			     struct write_point **wp_ret)
 {
 	struct write_point *wp;
 	struct open_bucket *ob;
@@ -792,7 +793,7 @@ retry:
 	write_points_nr = c->write_points_nr;
 	have_cache	= false;
 
-	wp = writepoint_find(c, write_point.v);
+	*wp_ret = wp = writepoint_find(c, write_point.v);
 
 	if (wp->data_type == BCH_DATA_user)
 		ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@@ -848,7 +849,7 @@ alloc_done:
 
 	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
 
-	return wp;
+	return 0;
 err:
 	open_bucket_for_each(c, &wp->ptrs, ob, i)
 		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
@@ -866,9 +867,9 @@ err:
 	switch (ret) {
 	case -OPEN_BUCKETS_EMPTY:
 	case -FREELIST_EMPTY:
-		return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
+		return cl ? -EAGAIN : -ENOSPC;
 	case -INSUFFICIENT_DEVICES:
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	default:
 		BUG();
 	}
@@ -895,13 +896,13 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
 				    struct bkey_i *k, unsigned sectors,
 				    bool cached)
-
 {
 	struct open_bucket *ob;
 	unsigned i;
 
 	BUG_ON(sectors > wp->sectors_free);
-	wp->sectors_free -= sectors;
+	wp->sectors_free	-= sectors;
+	wp->sectors_allocated	+= sectors;
 
 	open_bucket_for_each(c, &wp->ptrs, ob, i) {
 		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
@@ -942,6 +943,10 @@ static inline void writepoint_init(struct write_point *wp,
 {
 	mutex_init(&wp->lock);
 	wp->data_type = type;
+
+	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
+	INIT_LIST_HEAD(&wp->writes);
+	spin_lock_init(&wp->writes_lock);
 }
 
 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
@@ -997,3 +1002,33 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
 	}
 
 }
+
+static const char * const bch2_write_point_states[] = {
+#define x(n)	#n,
+	WRITE_POINT_STATES()
+#undef x
+	NULL
+};
+
+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct write_point *wp;
+	unsigned i;
+
+	for (wp = c->write_points;
+	     wp < c->write_points + ARRAY_SIZE(c->write_points);
+	     wp++) {
+		pr_buf(out, "%lu: ", wp->write_point);
+		bch2_hprint(out, wp->sectors_allocated);
+
+		pr_buf(out, " last wrote: ");
+		bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+		for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+			pr_buf(out, " %s: ", bch2_write_point_states[i]);
+			bch2_pr_time_units(out, wp->time[i]);
+		}
+
+		pr_newline(out);
+	}
+}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 3598c70b93b4..9b4389b09cbb 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -122,14 +122,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
 		      unsigned, unsigned *, bool *, enum alloc_reserve,
 		      unsigned, struct closure *);
 
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
-					     unsigned, unsigned,
-					     struct write_point_specifier,
-					     struct bch_devs_list *,
-					     unsigned, unsigned,
-					     enum alloc_reserve,
-					     unsigned,
-					     struct closure *);
+int bch2_alloc_sectors_start(struct bch_fs *,
+			     unsigned, unsigned,
+			     struct write_point_specifier,
+			     struct bch_devs_list *,
+			     unsigned, unsigned,
+			     enum alloc_reserve,
+			     unsigned,
+			     struct closure *,
+			     struct write_point **);
 
 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
 void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
@@ -156,4 +157,6 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *);
 
 void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
 
+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+
 #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index e3a3eb271158..9e00afb17559 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -81,18 +81,46 @@ struct dev_stripe_state {
 	u64			next_alloc[BCH_SB_MEMBERS_MAX];
 };
 
+#define WRITE_POINT_STATES()		\
+	x(stopped)			\
+	x(waiting_io)			\
+	x(waiting_work)			\
+	x(running)
+
+enum write_point_state {
+#define x(n)	WRITE_POINT_##n,
+	WRITE_POINT_STATES()
+#undef x
+	WRITE_POINT_STATE_NR
+};
+
 struct write_point {
-	struct hlist_node	node;
-	struct mutex		lock;
-	u64			last_used;
-	unsigned long		write_point;
-	enum bch_data_type	data_type;
+	struct {
+		struct hlist_node	node;
+		struct mutex		lock;
+		u64			last_used;
+		unsigned long		write_point;
+		enum bch_data_type	data_type;
 
-	/* calculated based on how many pointers we're actually going to use: */
-	unsigned		sectors_free;
+		/* calculated based on how many pointers we're actually going to use: */
+		unsigned		sectors_free;
+
+		struct open_buckets	ptrs;
+		struct dev_stripe_state	stripe;
+
+		u64			sectors_allocated;
+	} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+	struct {
+		struct work_struct	index_update_work;
+
+		struct list_head	writes;
+		spinlock_t		writes_lock;
 
-	struct open_buckets	ptrs;
-	struct dev_stripe_state	stripe;
+		enum write_point_state	state;
+		u64			last_state_change;
+		u64			time[WRITE_POINT_STATE_NR];
+	} __attribute__((__aligned__(SMP_CACHE_BYTES)));
 };
 
 struct write_point_specifier {
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index ff56c374ff2e..d1e3e2c76e30 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -191,6 +191,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
 	unsigned nr_reserve;
 	enum alloc_reserve alloc_reserve;
+	int ret;
 
 	if (flags & BTREE_INSERT_USE_RESERVE) {
 		nr_reserve	= 0;
@@ -213,7 +214,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 	mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-	wp = bch2_alloc_sectors_start(c,
+	ret = bch2_alloc_sectors_start(c,
 				      c->opts.metadata_target ?:
 				      c->opts.foreground_target,
 				      0,
@@ -221,9 +222,9 @@ retry:
 				      &devs_have,
 				      res->nr_replicas,
 				      c->opts.metadata_replicas_required,
-				      alloc_reserve, 0, cl);
-	if (IS_ERR(wp))
-		return ERR_CAST(wp);
+				      alloc_reserve, 0, cl, &wp);
+	if (unlikely(ret))
+		return ERR_PTR(ret);
 
 	if (wp->sectors_free < btree_sectors(c)) {
 		struct open_bucket *ob;
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 2cc56979fcb3..6bebbd44ccc8 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -589,7 +589,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 	}
 }
 
-static void __bch2_write(struct closure *);
+static void __bch2_write(struct bch_write_op *);
 
 static void bch2_write_done(struct closure *cl)
 {
@@ -686,22 +686,86 @@ err:
 	goto out;
 }
 
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+	if (state != wp->state) {
+		u64 now = ktime_get_ns();
+
+		if (wp->last_state_change &&
+		    time_after64(now, wp->last_state_change))
+			wp->time[wp->state] += now - wp->last_state_change;
+		wp->state = state;
+		wp->last_state_change = now;
+	}
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+	enum write_point_state state;
+
+	state = running			 ? WRITE_POINT_running :
+		!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+					 : WRITE_POINT_stopped;
+
+	__wp_update_state(wp, state);
+}
+
 static void bch2_write_index(struct closure *cl)
 {
 	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-	struct bch_fs *c = op->c;
+	struct write_point *wp = op->wp;
+	struct workqueue_struct *wq = index_update_wq(op);
 
-	__bch2_write_index(op);
+	barrier();
 
-	if (!(op->flags & BCH_WRITE_DONE)) {
-		continue_at(cl, __bch2_write, index_update_wq(op));
-	} else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
-		bch2_journal_flush_seq_async(&c->journal,
-					     *op_journal_seq(op),
-					     cl);
-		continue_at(cl, bch2_write_done, index_update_wq(op));
-	} else {
-		continue_at_nobarrier(cl, bch2_write_done, NULL);
+	/*
+	 * We're not using wp->writes_lock here, so this is racy: that's ok,
+	 * because this is just for diagnostic purposes, and we're running out
+	 * of interrupt context here so if we were to take the lock we'd have to
+	 * switch to spin_lock_irq()/irqsave(), which is not free:
+	 */
+	if (wp->state == WRITE_POINT_waiting_io)
+		__wp_update_state(wp, WRITE_POINT_waiting_work);
+
+	op->btree_update_ready = true;
+	queue_work(wq, &wp->index_update_work);
+}
+
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+	struct write_point *wp =
+		container_of(work, struct write_point, index_update_work);
+	struct bch_write_op *op;
+
+	while (1) {
+		spin_lock(&wp->writes_lock);
+		list_for_each_entry(op, &wp->writes, wp_list)
+			if (op->btree_update_ready) {
+				list_del(&op->wp_list);
+				goto unlock;
+			}
+		op = NULL;
+unlock:
+		wp_update_state(wp, op != NULL);
+		spin_unlock(&wp->writes_lock);
+
+		if (!op)
+			break;
+
+		op->flags |= BCH_WRITE_IN_WORKER;
+
+		__bch2_write_index(op);
+
+		if (!(op->flags & BCH_WRITE_DONE)) {
+			__bch2_write(op);
+		} else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
+			bch2_journal_flush_seq_async(&op->c->journal,
+						     *op_journal_seq(op),
+						     &op->cl);
+			continue_at(&op->cl, bch2_write_done, index_update_wq(op));
+		} else {
+			bch2_write_done(&op->cl);
+		}
 	}
 }
 
@@ -734,10 +798,8 @@ static void bch2_write_endio(struct bio *bio)
 
 	if (parent)
 		bio_endio(&parent->bio);
-	else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
-		closure_put(cl);
 	else
-		continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
+		closure_put(cl);
 }
 
 static void init_append_extent(struct bch_write_op *op,
@@ -1136,19 +1198,18 @@ err:
 	return ret;
 }
 
-static void __bch2_write(struct closure *cl)
+static void __bch2_write(struct bch_write_op *op)
 {
-	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 	struct bch_fs *c = op->c;
-	struct write_point *wp;
+	struct write_point *wp = NULL;
 	struct bio *bio = NULL;
-	bool skip_put = true;
 	unsigned nofs_flags;
 	int ret;
 
 	nofs_flags = memalloc_nofs_save();
 again:
 	memset(&op->failed, 0, sizeof(op->failed));
+	op->btree_update_ready = false;
 
 	do {
 		struct bkey_i *key_to_write;
@@ -1158,13 +1219,13 @@ again:
 		/* +1 for possible cache device: */
 		if (op->open_buckets.nr + op->nr_replicas + 1 >
 		    ARRAY_SIZE(op->open_buckets.v))
-			goto flush_io;
+			break;
 
 		if (bch2_keylist_realloc(&op->insert_keys,
 					op->inline_keys,
 					ARRAY_SIZE(op->inline_keys),
 					BKEY_EXTENT_U64s_MAX))
-			goto flush_io;
+			break;
 
 		if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
 		    percpu_ref_is_dying(&c->writes)) {
@@ -1177,7 +1238,7 @@ again:
 		 * freeing up space on specific disks, which means that
 		 * allocations for specific disks may hang arbitrarily long:
 		 */
-		wp = bch2_alloc_sectors_start(c,
+		ret = bch2_alloc_sectors_start(c,
 			op->target,
 			op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
 			op->write_point,
@@ -1187,53 +1248,34 @@ again:
 			op->alloc_reserve,
 			op->flags,
 			(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
-				      BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
-		EBUG_ON(!wp);
-
-		if (unlikely(IS_ERR(wp))) {
-			if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
-				ret = PTR_ERR(wp);
+				      BCH_WRITE_ONLY_SPECIFIED_DEVS))
+			? NULL : &op->cl,
+			&wp);
+		if (unlikely(ret)) {
+			if (unlikely(ret != -EAGAIN))
 				goto err;
-			}
 
-			goto flush_io;
+			break;
 		}
 
-		/*
-		 * It's possible for the allocator to fail, put us on the
-		 * freelist waitlist, and then succeed in one of various retry
-		 * paths: if that happens, we need to disable the skip_put
-		 * optimization because otherwise there won't necessarily be a
-		 * barrier before we free the bch_write_op:
-		 */
-		if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
-			skip_put = false;
+		EBUG_ON(!wp);
 
 		bch2_open_bucket_get(c, wp, &op->open_buckets);
 		ret = bch2_write_extent(op, wp, &bio);
+
 		bch2_alloc_sectors_done(c, wp);
 
 		if (ret < 0)
 			goto err;
 
-		if (ret) {
-			skip_put = false;
-		} else {
-			/*
-			 * for the skip_put optimization this has to be set
-			 * before we submit the bio:
-			 */
+		if (!ret)
 			op->flags |= BCH_WRITE_DONE;
-		}
 
 		bio->bi_end_io	= bch2_write_endio;
 		bio->bi_private	= &op->cl;
 		bio->bi_opf |= REQ_OP_WRITE;
 
-		if (!skip_put)
-			closure_get(bio->bi_private);
-		else
-			op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+		closure_get(bio->bi_private);
 
 		key_to_write = (void *) (op->insert_keys.keys_p +
 					 key_to_write_offset);
@@ -1241,55 +1283,49 @@ again:
 		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
 					  key_to_write);
 	} while (ret);
-
-	if (!skip_put)
-		continue_at(cl, bch2_write_index, index_update_wq(op));
 out:
-	memalloc_nofs_restore(nofs_flags);
-	return;
-err:
-	op->error = ret;
-	op->flags |= BCH_WRITE_DONE;
-
-	continue_at(cl, bch2_write_index, index_update_wq(op));
-	goto out;
-flush_io:
 	/*
 	 * If the write can't all be submitted at once, we generally want to
 	 * block synchronously as that signals backpressure to the caller.
-	 *
-	 * However, if we're running out of a workqueue, we can't block here
-	 * because we'll be blocking other work items from completing:
 	 */
-	if (current->flags & PF_WQ_WORKER) {
-		continue_at(cl, bch2_write_index, index_update_wq(op));
-		goto out;
-	}
-
-	closure_sync(cl);
-
-	if (!bch2_keylist_empty(&op->insert_keys)) {
+	if (!(op->flags & BCH_WRITE_DONE) &&
+	    !(op->flags & BCH_WRITE_IN_WORKER)) {
+		closure_sync(&op->cl);
 		__bch2_write_index(op);
 
-		if (op->error) {
-			op->flags |= BCH_WRITE_DONE;
-			continue_at_nobarrier(cl, bch2_write_done, NULL);
-			goto out;
-		}
+		if (!(op->flags & BCH_WRITE_DONE))
+			goto again;
+		bch2_write_done(&op->cl);
+	} else {
+		spin_lock(&wp->writes_lock);
+		op->wp = wp;
+		list_add_tail(&op->wp_list, &wp->writes);
+		if (wp->state == WRITE_POINT_stopped)
+			__wp_update_state(wp, WRITE_POINT_waiting_io);
+		spin_unlock(&wp->writes_lock);
+
+		continue_at(&op->cl, bch2_write_index, NULL);
 	}
 
-	goto again;
+	memalloc_nofs_restore(nofs_flags);
+	return;
+err:
+	op->error = ret;
+	op->flags |= BCH_WRITE_DONE;
+	goto out;
 }
 
 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
 {
-	struct closure *cl = &op->cl;
 	struct bio *bio = &op->wbio.bio;
 	struct bvec_iter iter;
 	struct bkey_i_inline_data *id;
 	unsigned sectors;
 	int ret;
 
+	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+	op->flags |= BCH_WRITE_DONE;
+
 	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
 
 	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
@@ -1317,11 +1353,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
 	set_bkey_val_bytes(&id->k, data_len);
 	bch2_keylist_push(&op->insert_keys);
 
-	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
-	op->flags |= BCH_WRITE_DONE;
-
-	continue_at_nobarrier(cl, bch2_write_index, NULL);
-	return;
+	__bch2_write_index(op);
 err:
 	bch2_write_done(&op->cl);
 }
@@ -1349,6 +1381,7 @@ void bch2_write(struct closure *cl)
 	struct bch_fs *c = op->c;
 	unsigned data_len;
 
+	EBUG_ON(op->cl.parent);
 	BUG_ON(!op->nr_replicas);
 	BUG_ON(!op->write_point.v);
 	BUG_ON(!bkey_cmp(op->pos, POS_MAX));
@@ -1381,18 +1414,14 @@ void bch2_write(struct closure *cl)
 		return;
 	}
 
-	continue_at_nobarrier(cl, __bch2_write, NULL);
+	__bch2_write(op);
 	return;
 err:
 	bch2_disk_reservation_put(c, &op->res);
 
-	if (op->end_io) {
-		EBUG_ON(cl->parent);
-		closure_debug_destroy(cl);
+	closure_debug_destroy(&op->cl);
+	if (op->end_io)
 		op->end_io(op);
-	} else {
-		closure_return(cl);
-	}
 }
 
 /* Cache promotion on read */
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index f8ce9543c9e3..b484d3387968 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -41,7 +41,7 @@ enum bch_write_flags {
 	__BCH_WRITE_CHECK_ENOSPC,
 	__BCH_WRITE_MOVE,
 	__BCH_WRITE_JOURNAL_SEQ_PTR,
-	__BCH_WRITE_SKIP_CLOSURE_PUT,
+	__BCH_WRITE_IN_WORKER,
 	__BCH_WRITE_DONE,
 };
 
@@ -59,7 +59,7 @@ enum bch_write_flags {
 
 /* Internal: */
 #define BCH_WRITE_JOURNAL_SEQ_PTR	(1U << __BCH_WRITE_JOURNAL_SEQ_PTR)
-#define BCH_WRITE_SKIP_CLOSURE_PUT	(1U << __BCH_WRITE_SKIP_CLOSURE_PUT)
+#define BCH_WRITE_IN_WORKER		(1U << __BCH_WRITE_IN_WORKER)
 #define BCH_WRITE_DONE			(1U << __BCH_WRITE_DONE)
 
 static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -115,6 +115,8 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
 
 void bch2_write(struct closure *);
 
+void bch2_write_point_do_index_updates(struct work_struct *);
+
 static inline struct bch_write_bio *wbio_init(struct bio *bio)
 {
 	struct bch_write_bio *wbio = to_wbio(bio);
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 53270f0a08a3..c316a39d381a 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -119,6 +119,7 @@ struct bch_write_op {
 	unsigned		nr_replicas_required:4;
 	unsigned		alloc_reserve:3;
 	unsigned		incompressible:1;
+	unsigned		btree_update_ready:1;
 
 	struct bch_devs_list	devs_have;
 	u16			target;
@@ -134,6 +135,9 @@ struct bch_write_op {
 
 	struct write_point_specifier write_point;
 
+	struct write_point	*wp;
+	struct list_head	wp_list;
+
 	struct disk_reservation	res;
 
 	struct open_buckets	open_buckets;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index e4201aecdba1..208482db3683 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -791,7 +791,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
 
 	if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
-				WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+				WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
 	    !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
 				WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
 	    !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 7e10adba5c75..ec672134cb18 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -180,6 +180,7 @@ read_attribute(btree_key_cache);
 read_attribute(btree_transactions);
 read_attribute(stripes_heap);
 read_attribute(open_buckets);
+read_attribute(write_points);
 
 read_attribute(internal_uuid);
 
@@ -418,6 +419,9 @@ SHOW(bch2_fs)
 	if (attr == &sysfs_open_buckets)
 		bch2_open_buckets_to_text(out, c);
 
+	if (attr == &sysfs_write_points)
+		bch2_write_points_to_text(out, c);
+
 	if (attr == &sysfs_compression_stats)
 		bch2_compression_stats_to_text(out, c);
 
@@ -563,6 +567,7 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_new_stripes,
 	&sysfs_stripes_heap,
 	&sysfs_open_buckets,
+	&sysfs_write_points,
 	&sysfs_io_timers_read,
 	&sysfs_io_timers_write,
 
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index f290c069c683..e1d36d9b092c 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -390,7 +390,7 @@ static const struct time_unit *pick_time_units(u64 ns)
 	return u;
 }
 
-static void pr_time_units(struct printbuf *out, u64 ns)
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
 {
 	const struct time_unit *u = pick_time_units(ns);
 
@@ -410,13 +410,13 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	       freq ?  div64_u64(NSEC_PER_SEC, freq) : 0);
 
 	pr_buf(out, "frequency:\t");
-	pr_time_units(out, freq);
+	bch2_pr_time_units(out, freq);
 
 	pr_buf(out, "\navg duration:\t");
-	pr_time_units(out, stats->average_duration);
+	bch2_pr_time_units(out, stats->average_duration);
 
 	pr_buf(out, "\nmax duration:\t");
-	pr_time_units(out, stats->max_duration);
+	bch2_pr_time_units(out, stats->max_duration);
 
 	i = eytzinger0_first(NR_QUANTILES);
 	u = pick_time_units(stats->quantiles.entries[i].m);
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index fd776fb281b7..085f1c357383 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -352,6 +352,8 @@ static inline void pr_sectors(struct printbuf *out, u64 v)
 	bch2_pr_units(out, v, v << 9);
 }
 
+void bch2_pr_time_units(struct printbuf *, u64);
+
 #ifdef __KERNEL__
 static inline void pr_time(struct printbuf *out, u64 time)
 {
author	Kent Overstreet <kent.overstreet@linux.dev>	2022-10-31 16:13:05 -0400
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-10-22 17:09:29 -0400
commit	b17d3cec14b487924df709dbeffb900f124a2607 (patch)
tree	e3a5ce061e8e294d8d0570a7916996cebefb9d8d /fs/bcachefs
parent	5f417394033a0d8bfb31d02b3becf7381dc13867 (diff)
download	linux-stable-b17d3cec14b487924df709dbeffb900f124a2607.tar.gz linux-stable-b17d3cec14b487924df709dbeffb900f124a2607.tar.bz2 linux-stable-b17d3cec14b487924df709dbeffb900f124a2607.zip