author	Kent Overstreet <kent.overstreet@linux.dev>	2022-11-02 17:12:00 -0400
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-10-22 17:09:51 -0400
commit	a8b3a677e786fa869d220a6a78b5532a36dc2f4d (patch)
tree	3fdbdbb71945ae42dab8dc94971e1c78286eaa63 /fs/bcachefs/fs-io.c
parent	4dcd1cae72912ab08d313ee5a730608022b211d4 (diff)
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:

- New boolean filesystem and inode option, nocow: note that when nocow
  is enabled, data checksumming and compression are implicitly disabled.

- To prevent in-place writes from racing with data moves (data_update.c)
  or bucket reuse (i.e. a bucket being reused and re-allocated while a
  nocow write is in flight), we have a new locking mechanism. Buckets
  can be locked for either data update or data move, using a fixed size
  hash table of two_state_shared locks. We don't have any chaining,
  meaning updates and moves to different buckets that hash to the same
  lock will wait unnecessarily - we'll want to watch for this becoming
  an issue. (A minimal illustrative sketch of this idea follows after
  this message.)

- The allocator path also needs to check for in-place writes in flight
  to a given bucket before giving it out: thus we add another counter to
  bucket_alloc_state so we can track this.

- Fsync may now need to issue cache flushes to block devices instead of
  flushing the journal. We add a device bitmask to bch_inode_info,
  ei_devs_need_flush, which tracks devices that need to have flushes
  issued - note that this will lead to unnecessary flushes when other
  codepaths have already issued flushes; we may want to replace this
  with a sequence number.

- New nocow write path: look up extents, and if they're writable, write
  to them in place - otherwise fall back to the normal COW write path.

XXX: switch to sequence numbers instead of a bitmask for devs needing a
journal flush

XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
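[Editor's sketch] To make the bucket-locking component above concrete, here is a
minimal, self-contained userspace sketch of the idea: a fixed size hash table of
two-state shared locks keyed by (device, bucket), with no chaining. This is not
the in-tree bcachefs API; all names (nocow_lock_table, nocow_lock_bucket,
two_state_lock, ...) and the hash constants are made up for illustration, and
sched_yield() stands in for a real wait/wake mechanism.

	/*
	 * Illustrative sketch only: a fixed-size hash table of "two state
	 * shared" locks. A slot may be shared by any number of holders in one
	 * state (nocow writes) or the other (data moves / bucket reuse), but
	 * never both at once.
	 */
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <sched.h>

	#define NOCOW_LOCKS_BITS	10
	#define NOCOW_LOCKS_NR		(1U << NOCOW_LOCKS_BITS)

	struct two_state_shared_lock {
		/* > 0: held for nocow writes; < 0: held for moves; 0: unlocked */
		atomic_long v;
	};

	struct nocow_lock_table {
		struct two_state_shared_lock l[NOCOW_LOCKS_NR];
	};

	static struct two_state_shared_lock *
	nocow_lock_bucket(struct nocow_lock_table *t, unsigned dev, uint64_t bucket)
	{
		/* No chaining: distinct buckets that collide share a slot */
		uint64_t h = (dev    * 0x9e3779b97f4a7c15ULL) ^
			     (bucket * 0xff51afd7ed558ccdULL);

		return &t->l[h & (NOCOW_LOCKS_NR - 1)];
	}

	/* state: +1 to lock for a nocow write, -1 to lock for a data move */
	static void two_state_lock(struct two_state_shared_lock *l, long state)
	{
		while (true) {
			long v = atomic_load(&l->v);

			/* Join holders of the same state, or take a free lock */
			if ((v == 0 || (v > 0) == (state > 0)) &&
			    atomic_compare_exchange_weak(&l->v, &v, v + state))
				return;
			sched_yield();	/* real code would sleep/wait here */
		}
	}

	static void two_state_unlock(struct two_state_shared_lock *l, long state)
	{
		atomic_fetch_sub(&l->v, state);
	}

Under this scheme, a nocow write would take the slot for each bucket its extent
lives in with state +1 before issuing I/O, while data moves and bucket reuse
take the same slot with state -1; the two classes exclude each other while
operations of the same class proceed concurrently, and unrelated buckets that
hash to the same slot simply wait on each other, as noted in the message above.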
Diffstat (limited to 'fs/bcachefs/fs-io.c')
-rw-r--r--	fs/bcachefs/fs-io.c	98
1 file changed, 89 insertions(+), 9 deletions(-)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index b5cf0a3218ea..ec575b27eedb 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -35,6 +35,72 @@
#include <trace/events/writeback.h>
+struct nocow_flush {
+ struct closure *cl;
+ struct bch_dev *ca;
+ struct bio bio;
+};
+
+static void nocow_flush_endio(struct bio *_bio)
+{
+ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
+
+ closure_put(bio->cl);
+ percpu_ref_put(&bio->ca->io_ref);
+ bio_put(&bio->bio);
+}
+
+static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct closure *cl)
+{
+ struct nocow_flush *bio;
+ struct bch_dev *ca;
+ struct bch_devs_mask devs;
+ unsigned dev;
+
+ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
+ if (dev == BCH_SB_MEMBERS_MAX)
+ return;
+
+ devs = inode->ei_devs_need_flush;
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
+
+ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
+
+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
+ REQ_OP_FLUSH,
+ GFP_KERNEL,
+ &c->nocow_flush_bioset),
+ struct nocow_flush, bio);
+ bio->cl = cl;
+ bio->ca = ca;
+ bio->bio.bi_end_io = nocow_flush_endio;
+ closure_bio_submit(&bio->bio, cl);
+ }
+}
+
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+ closure_sync(&cl);
+
+ return 0;
+}
+
static inline bool bio_full(struct bio *bio, unsigned len)
{
if (bio->bi_vcnt >= bio->bi_max_vecs)
@@ -1327,6 +1393,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->subvol = inode->ei_subvol;
op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done;
+ op->devs_need_flush = &inode->ei_devs_need_flush;
op->wbio.bio.bi_iter.bi_sector = sector;
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
}
@@ -2148,10 +2215,12 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio)
if (!dio->op.error) {
ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
- if (ret)
+ if (ret) {
dio->op.error = ret;
- else
+ } else {
bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
+ bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
+ }
}
if (dio->sync) {
@@ -2296,6 +2365,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.nr_replicas = dio->op.opts.data_replicas;
dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+ dio->op.devs_need_flush = &inode->ei_devs_need_flush;
if (sync)
dio->op.flags |= BCH_WRITE_SYNC;
@@ -2495,19 +2565,21 @@ out:
* inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
* insert trigger: look up the btree inode instead
*/
-static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum)
+static int bch2_flush_inode(struct bch_fs *c,
+ struct bch_inode_info *inode)
{
- struct bch_inode_unpacked inode;
+ struct bch_inode_unpacked u;
int ret;
if (c->opts.journal_flush_disabled)
return 0;
- ret = bch2_inode_find_by_inum(c, inum, &inode);
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
if (ret)
return ret;
- return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq);
+ return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
}
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -2518,7 +2590,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = file_write_and_wait_range(file, start, end);
ret2 = sync_inode_metadata(&inode->v, 1);
- ret3 = bch2_flush_inode(c, inode_inum(inode));
+ ret3 = bch2_flush_inode(c, inode);
return bch2_err_class(ret ?: ret2 ?: ret3);
}
@@ -3105,6 +3177,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
continue;
}
+ /*
+ * XXX: for nocow mode, we should promote shared extents to
+ * unshared here
+ */
+
sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset;
if (!bkey_extent_is_allocation(k.k)) {
@@ -3368,7 +3445,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
IS_SYNC(file_inode(file_dst)))
- ret = bch2_flush_inode(c, inode_inum(dst));
+ ret = bch2_flush_inode(c, dst);
err:
bch2_quota_reservation_put(c, dst, &quota_res);
bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
@@ -3622,6 +3699,7 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
void bch2_fs_fsio_exit(struct bch_fs *c)
{
+ bioset_exit(&c->nocow_flush_bioset);
bioset_exit(&c->dio_write_bioset);
bioset_exit(&c->dio_read_bioset);
bioset_exit(&c->writepage_bioset);
@@ -3641,7 +3719,9 @@ int bch2_fs_fsio_init(struct bch_fs *c)
BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_write_bioset,
4, offsetof(struct dio_write, op.wbio.bio),
- BIOSET_NEED_BVECS))
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->nocow_flush_bioset,
+ 1, offsetof(struct nocow_flush, bio), 0))
ret = -ENOMEM;
pr_verbose_init(c->opts, "ret %i", ret);