summary | refs | log | tree | commit | diff | stats
path: root/fs/f2fs/file.c
diff options
context:
space:
mode:
author: Eric Biggers <ebiggers@google.com> 2021-07-23 00:59:21 -0700
committer: Jaegeuk Kim <jaegeuk@kernel.org> 2021-12-10 15:48:30 -0800
commit: a1e09b03e6f5c1d713c88259909137c0fd264ede (patch)
tree: bbb25b680bdb35e3b9b87868e165db453519c619 /fs/f2fs/file.c
parent: 1517c1a7a4456f080fabc4ac9853930e4b880d14 (diff)
download: linux-a1e09b03e6f5c1d713c88259909137c0fd264ede.tar.gz
linux-a1e09b03e6f5c1d713c88259909137c0fd264ede.tar.bz2
linux-a1e09b03e6f5c1d713c88259909137c0fd264ede.zip
f2fs: use iomap for direct I/O
Make f2fs_file_read_iter() and f2fs_file_write_iter() use the iomap direct I/O implementation instead of the fs/direct-io.c one.

The iomap implementation is more efficient, and it also avoids the need to add new features and optimizations to the old implementation.

This new implementation also eliminates the need for f2fs to hook bio submission and completion and to allocate memory per-bio. This is because it's possible to correctly update f2fs's in-flight DIO counters using __iomap_dio_rw() in combination with an implementation of iomap_dio_ops::end_io() (as suggested by Christoph Hellwig).

When possible, this new implementation preserves existing f2fs behavior such as the conditions for falling back to buffered I/O.

This patch has been tested with xfstests by running 'gce-xfstests -c f2fs -g auto -X generic/017' with and without this patch; no regressions were seen. (Some tests fail both before and after. generic/017 hangs both before and after, so it had to be excluded.)

Signed-off-by: Eric Biggers <ebiggers@google.com>
[Jaegeuk Kim: use spin_lock_bh for f2fs_update_iostat in softirq]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Diffstat (limited to 'fs/f2fs/file.c')
-rw-r--r-- fs/f2fs/file.c 342
1 files changed, 301 insertions, 41 deletions
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 048db4852b28..7516d97d5016 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -24,6 +24,7 @@
#include <linux/sched/signal.h>
#include <linux/fileattr.h>
#include <linux/fadvise.h>
+#include <linux/iomap.h>
#include "f2fs.h"
#include "node.h"
@@ -4219,23 +4220,145 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return __f2fs_ioctl(filp, cmd, arg);
}
-static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+/*
+ * Return %true if the given read or write request should use direct I/O, or
+ * %false if it should use buffered I/O.
+ */
+static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ unsigned int align;
+
+ if (!(iocb->ki_flags & IOCB_DIRECT)) /* direct I/O was not requested for this iocb */
+ return false;
+
+ if (f2fs_force_buffered_io(inode, iocb, iter)) /* f2fs-side override; helper is defined elsewhere */
+ return false;
+
+ /*
+ * Direct I/O not aligned to the disk's logical_block_size will be
+ * attempted, but will fail with -EINVAL.
+ *
+ * f2fs additionally requires that direct I/O be aligned to the
+ * filesystem block size, which is often a stricter requirement.
+ * However, f2fs traditionally falls back to buffered I/O on requests
+ * that are logical_block_size-aligned but not fs-block aligned.
+ *
+ * The below logic implements this behavior.
+ */
+ align = iocb->ki_pos | iov_iter_alignment(iter); /* one value so a single IS_ALIGNED() covers offset and iter */
+ if (!IS_ALIGNED(align, i_blocksize(inode)) &&
+ IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev)))
+ return false; /* logical-block aligned but not fs-block aligned: buffered fallback */
+
+ return true; /* fs-block aligned, or not even logical-block aligned (see note above) */
+}
+
+static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error,
+ unsigned int flags)
+{ /* installed as .end_io in f2fs_iomap_dio_read_ops; 'flags' is not used here */
+ struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp));
+
+ dec_page_count(sbi, F2FS_DIO_READ); /* balances inc_page_count() in f2fs_dio_read_iter() */
+ if (error)
+ return error; /* error is passed back unchanged; no iostat update on failure */
+ f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size); /* account 'size' bytes as direct read */
+ return 0;
+}
+
+static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = { /* handed to __iomap_dio_rw() by f2fs_dio_read_iter() */
+ .end_io = f2fs_dio_read_end_io,
+};
+
+static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) /* direct read via iomap */
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- int ret;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ const loff_t pos = iocb->ki_pos; /* captured up front for the enter/exit tracepoints */
+ const size_t count = iov_iter_count(to);
+ struct iomap_dio *dio;
+ ssize_t ret;
+
+ if (count == 0)
+ return 0; /* skip atime update */
+
+ trace_f2fs_direct_IO_enter(inode, pos, count, READ);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!down_read_trylock(&fi->i_gc_rwsem[READ])) { /* non-blocking request: only try the lock */
+ ret = -EAGAIN;
+ goto out; /* lock not held, so up_read() and file_accessed() are skipped */
+ }
+ } else {
+ down_read(&fi->i_gc_rwsem[READ]);
+ }
+
+ /*
+ * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of
+ * the higher-level function iomap_dio_rw() in order to ensure that the
+ * F2FS_DIO_READ counter will be decremented correctly in all cases.
+ */
+ inc_page_count(sbi, F2FS_DIO_READ);
+ dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops,
+ &f2fs_iomap_dio_read_ops, 0, 0);
+ if (IS_ERR_OR_NULL(dio)) { /* NULL or ERR_PTR: nothing to pass to iomap_dio_complete() */
+ ret = PTR_ERR_OR_ZERO(dio);
+ if (ret != -EIOCBQUEUED) /* NOTE(review): queued I/O presumably decrements via the end_io hook later */
+ dec_page_count(sbi, F2FS_DIO_READ);
+ } else {
+ ret = iomap_dio_complete(dio); /* presumably invokes ->end_io, which drops the counter; confirm */
+ }
+
+ up_read(&fi->i_gc_rwsem[READ]);
+
+ file_accessed(file);
+out:
+ trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret);
+ return ret;
+}
+
+static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) /* picks the direct or buffered read path */
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- ret = generic_file_read_iter(iocb, iter);
+ if (f2fs_should_use_dio(inode, iocb, to))
+ return f2fs_dio_read_iter(iocb, to); /* direct path; its iostat is updated in f2fs_dio_read_end_io() */
+ ret = filemap_read(iocb, to, 0); /* buffered path */
if (ret > 0)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_READ_IO, ret);
-
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); /* only a positive byte count is accounted */
return ret;
}
+static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) /* pre-write checks; byte count or -errno */
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ ssize_t count;
+ int err;
+
+ if (IS_IMMUTABLE(inode))
+ return -EPERM; /* immutable inode: refuse the write */
+
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
+ return -EPERM; /* same error when FI_COMPRESS_RELEASED is set on the inode */
+
+ count = generic_write_checks(iocb, from);
+ if (count <= 0)
+ return count; /* zero or negative result is handed straight back to the caller */
+
+ err = file_modified(file);
+ if (err)
+ return err;
+ return count; /* success: the value generic_write_checks() produced */
+}
+
/*
* Preallocate blocks for a write request, if it is possible and helpful to do
* so. Returns a positive number if blocks may have been preallocated, 0 if no
@@ -4243,15 +4366,14 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the
* requested blocks (not just some of them) have been allocated.
*/
-static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter)
+static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
+ bool dio)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
const loff_t pos = iocb->ki_pos;
const size_t count = iov_iter_count(iter);
struct f2fs_map_blocks map = {};
- bool dio = (iocb->ki_flags & IOCB_DIRECT) &&
- !f2fs_force_buffered_io(inode, iocb, iter);
int flag;
int ret;
@@ -4308,13 +4430,174 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter)
return map.m_len;
}
-static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EOPNOTSUPP; /* this buffered path rejects non-blocking requests outright */
+
+ current->backing_dev_info = inode_to_bdi(inode); /* set only around generic_perform_write() */
+ ret = generic_perform_write(file, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+
+ if (ret > 0) {
+ iocb->ki_pos += ret; /* advance the file position by the bytes written */
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret); /* account them as buffered write I/O */
+ }
+ return ret; /* bytes written, or the zero/negative result of generic_perform_write() */
+}
+
+static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
+ unsigned int flags)
+{ /* installed as .end_io in f2fs_iomap_dio_write_ops; 'flags' is not used here */
+ struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp));
+
+ dec_page_count(sbi, F2FS_DIO_WRITE); /* balances inc_page_count() in f2fs_dio_write_iter() */
+ if (error)
+ return error; /* error is passed back unchanged; no iostat update on failure */
+ f2fs_update_iostat(sbi, APP_DIRECT_IO, size); /* account 'size' bytes as direct write */
+ return 0;
+}
+
+static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { /* handed to __iomap_dio_rw() by f2fs_dio_write_iter() */
+ .end_io = f2fs_dio_write_end_io,
+};
+
+static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ bool *may_need_sync)
+{ /* direct write via iomap, with a buffered fallback for any bytes left in 'from' */
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ const bool do_opu = f2fs_lfs_mode(sbi); /* when true, i_gc_rwsem[READ] is also held and FI_UPDATE_WRITE is not set */
+ const int whint_mode = F2FS_OPTION(sbi).whint_mode;
+ const loff_t pos = iocb->ki_pos; /* starting offset, kept for i_size and trace use */
+ const ssize_t count = iov_iter_count(from);
+ const enum rw_hint hint = iocb->ki_hint; /* saved so ki_hint can be restored after the I/O */
+ unsigned int dio_flags;
+ struct iomap_dio *dio;
+ ssize_t ret;
+
+ trace_f2fs_direct_IO_enter(inode, pos, count, WRITE);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* f2fs_convert_inline_inode() and block allocation can block */
+ if (f2fs_has_inline_data(inode) ||
+ !f2fs_overwrite_io(inode, pos, count)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ if (!down_read_trylock(&fi->i_gc_rwsem[WRITE])) { /* non-blocking request: only try the lock */
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ up_read(&fi->i_gc_rwsem[WRITE]); /* undo the first trylock before bailing out */
+ ret = -EAGAIN;
+ goto out;
+ }
+ } else {
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ goto out; /* no locks are held yet at this point */
+
+ down_read(&fi->i_gc_rwsem[WRITE]);
+ if (do_opu)
+ down_read(&fi->i_gc_rwsem[READ]);
+ }
+ if (whint_mode == WHINT_MODE_OFF)
+ iocb->ki_hint = WRITE_LIFE_NOT_SET; /* temporary override; undone from 'hint' below */
+
+ /*
+ * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of
+ * the higher-level function iomap_dio_rw() in order to ensure that the
+ * F2FS_DIO_WRITE counter will be decremented correctly in all cases.
+ */
+ inc_page_count(sbi, F2FS_DIO_WRITE);
+ dio_flags = 0;
+ if (pos + count > inode->i_size) /* write would end beyond the current i_size */
+ dio_flags |= IOMAP_DIO_FORCE_WAIT; /* presumably forces synchronous completion; confirm in iomap docs */
+ dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
+ &f2fs_iomap_dio_write_ops, dio_flags, 0);
+ if (IS_ERR_OR_NULL(dio)) { /* NULL or ERR_PTR: nothing to pass to iomap_dio_complete() */
+ ret = PTR_ERR_OR_ZERO(dio);
+ if (ret == -ENOTBLK)
+ ret = 0; /* not an error here: bytes still in 'from' go to the buffered fallback below */
+ if (ret != -EIOCBQUEUED) /* NOTE(review): queued I/O presumably decrements via the end_io hook later */
+ dec_page_count(sbi, F2FS_DIO_WRITE);
+ } else {
+ ret = iomap_dio_complete(dio); /* presumably invokes ->end_io, which drops the counter; confirm */
+ }
+
+ if (whint_mode == WHINT_MODE_OFF)
+ iocb->ki_hint = hint; /* restore the caller's original hint */
+ if (do_opu)
+ up_read(&fi->i_gc_rwsem[READ]);
+ up_read(&fi->i_gc_rwsem[WRITE]);
+
+ if (ret < 0)
+ goto out; /* includes -EIOCBQUEUED */
+ if (pos + ret > inode->i_size)
+ f2fs_i_size_write(inode, pos + ret); /* grow i_size to cover the bytes written */
+ if (!do_opu)
+ set_inode_flag(inode, FI_UPDATE_WRITE);
+
+ if (iov_iter_count(from)) {
+ ssize_t ret2;
+ loff_t bufio_start_pos = iocb->ki_pos; /* where the buffered remainder begins */
+
+ /*
+ * The direct write was partial, so we need to fall back to a
+ * buffered write for the remainder.
+ */
+
+ ret2 = f2fs_buffered_write_iter(iocb, from);
+ if (iov_iter_count(from)) /* fallback also left bytes unwritten */
+ f2fs_write_failed(inode, iocb->ki_pos);
+ if (ret2 < 0)
+ goto out; /* NOTE(review): 'ret' (direct bytes, possibly 0) is returned, not the error in ret2 */
+
+ /*
+ * Ensure that the pagecache pages are written to disk and
+ * invalidated to preserve the expected O_DIRECT semantics.
+ */
+ if (ret2 > 0) {
+ loff_t bufio_end_pos = bufio_start_pos + ret2 - 1; /* inclusive last byte of the buffered part */
+
+ ret += ret2; /* total = direct bytes + buffered bytes */
+
+ ret2 = filemap_write_and_wait_range(file->f_mapping,
+ bufio_start_pos,
+ bufio_end_pos);
+ if (ret2 < 0)
+ goto out; /* NOTE(review): flush error is not returned; 'ret' already counts the buffered bytes */
+ invalidate_mapping_pages(file->f_mapping,
+ bufio_start_pos >> PAGE_SHIFT,
+ bufio_end_pos >> PAGE_SHIFT);
+ }
+ } else {
+ /* iomap_dio_rw() already handled the generic_write_sync(). */
+ *may_need_sync = false;
+ }
+out:
+ trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret);
+ return ret;
+}
+
+static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
const loff_t orig_pos = iocb->ki_pos;
const size_t orig_count = iov_iter_count(from);
loff_t target_size;
+ bool dio;
+ bool may_need_sync = true;
int preallocated;
ssize_t ret;
@@ -4337,42 +4620,23 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
inode_lock(inode);
}
- if (unlikely(IS_IMMUTABLE(inode))) {
- ret = -EPERM;
- goto out_unlock;
- }
-
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
- ret = -EPERM;
- goto out_unlock;
- }
-
- ret = generic_write_checks(iocb, from);
+ ret = f2fs_write_checks(iocb, from);
if (ret <= 0)
goto out_unlock;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!f2fs_overwrite_io(inode, iocb->ki_pos,
- iov_iter_count(from)) ||
- f2fs_has_inline_data(inode) ||
- f2fs_force_buffered_io(inode, iocb, from)) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
+ /* Determine whether we will do a direct write or a buffered write. */
+ dio = f2fs_should_use_dio(inode, iocb, from);
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- goto out_unlock;
- }
/* Possibly preallocate the blocks for the write. */
target_size = iocb->ki_pos + iov_iter_count(from);
- preallocated = f2fs_preallocate_blocks(iocb, from);
+ preallocated = f2fs_preallocate_blocks(iocb, from, dio);
if (preallocated < 0)
ret = preallocated;
else
- ret = __generic_file_write_iter(iocb, from);
+ /* Do the actual write. */
+ ret = dio ?
+ f2fs_dio_write_iter(iocb, from, &may_need_sync):
+ f2fs_buffered_write_iter(iocb, from);
/* Don't leave any preallocated blocks around past i_size. */
if (preallocated && i_size_read(inode) < target_size) {
@@ -4387,15 +4651,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
clear_inode_flag(inode, FI_PREALLOCATED_ALL);
-
- if (ret > 0)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
-
out_unlock:
inode_unlock(inode);
out:
trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret);
- if (ret > 0)
+ if (ret > 0 && may_need_sync)
ret = generic_write_sync(iocb, ret);
return ret;
}