diff options
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r-- | fs/direct-io.c | 79 |
1 files changed, 64 insertions, 15 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c index 5fa2211e49ae..3aafb3343a65 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -45,6 +45,12 @@ #define DIO_PAGES 64 /* + * Flags for dio_complete() + */ +#define DIO_COMPLETE_ASYNC 0x01 /* This is async IO */ +#define DIO_COMPLETE_INVALIDATE 0x02 /* Can invalidate pages */ + +/* * This code generally works in units of "dio_blocks". A dio_block is * somewhere between the hard sector size and the filesystem block size. it * is determined on a per-invocation basis. When talking to the filesystem @@ -225,10 +231,11 @@ static inline struct page *dio_get_page(struct dio *dio, * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) { loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; + int err; /* * AIO submission can race with bio completion to get here while @@ -259,18 +266,37 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) ret = transferred; if (dio->end_io) { - int err; - // XXX: ki_pos?? err = dio->end_io(dio->iocb, offset, ret, dio->private); if (err) ret = err; } + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + * + * And this page cache invalidation has to be after dio->end_io(), as + * some filesystems convert unwritten extents to real allocations in + * end_io() when necessary, otherwise a racing buffer read would cache + * zeros from unwritten extents. + */ + if (flags & DIO_COMPLETE_INVALIDATE && + ret > 0 && dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages) { + err = invalidate_inode_pages2_range(dio->inode->i_mapping, + offset >> PAGE_SHIFT, + (offset + ret - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(err); + } + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(dio->inode); - if (is_async) { + if (flags & DIO_COMPLETE_ASYNC) { /* * generic_write_sync expects ki_pos to have been updated * already, but the submission path only does this for @@ -291,7 +317,7 @@ static void dio_aio_complete_work(struct work_struct *work) { struct dio *dio = container_of(work, struct dio, complete_work); - dio_complete(dio, 0, true); + dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE); } static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); @@ -304,6 +330,7 @@ static void dio_bio_end_aio(struct bio *bio) struct dio *dio = bio->bi_private; unsigned long remaining; unsigned long flags; + bool defer_completion = false; /* cleanup the bio */ dio_bio_complete(dio, bio); @@ -315,12 +342,24 @@ static void dio_bio_end_aio(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - if (dio->result && dio->defer_completion) { + /* + * Defer completion when defer_completion is set or + * when the inode has pages mapped and this is AIO write. + * We need to invalidate those pages because there is a + * chance they contain stale data in the case buffered IO + * went in between AIO submission and completion into the + * same region. + */ + if (dio->result) + defer_completion = dio->defer_completion || + (dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages); + if (defer_completion) { INIT_WORK(&dio->complete_work, dio_aio_complete_work); queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); } else { - dio_complete(dio, 0, true); + dio_complete(dio, 0, DIO_COMPLETE_ASYNC); } } } @@ -458,7 +497,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -838,7 +877,8 @@ out: */ if (sdio->boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); - dio_bio_submit(dio, sdio); + if (sdio->bio) + dio_bio_submit(dio, sdio); put_page(sdio->cur_page); sdio->cur_page = NULL; } @@ -1112,7 +1152,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; @@ -1210,10 +1250,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * For AIO O_(D)SYNC writes we need to defer completions to a workqueue * so that we can call ->fsync. */ - if (dio->is_async && iov_iter_rw(iter) == WRITE && - ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host))) { - retval = dio_set_defer_completion(dio); + if (dio->is_async && iov_iter_rw(iter) == WRITE) { + retval = 0; + if ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host)) + retval = dio_set_defer_completion(dio); + else if (!dio->inode->i_sb->s_dio_done_wq) { + /* + * In case of AIO write racing with buffered read we + * need to defer completion. We can't decide this now, + * however the workqueue needs to be initialized here. + */ + retval = sb_init_dio_done_wq(dio->inode->i_sb); + } if (retval) { /* * We grab i_mutex only for reads so we don't have @@ -1322,7 +1371,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, retval, false); + retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE); } else BUG_ON(retval != -EIOCBQUEUED); |