From ec56b1f1fdc69599963574ce94cc5693d535dd64 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:18 +1000 Subject: xfs: mmap lock needs to be inside freeze protection Lock ordering for the new mmap lock needs to be: mmap_sem sb_start_pagefault i_mmap_lock page lock Right now xfs_vm_page_mkwrite gets this the wrong way around, While technically it cannot deadlock due to the current freeze ordering, it's still a landmine that might explode if we change anything in future. Hence we need to nest the locks correctly. Signed-off-by: Dave Chinner Reviewed-by: Jan Kara Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8121e75352ee..0b4e79fd8d05 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1487,15 +1487,20 @@ xfs_filemap_page_mkwrite( struct vm_fault *vmf) { struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + int ret; trace_xfs_filemap_page_mkwrite(ip); + sb_start_pagefault(VFS_I(ip)->i_sb); + file_update_time(vma->vm_file); xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + sb_end_pagefault(VFS_I(ip)->i_sb); - return error; + return block_page_mkwrite_return(ret); } const struct file_operations xfs_file_operations = { -- cgit v1.2.3 From 6b698edeeef00c127d73501b386590299f01327a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:53 +1000 Subject: xfs: add DAX file operations support Add the initial support for DAX file operations to XFS. This includes the necessary block allocation and mmap page fault hooks for DAX to function. Note that there are changes to the splice interfaces to ensure that for DAX splice avoids direct page cache manipulations and instead takes the DAX IO paths for read/write operations. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 118 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 49 deletions(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0b4e79fd8d05..a629dce4903e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -284,7 +284,7 @@ xfs_file_read_iter( if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -378,7 +378,11 @@ xfs_file_splice_read( trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + /* for dax, we need to avoid the page cache */ + if (IS_DAX(VFS_I(ip))) + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); + else + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -672,7 +676,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if ((pos | count) & target->bt_logical_sectormask) + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -758,8 +762,11 @@ xfs_file_dio_aio_write( out: xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ - ASSERT(ret < 0 || ret == count); + /* + * No fallback to buffered IO on errors for XFS. DAX can result in + * partial writes, but direct IO will either complete fully or fail. + */ + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); return ret; } @@ -842,7 +849,7 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1063,17 +1070,6 @@ xfs_file_readdir( return xfs_readdir(ip, ctx, bufsize); } -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - - file_accessed(filp); - return 0; -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1454,26 +1450,11 @@ xfs_file_llseek( * ordering of: * * mmap_sem (MM) - * i_mmap_lock (XFS - truncate serialisation) - * page_lock (MM) - * i_lock (XFS - extent map serialisation) + * sb_start_pagefault(vfs, freeze) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) */ -STATIC int -xfs_filemap_fault( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; - - trace_xfs_filemap_fault(ip); - - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = filemap_fault(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - - return error; -} /* * mmap()d file has taken write protection fault and is being made writable. We @@ -1486,21 +1467,66 @@ xfs_filemap_page_mkwrite( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + struct inode *inode = file_inode(vma->vm_file); int ret; - trace_xfs_filemap_page_mkwrite(ip); + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); - sb_start_pagefault(VFS_I(ip)->i_sb); + sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + + if (IS_DAX(inode)) { + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, + xfs_end_io_dax_write); + } else { + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = block_page_mkwrite_return(ret); + } + + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; +} + +STATIC int +xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file)); + int ret; - ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + trace_xfs_filemap_fault(ip); + + /* DAX can shortcut the normal fault path on write faults! */ + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip))) + return xfs_filemap_page_mkwrite(vma, vmf); + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + ret = filemap_fault(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - sb_end_pagefault(VFS_I(ip)->i_sb); - return block_page_mkwrite_return(ret); + return ret; +} + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + file_accessed(filp); + vma->vm_ops = &xfs_file_vm_ops; + if (IS_DAX(file_inode(filp))) + vma->vm_flags |= VM_MIXEDMAP; + return 0; } const struct file_operations xfs_file_operations = { @@ -1531,9 +1557,3 @@ const struct file_operations xfs_dir_file_operations = { #endif .fsync = xfs_dir_fsync, }; - -static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = xfs_filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = xfs_filemap_page_mkwrite, -}; -- cgit v1.2.3 From 4f69f578a87d39c20b1ff70005a125e4594c3de8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:19:08 +1000 Subject: xfs: add DAX block zeroing support Add initial support for DAX block zeroing operations to XFS. DAX cannot use buffered IO through the page cache for zeroing, nor do we need to issue IO for uncached block zeroing. In both cases, we can simply call out to the dax block zeroing function. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a629dce4903e..cfd9b4f5ad6e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -79,14 +79,15 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero + * xfs_iozero clears the specified range supplied via the page cache (except in + * the DAX case). Writes through the page cache will allocate blocks over holes, + * though the callers usually map the holes first and avoid them. If a block is + * not completely zeroed, then it will be read from disk before being partially + * zeroed. * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. + * In the DAX case, we can just directly write to the underlying pages. This + * will not allocate blocks, but will avoid holes and unwritten extents and so + * not do unnecessary work. */ int xfs_iozero( @@ -96,7 +97,8 @@ xfs_iozero( { struct page *page; struct address_space *mapping; - int status; + int status = 0; + mapping = VFS_I(ip)->i_mapping; do { @@ -108,20 +110,27 @@ xfs_iozero( if (bytes > count) bytes = count; - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; + if (IS_DAX(VFS_I(ip))) { + status = dax_zero_page_range(VFS_I(ip), pos, bytes, + xfs_get_blocks_direct); + if (status) + break; + } else { + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; - zero_user(page, offset, bytes); + zero_user(page, offset, bytes); - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ + status = pagecache_write_end(NULL, mapping, pos, bytes, + bytes, page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + status = 0; + } pos += bytes; count -= bytes; - status = 0; } while (count); return (-status); -- cgit v1.2.3 From 4906e21545814e4129595118287a2f1415483c0b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2015 13:47:56 +1000 Subject: xfs: remove the flags argument to xfs_trans_cancel xfs_trans_cancel takes two flags arguments: XFS_TRANS_RELEASE_LOG_RES and XFS_TRANS_ABORT. Both of them are a direct product of the transaction state, and can be deducted: - any dirty transaction needs XFS_TRANS_ABORT to be properly canceled, and XFS_TRANS_ABORT is a noop for a transaction that is not dirty. - any transaction with a permanent log reservation needs XFS_TRANS_RELEASE_LOG_RES to be properly canceled, and passing XFS_TRANS_RELEASE_LOG_RES for a transaction without a permanent log reservation is invalid. So just remove the flags argument and do the right thing. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8121e75352ee..46598b7bce86 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -138,7 +138,7 @@ xfs_update_prealloc_flags( tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } -- cgit v1.2.3 From 70393313dd0b26a6a79e2737b6dff1f1937b936d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2015 13:48:08 +1000 Subject: xfs: saner xfs_trans_commit interface The flags argument to xfs_trans_commit is not useful for most callers, as a commit of a transaction without a permanent log reservation must pass 0 here, and all callers for a transaction with a permanent log reservation except for xfs_trans_roll must pass XFS_TRANS_RELEASE_LOG_RES. So remove the flags argument from the public xfs_trans_commit interfaces, and introduce low-level __xfs_trans_commit variant just for xfs_trans_roll that regrants a log reservation instead of releasing it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/xfs_file.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 46598b7bce86..0dec85865ce4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -160,7 +160,7 @@ xfs_update_prealloc_flags( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (flags & XFS_PREALLOC_SYNC) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } /* -- cgit v1.2.3