-rw-r--r--  Documentation/filesystems/xfs.txt  | 123
-rw-r--r--  MAINTAINERS                        |   7
-rw-r--r--  fs/dax.c                           | 252
-rw-r--r--  fs/ext2/Kconfig                    |   1
-rw-r--r--  fs/ext2/ext2.h                     |   1
-rw-r--r--  fs/ext2/file.c                     |  76
-rw-r--r--  fs/ext2/inode.c                    | 100
-rw-r--r--  fs/internal.h                      |  11
-rw-r--r--  fs/iomap.c                         |  89
-rw-r--r--  fs/xfs/Makefile                    |   1
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c        | 325
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.h        |  35
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c          | 112
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h          |   8
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c           |  95
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h           |  10
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c          |  59
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h          |  28
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c          |  79
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c   |   2
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h     |  10
-rw-r--r--  fs/xfs/xfs_aops.c                  |  31
-rw-r--r--  fs/xfs/xfs_aops.h                  |   1
-rw-r--r--  fs/xfs/xfs_buf_item.c              |   9
-rw-r--r--  fs/xfs/xfs_file.c                  |  82
-rw-r--r--  fs/xfs/xfs_filestream.c            |   4
-rw-r--r--  fs/xfs/xfs_fsops.c                 |   2
-rw-r--r--  fs/xfs/xfs_icache.c                |  14
-rw-r--r--  fs/xfs/xfs_inode.h                 |   1
-rw-r--r--  fs/xfs/xfs_iomap.c                 | 494
-rw-r--r--  fs/xfs/xfs_iomap.h                 |   2
-rw-r--r--  fs/xfs/xfs_mount.h                 |  44
-rw-r--r--  fs/xfs/xfs_rmap_item.c             |  36
-rw-r--r--  fs/xfs/xfs_rmap_item.h             |   8
-rw-r--r--  fs/xfs/xfs_super.c                 |   5
-rw-r--r--  fs/xfs/xfs_sysfs.c                 |  47
-rw-r--r--  fs/xfs/xfs_trace.h                 |  75
-rw-r--r--  fs/xfs/xfs_trans.c                 |   3
-rw-r--r--  fs/xfs/xfs_trans_extfree.c         |   3
-rw-r--r--  fs/xfs/xfs_xattr.c                 |   1
-rw-r--r--  include/linux/dax.h                |   6
-rw-r--r--  include/linux/iomap.h              |   4
-rw-r--r--  mm/filemap.c                       |  14
43 files changed, 1694 insertions(+), 616 deletions(-)
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 8146e9fd5ffc..c2d44e6e117b 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -348,3 +348,126 @@ Removed Sysctls
---- -------
fs.xfs.xfsbufd_centisec v4.0
fs.xfs.age_buffer_centisecs v4.0
+
+
+Error handling
+==============
+
+XFS can act differently according to the type of error found during its
+operation. The implementation introduces the following concepts to the error
+handler:
+
+ -failure speed:
+ Defines how fast XFS should propagate an error upwards when a specific
+ error is found during the filesystem operation. It can propagate
+ immediately, after a defined number of retries, after a set time period,
+ or simply retry forever.
+
+ -error classes:
+ Specifies the subsystem the error configuration will apply to, such as
+ metadata IO or memory allocation. Different subsystems will have
+ different error handlers for which behaviour can be configured.
+
+ -error handlers:
+ Defines the behavior for a specific error.
+
+The filesystem behavior during an error can be set via sysfs files. Each
+error handler works independently - the first condition met by an error handler
+for a specific class will cause the error to be propagated rather than reset and
+retried.
+
+The action taken by the filesystem when the error is propagated is context
+dependent - it may cause a shut down in the case of an unrecoverable error,
+it may be reported back to userspace, or it may even be ignored because
+there's nothing useful we can do with the error or anyone we can report it to (e.g.
+during unmount).
+
+The configuration files are organized into the following hierarchy for each
+mounted filesystem:
+
+ /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+Where:
+ <dev>
+ The short device name of the mounted filesystem. This is the same device
+ name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
+
+ <class>
+ The subsystem the error configuration belongs to. As of 4.9, the defined
+ classes are:
+
+ - "metadata": applies to metadata buffer write IO
+
+ <error>
+ The individual error handler configurations.
+
+
+Each filesystem has "global" error configuration options defined in its top
+level directory:
+
+ /sys/fs/xfs/<dev>/error/
+
+ fail_at_unmount (Min: 0 Default: 1 Max: 1)
+ Defines the filesystem error behavior at unmount time.
+
+ If set to a value of 1, XFS will override all other error configurations
+ during unmount and replace them with "immediate fail" characteristics.
+ i.e. no retries, no retry timeout. This will always allow unmount to
+ succeed when there are persistent errors present.
+
+ If set to 0, the configured retry behaviour will continue until all
+ retries and/or timeouts have been exhausted. This will delay unmount
+ completion when there are persistent errors, and it may prevent the
+ filesystem from ever unmounting fully in the case of "retry forever"
+ handler configurations.
+
+ Note: there is no guarantee that fail_at_unmount can be set whilst an
+ unmount is in progress. It is possible that the sysfs entries are
+ removed by the unmounting filesystem before a "retry forever" error
+ handler configuration causes unmount to hang, and hence the filesystem
+ must be configured appropriately before unmount begins to prevent
+ unmount hangs.
+
+Each filesystem has specific error class handlers that define the error
+propagation behaviour for specific errors. There is also a "default" error
+handler defined, which defines the behaviour for all errors that don't have
+specific handlers defined. Where multiple retry constraints are configured for
+a single error, the first retry configuration that expires will cause the error
+to be propagated. The handler configurations are found in the directory:
+
+ /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+ max_retries (Min: -1 Default: Varies Max: INTMAX)
+ Defines the allowed number of retries of a specific error before
+ the filesystem will propagate the error. The retry count for a given
+ error context (e.g. a specific metadata buffer) is reset every time
+ there is a successful completion of the operation.
+
+ Setting the value to "-1" will cause XFS to retry forever for this
+ specific error.
+
+ Setting the value to "0" will cause XFS to fail immediately when the
+ specific error is reported.
+
+ Setting the value to "N" (where 0 < N < Max) will make XFS retry the
+ operation "N" times before propagating the error.
+
+ retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day)
+ Defines the amount of time (in seconds) that the filesystem is
+ allowed to retry its operations when the specific error is
+ found.
+
+ Setting the value to "-1" will allow XFS to retry forever for this
+ specific error.
+
+ Setting the value to "0" will cause XFS to fail immediately when the
+ specific error is reported.
+
+ Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
+ operation for up to "N" seconds before propagating the error.
+
+Note: The default behaviour for a specific error handler is dependent on both
+the class and error context. For example, the default values for
+"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
+to "fail immediately" behaviour. This is done because ENODEV is a fatal,
+unrecoverable error no matter how many times the metadata IO is retried.
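
As a concrete illustration of the sysfs interface documented above, the userspace
sketch below writes the error-handler knobs. It is illustration only: the device
name "sda1" and the "EIO" handler directory are hypothetical placeholders, and the
handlers actually present depend on the running kernel.

#include <stdio.h>

/* Hedged sketch: configure XFS error handlers via sysfs (example names only). */
static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Always let unmount override "retry forever" configurations. */
	write_knob("/sys/fs/xfs/sda1/error/fail_at_unmount", "1");

	/* Propagate a metadata EIO after 5 retries or 300 seconds,
	 * whichever constraint expires first. */
	write_knob("/sys/fs/xfs/sda1/error/metadata/EIO/max_retries", "5");
	write_knob("/sys/fs/xfs/sda1/error/metadata/EIO/retry_timeout_seconds", "300");
	return 0;
}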
diff --git a/MAINTAINERS b/MAINTAINERS
index a306795a7450..3246c85e7d95 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12915,11 +12915,10 @@ F: arch/x86/xen/*swiotlb*
F: drivers/xen/*swiotlb*
XFS FILESYSTEM
-P: Silicon Graphics Inc
M: Dave Chinner <david@fromorbit.com>
-M: xfs@oss.sgi.com
-L: xfs@oss.sgi.com
-W: http://oss.sgi.com/projects/xfs
+M: linux-xfs@vger.kernel.org
+L: linux-xfs@vger.kernel.org
+W: http://xfs.org/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git
S: Supported
F: Documentation/filesystems/xfs.txt
diff --git a/fs/dax.c b/fs/dax.c
index 993dc6fe0416..cc025f82ef07 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
/*
* We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
return VM_FAULT_LOCKED;
}
-static int copy_user_bh(struct page *to, struct inode *inode,
- struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+ struct page *to, unsigned long vaddr)
{
struct blk_dax_ctl dax = {
- .sector = to_sector(bh, inode),
- .size = bh->b_size,
+ .sector = sector,
+ .size = size,
};
- struct block_device *bdev = bh->b_bdev;
void *vto;
if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping,
- struct buffer_head *bh, void **entryp,
- struct vm_area_struct *vma, struct vm_fault *vmf)
+ struct block_device *bdev, sector_t sector, size_t size,
+ void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- struct block_device *bdev = bh->b_bdev;
struct blk_dax_ctl dax = {
- .sector = to_sector(bh, mapping->host),
- .size = bh->b_size,
+ .sector = sector,
+ .size = size,
};
void *ret;
void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (vmf->cow_page) {
struct page *new_page = vmf->cow_page;
if (buffer_written(&bh))
- error = copy_user_bh(new_page, inode, &bh, vaddr);
+ error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+ bh.b_size, new_page, vaddr);
else
clear_user_highpage(new_page, vaddr);
if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
/* Filesystem should not return unwritten buffers to us! */
WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
- error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+ error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+ bh.b_size, &entry, vma, vmf);
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct iov_iter *iter = data;
+ loff_t end = pos + length, done = 0;
+ ssize_t ret = 0;
+
+ if (iov_iter_rw(iter) == READ) {
+ end = min(end, i_size_read(inode));
+ if (pos >= end)
+ return 0;
+
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+ return iov_iter_zero(min(length, end - pos), iter);
+ }
+
+ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+ return -EIO;
+
+ while (pos < end) {
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ struct blk_dax_ctl dax = { 0 };
+ ssize_t map_len;
+
+ dax.sector = iomap->blkno +
+ (((pos & PAGE_MASK) - iomap->offset) >> 9);
+ dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+ map_len = dax_map_atomic(iomap->bdev, &dax);
+ if (map_len < 0) {
+ ret = map_len;
+ break;
+ }
+
+ dax.addr += offset;
+ map_len -= offset;
+ if (map_len > end - pos)
+ map_len = end - pos;
+
+ if (iov_iter_rw(iter) == WRITE)
+ map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+ else
+ map_len = copy_to_iter(dax.addr, map_len, iter);
+ dax_unmap_atomic(iomap->bdev, &dax);
+ if (map_len <= 0) {
+ ret = map_len ? map_len : -EFAULT;
+ break;
+ }
+
+ pos += map_len;
+ length -= map_len;
+ done += map_len;
+ }
+
+ return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb: The control block for this I/O
+ * @iter: The addresses to do I/O from or to
+ * @ops: iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory. The callers needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+ struct iomap_ops *ops)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+ unsigned flags = 0;
+
+ if (iov_iter_rw(iter) == WRITE)
+ flags |= IOMAP_WRITE;
+
+ /*
+ * Yes, even DAX files can have page cache attached to them: A zeroed
+ * page is inserted into the pagecache when we have to serve a write
+ * fault on a hole. It should never be dirtied and can simply be
+ * dropped from the pagecache once we get real data for the page.
+ *
+ * XXX: This is racy against mmap, and there's nothing we can do about
+ * it. We'll eventually need to shift this down even further so that
+ * we can check if we allocated blocks over a hole first.
+ */
+ if (mapping->nrpages) {
+ ret = invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT,
+ (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ }
+
+ while (iov_iter_count(iter)) {
+ ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+ iter, iomap_dax_actor);
+ if (ret <= 0)
+ break;
+ pos += ret;
+ done += ret;
+ }
+
+ iocb->ki_pos += done;
+ return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+ sector_t sector;
+ struct iomap iomap = { 0 };
+ unsigned flags = 0;
+ int error, major = 0;
+ void *entry;
+
+ /*
+ * Check whether offset isn't beyond end of file now. Caller is supposed
+ * to hold locks serializing us with truncate / punch hole so this is
+ * a reliable test.
+ */
+ if (pos >= i_size_read(inode))
+ return VM_FAULT_SIGBUS;
+
+ entry = grab_mapping_entry(mapping, vmf->pgoff);
+ if (IS_ERR(entry)) {
+ error = PTR_ERR(entry);
+ goto out;
+ }
+
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+ flags |= IOMAP_WRITE;
+
+ /*
+ * Note that we don't bother to use iomap_apply here: DAX required
+ * the file system block size to be equal the page size, which means
+ * that we never have to deal with more than a single extent here.
+ */
+ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+ if (error)
+ goto unlock_entry;
+ if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+ error = -EIO; /* fs corruption? */
+ goto unlock_entry;
+ }
+
+ sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+ if (vmf->cow_page) {
+ switch (iomap.type) {
+ case IOMAP_HOLE:
+ case IOMAP_UNWRITTEN:
+ clear_user_highpage(vmf->cow_page, vaddr);
+ break;
+ case IOMAP_MAPPED:
+ error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+ vmf->cow_page, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ }
+
+ if (error)
+ goto unlock_entry;
+ if (!radix_tree_exceptional_entry(entry)) {
+ vmf->page = entry;
+ return VM_FAULT_LOCKED;
+ }
+ vmf->entry = entry;
+ return VM_FAULT_DAX_LOCKED;
+ }
+
+ switch (iomap.type) {
+ case IOMAP_MAPPED:
+ if (iomap.flags & IOMAP_F_NEW) {
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ major = VM_FAULT_MAJOR;
+ }
+ error = dax_insert_mapping(mapping, iomap.bdev, sector,
+ PAGE_SIZE, &entry, vma, vmf);
+ break;
+ case IOMAP_UNWRITTEN:
+ case IOMAP_HOLE:
+ if (!(vmf->flags & FAULT_FLAG_WRITE))
+ return dax_load_hole(mapping, entry, vmf);
+ /*FALLTHRU*/
+ default:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ }
+
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM | major;
+ /* -EBUSY is fine, somebody else faulted on the same PTE */
+ if (error < 0 && error != -EBUSY)
+ return VM_FAULT_SIGBUS | major;
+ return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
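
The sector arithmetic shared by iomap_dax_actor() and iomap_dax_fault() above
converts a file position into a 512-byte sector on the backing device. The
standalone sketch below uses invented values (4k pages, an extent mapped at file
offset 0 starting at sector 2048) purely to show the calculation:

#include <stdio.h>

/* Invented example values; only the arithmetic mirrors the code above. */
int main(void)
{
	unsigned long long blkno = 2048;	/* iomap->blkno: extent start, in 512-byte sectors */
	unsigned long long map_off = 0;		/* iomap->offset: file offset the extent begins at */
	unsigned long long pos = 12288;		/* file position being accessed */
	unsigned long long page_mask = ~4095ULL;	/* 4k pages assumed */
	unsigned long long sector;

	sector = blkno + (((pos & page_mask) - map_off) >> 9);
	printf("sector %llu\n", sector);	/* 2048 + (12288 >> 9) = 2072 */
	return 0;
}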
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index c634874e12d9..36bea5adcaba 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,5 +1,6 @@
config EXT2_FS
tristate "Second extended fs support"
+ select FS_IOMAP if FS_DAX
help
Ext2 is a standard Linux file system for hard disks.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 06af2f92226c..37e2be784ac7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations;
/* inode.c */
extern const struct address_space_operations ext2_aops;
extern const struct address_space_operations ext2_nobh_aops;
+extern struct iomap_ops ext2_iomap_ops;
/* namei.c */
extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5efeefe17abb..423cc01c9d41 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -22,11 +22,59 @@
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
+#include <linux/iomap.h>
+#include <linux/uio.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
#ifdef CONFIG_FS_DAX
+static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ ssize_t ret;
+
+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+ inode_lock_shared(inode);
+ ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+
+static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out_unlock;
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out_unlock;
+ ret = file_update_time(file);
+ if (ret)
+ goto out_unlock;
+
+ ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+ if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+ i_size_write(inode, iocb->ki_pos);
+ mark_inode_dirty(inode);
+ }
+
+out_unlock:
+ inode_unlock(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+
/*
* The lock ordering for ext2 DAX fault paths is:
*
@@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = dax_fault(vma, vmf, ext2_get_block);
+ ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
@@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
}
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the ext2 filesystem.
- */
+static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(iocb->ki_filp->f_mapping->host))
+ return ext2_dax_read_iter(iocb, to);
+#endif
+ return generic_file_read_iter(iocb, to);
+}
+
+static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(iocb->ki_filp->f_mapping->host))
+ return ext2_dax_write_iter(iocb, from);
+#endif
+ return generic_file_write_iter(iocb, from);
+}
+
const struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
+ .read_iter = ext2_file_read_iter,
+ .write_iter = ext2_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index d5c7d09919f3..c7dbb4661119 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/fiemap.h>
+#include <linux/iomap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include "ext2.h"
@@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode,
*/
static int ext2_get_blocks(struct inode *inode,
sector_t iblock, unsigned long maxblocks,
- struct buffer_head *bh_result,
+ u32 *bno, bool *new, bool *boundary,
int create)
{
int err = -EIO;
@@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode,
/* Simplest case - block found, no allocation needed */
if (!partial) {
first_block = le32_to_cpu(chain[depth - 1].key);
- clear_buffer_new(bh_result); /* What's this do? */
count++;
/*map more blocks*/
while (count < maxblocks && count <= blocks_to_boundary) {
@@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode,
mutex_unlock(&ei->truncate_mutex);
if (err)
goto cleanup;
- clear_buffer_new(bh_result);
goto got_it;
}
}
@@ -745,15 +744,16 @@ static int ext2_get_blocks(struct inode *inode,
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
}
- } else
- set_buffer_new(bh_result);
+ } else {
+ *new = true;
+ }
ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
mutex_unlock(&ei->truncate_mutex);
got_it:
- map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+ *bno = le32_to_cpu(chain[depth-1].key);
if (count > blocks_to_boundary)
- set_buffer_boundary(bh_result);
+ *boundary = true;
err = count;
/* Clean up and exit */
partial = chain + depth - 1; /* the whole chain */
@@ -765,19 +765,82 @@ cleanup:
return err;
}
-int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+int ext2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- int ret = ext2_get_blocks(inode, iblock, max_blocks,
- bh_result, create);
- if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- ret = 0;
+ bool new = false, boundary = false;
+ u32 bno;
+ int ret;
+
+ ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary,
+ create);
+ if (ret <= 0)
+ return ret;
+
+ map_bh(bh_result, inode->i_sb, bno);
+ bh_result->b_size = (ret << inode->i_blkbits);
+ if (new)
+ set_buffer_new(bh_result);
+ if (boundary)
+ set_buffer_boundary(bh_result);
+ return 0;
+
+}
+
+#ifdef CONFIG_FS_DAX
+static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap)
+{
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned long first_block = offset >> blkbits;
+ unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+ bool new = false, boundary = false;
+ u32 bno;
+ int ret;
+
+ ret = ext2_get_blocks(inode, first_block, max_blocks,
+ &bno, &new, &boundary, flags & IOMAP_WRITE);
+ if (ret < 0)
+ return ret;
+
+ iomap->flags = 0;
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = (u64)first_block << blkbits;
+
+ if (ret == 0) {
+ iomap->type = IOMAP_HOLE;
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->length = 1 << blkbits;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->blkno = (sector_t)bno << (blkbits - 9);
+ iomap->length = (u64)ret << blkbits;
+ iomap->flags |= IOMAP_F_MERGED;
}
- return ret;
+ if (new)
+ iomap->flags |= IOMAP_F_NEW;
+ return 0;
}
+static int
+ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap)
+{
+ if (iomap->type == IOMAP_MAPPED &&
+ written < length &&
+ (flags & IOMAP_WRITE))
+ ext2_write_failed(inode->i_mapping, offset + length);
+ return 0;
+}
+
+struct iomap_ops ext2_iomap_ops = {
+ .iomap_begin = ext2_iomap_begin,
+ .iomap_end = ext2_iomap_end,
+};
+#endif /* CONFIG_FS_DAX */
+
int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -863,11 +926,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t offset = iocb->ki_pos;
ssize_t ret;
- if (IS_DAX(inode))
- ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
- DIO_LOCKING);
- else
- ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
+ if (WARN_ON_ONCE(IS_DAX(inode)))
+ return -EIO;
+
+ ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
if (ret < 0 && iov_iter_rw(iter) == WRITE)
ext2_write_failed(mapping, offset + count);
return ret;
diff --git a/fs/internal.h b/fs/internal.h
index ba0737649d4a..859178692ce4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
struct super_block;
struct file_system_type;
struct iomap;
+struct iomap_ops;
struct linux_binprm;
struct path;
struct mount;
@@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations;
extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
unsigned long arg);
extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/*
+ * iomap support:
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+ void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap_ops *ops, void *data,
+ iomap_actor_t actor);
diff --git a/fs/iomap.c b/fs/iomap.c
index 706270f21b35..013d1d36fbbf 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -27,9 +27,6 @@
#include <linux/dax.h>
#include "internal.h"
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
- void *data, struct iomap *iomap);
-
/*
* Execute a iomap write on a segment of the mapping that spans a
* contiguous range of pages that have identical block mapping state.
@@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
* resources they require in the iomap_begin call, and release them in the
* iomap_end call.
*/
-static loff_t
+loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
@@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+static struct page *
+__iomap_read_page(struct inode *inode, loff_t offset)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+
+ page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ return page;
+}
+
+static loff_t
+iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ long status = 0;
+ ssize_t written = 0;
+
+ do {
+ struct page *page, *rpage;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+
+ offset = (pos & (PAGE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
+
+ rpage = __iomap_read_page(inode, pos);
+ if (IS_ERR(rpage))
+ return PTR_ERR(rpage);
+
+ status = iomap_write_begin(inode, pos, bytes,
+ AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
+ &page, iomap);
+ put_page(rpage);
+ if (unlikely(status))
+ return status;
+
+ WARN_ON_ONCE(!PageUptodate(page));
+
+ status = iomap_write_end(inode, pos, bytes, bytes, page);
+ if (unlikely(status <= 0)) {
+ if (WARN_ON_ONCE(status == 0))
+ return -EIO;
+ return status;
+ }
+
+ cond_resched();
+
+ pos += status;
+ written += status;
+ length -= status;
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ } while (length);
+
+ return written;
+}
+
+int
+iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+ struct iomap_ops *ops)
+{
+ loff_t ret;
+
+ while (len) {
+ ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
+ iomap_dirty_actor);
+ if (ret <= 0)
+ return ret;
+ pos += ret;
+ len -= ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_file_dirty);
+
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
unsigned bytes, struct iomap *iomap)
{
@@ -430,6 +509,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
if (iomap->flags & IOMAP_F_MERGED)
flags |= FIEMAP_EXTENT_MERGED;
+ if (iomap->flags & IOMAP_F_SHARED)
+ flags |= FIEMAP_EXTENT_SHARED;
return fiemap_fill_next_extent(fi, iomap->offset,
iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index fc593c869493..584e87e11cb6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -52,6 +52,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_inode_fork.o \
xfs_inode_buf.o \
xfs_log_rlimit.o \
+ xfs_ag_resv.o \
xfs_rmap.o \
xfs_rmap_btree.o \
xfs_sb.o \
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644
index 000000000000..e3ae0f2b4294
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC because errors during a btree
+ * split cause the filesystem to go offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG. However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion. Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of three changes to the allocator's
+ * behavior: First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken. Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting. Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations. Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions. In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function. It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction. The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks. Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
+
+/*
+ * Are we critically low on blocks? For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_ag_resv_critical(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t avail;
+ xfs_extlen_t orig;
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved;
+ orig = pag->pag_meta_resv.ar_asked;
+ break;
+ case XFS_AG_RESV_AGFL:
+ avail = pag->pagf_freeblks + pag->pagf_flcount -
+ pag->pag_meta_resv.ar_reserved;
+ orig = pag->pag_agfl_resv.ar_asked;
+ break;
+ default:
+ ASSERT(0);
+ return false;
+ }
+
+ trace_xfs_ag_resv_critical(pag, type, avail);
+
+ /* Critically low if less than 10% or max btree height remains. */
+ return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ xfs_extlen_t len;
+
+ len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved;
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ len -= xfs_perag_resv(pag, type)->ar_reserved;
+ break;
+ case XFS_AG_RESV_NONE:
+ /* empty */
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ trace_xfs_ag_resv_needed(pag, type, len);
+
+ return len;
+}
+
+/* Clean out a reservation */
+static int
+__xfs_ag_resv_free(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t oldresv;
+ int error;
+
+ trace_xfs_ag_resv_free(pag, type, 0);
+
+ resv = xfs_perag_resv(pag, type);
+ pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ /*
+ * AGFL blocks are always considered "free", so whatever
+ * was reserved at mount time must be given back at umount.
+ */
+ if (type == XFS_AG_RESV_AGFL)
+ oldresv = resv->ar_orig_reserved;
+ else
+ oldresv = resv->ar_reserved;
+ error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+ resv->ar_reserved = 0;
+ resv->ar_asked = 0;
+
+ if (error)
+ trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/* Free a per-AG reservation. */
+int
+xfs_ag_resv_free(
+ struct xfs_perag *pag)
+{
+ int error;
+ int err2;
+
+ error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL);
+ err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+ if (err2 && !error)
+ error = err2;
+ return error;
+}
+
+static int
+__xfs_ag_resv_init(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ xfs_extlen_t ask,
+ xfs_extlen_t used)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_ag_resv *resv;
+ int error;
+
+ resv = xfs_perag_resv(pag, type);
+ if (used > ask)
+ ask = used;
+ resv->ar_asked = ask;
+ resv->ar_reserved = resv->ar_orig_reserved = ask - used;
+ mp->m_ag_max_usable -= ask;
+
+ trace_xfs_ag_resv_init(pag, type, ask);
+
+ error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
+ if (error)
+ trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+ error, _RET_IP_);
+
+ return error;
+}
+
+/* Create a per-AG block reservation. */
+int
+xfs_ag_resv_init(
+ struct xfs_perag *pag)
+{
+ xfs_extlen_t ask;
+ xfs_extlen_t used;
+ int error = 0;
+
+ /* Create the metadata reservation. */
+ if (pag->pag_meta_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error)
+ goto out;
+ }
+
+ /* Create the AGFL metadata reservation */
+ if (pag->pag_agfl_resv.ar_asked == 0) {
+ ask = used = 0;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
+ if (error)
+ goto out;
+ }
+
+out:
+ return error;
+}
+
+/* Allocate a block from the reservation. */
+void
+xfs_ag_resv_alloc_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args)
+{
+ struct xfs_ag_resv *resv;
+ xfs_extlen_t len;
+ uint field;
+
+ trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ /* fall through */
+ case XFS_AG_RESV_NONE:
+ field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+ XFS_TRANS_SB_FDBLOCKS;
+ xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+ return;
+ }
+
+ len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+ resv->ar_reserved -= len;
+ if (type == XFS_AG_RESV_AGFL)
+ return;
+ /* Allocations of reserved blocks only need on-disk sb updates... */
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+ /* ...but non-reserved blocks need in-core and on-disk updates. */
+ if (args->len > len)
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+ -((int64_t)args->len - len));
+}
+
+/* Free a block to the reservation. */
+void
+xfs_ag_resv_free_extent(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type,
+ struct xfs_trans *tp,
+ xfs_extlen_t len)
+{
+ xfs_extlen_t leftover;
+ struct xfs_ag_resv *resv;
+
+ trace_xfs_ag_resv_free_extent(pag, type, len);
+
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ case XFS_AG_RESV_AGFL:
+ resv = xfs_perag_resv(pag, type);
+ break;
+ default:
+ ASSERT(0);
+ /* fall through */
+ case XFS_AG_RESV_NONE:
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ return;
+ }
+
+ leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+ resv->ar_reserved += leftover;
+ if (type == XFS_AG_RESV_AGFL)
+ return;
+ /* Freeing into the reserved pool only requires on-disk update... */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
+ /* ...but freeing beyond that requires in-core and on-disk update. */
+ if (len > leftover)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
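
To make the ask/used/reserved bookkeeping above concrete, here is a hedged
standalone sketch with invented numbers; it mirrors the arithmetic of
__xfs_ag_resv_init(), xfs_ag_resv_alloc_extent() and xfs_ag_resv_free_extent()
for illustration only.

#include <stdio.h>

/* Invented numbers; only the arithmetic follows the functions above. */
struct resv {
	unsigned int ar_asked;		/* blocks the metadata structure wants held back */
	unsigned int ar_reserved;	/* blocks currently held back from allocation */
};

int main(void)
{
	struct resv r;
	unsigned int used = 20, alloc_len = 30, free_len = 50, len;

	/* init: ask for 100 blocks, 20 already consumed by the structure */
	r.ar_asked = 100;
	r.ar_reserved = r.ar_asked - used;		/* 80 */

	/* alloc_extent: allocations are satisfied from the reservation first */
	len = alloc_len < r.ar_reserved ? alloc_len : r.ar_reserved;
	r.ar_reserved -= len;				/* 80 - 30 = 50 */

	/* free_extent: freed blocks refill the reservation, capped at the ask */
	len = free_len < r.ar_asked - r.ar_reserved ?
			free_len : r.ar_asked - r.ar_reserved;
	r.ar_reserved += len;				/* 50 + 50 = 100 */

	printf("reserved: %u of %u asked\n", r.ar_reserved, r.ar_asked);
	return 0;
}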
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644
index 000000000000..8d6c687deef3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_AG_RESV_H__
+#define __XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+ enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+ struct xfs_trans *tp, xfs_extlen_t len);
+
+#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 1d530c253c0e..ca75dc90ebe0 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -37,6 +37,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_ag_resv.h"
struct workqueue_struct *xfs_alloc_wq;
@@ -74,14 +75,8 @@ xfs_prealloc_blocks(
* extents need to be actually allocated. To get around this, we explicitly set
* aside a few blocks which will not be reserved in delayed allocation.
*
- * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
- * and 4 more to handle a potential split of the file's bmap btree.
- *
- * When rmap is enabled, we must also be able to handle two rmap btree inserts
- * to record both the file data extent and a new bmbt block. The bmbt block
- * might not be in the same AG as the file data extent. In the worst case
- * the bmap btree splits multiple levels and all the new blocks come from
- * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
+ * potential split of the file's bmap btree.
*/
unsigned int
xfs_alloc_set_aside(
@@ -90,8 +85,6 @@ xfs_alloc_set_aside(
unsigned int blocks;
blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
return blocks;
}
@@ -681,12 +674,29 @@ xfs_alloc_ag_vextent(
xfs_alloc_arg_t *args) /* argument structure for allocation */
{
int error=0;
+ xfs_extlen_t reservation;
+ xfs_extlen_t oldmax;
ASSERT(args->minlen > 0);
ASSERT(args->maxlen > 0);
ASSERT(args->minlen <= args->maxlen);
ASSERT(args->mod < args->prod);
ASSERT(args->alignment > 0);
+
+ /*
+ * Clamp maxlen to the amount of free space minus any reservations
+ * that have been made.
+ */
+ oldmax = args->maxlen;
+ reservation = xfs_ag_resv_needed(args->pag, args->resv);
+ if (args->maxlen > args->pag->pagf_freeblks - reservation)
+ args->maxlen = args->pag->pagf_freeblks - reservation;
+ if (args->maxlen == 0) {
+ args->agbno = NULLAGBLOCK;
+ args->maxlen = oldmax;
+ return 0;
+ }
+
/*
* Branch to correct routine based on the type.
*/
@@ -706,12 +716,14 @@ xfs_alloc_ag_vextent(
/* NOTREACHED */
}
+ args->maxlen = oldmax;
+
if (error || args->agbno == NULLAGBLOCK)
return error;
ASSERT(args->len >= args->minlen);
ASSERT(args->len <= args->maxlen);
- ASSERT(!args->wasfromfl || !args->isfl);
+ ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
ASSERT(args->agbno % args->alignment == 0);
/* if not file data, insert new block into the reverse map btree */
@@ -733,12 +745,7 @@ xfs_alloc_ag_vextent(
args->agbno, args->len));
}
- if (!args->isfl) {
- xfs_trans_mod_sb(args->tp, args->wasdel ?
- XFS_TRANS_SB_RES_FDBLOCKS :
- XFS_TRANS_SB_FDBLOCKS,
- -((long)(args->len)));
- }
+ xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
XFS_STATS_INC(args->mp, xs_allocx);
XFS_STATS_ADD(args->mp, xs_allocb, args->len);
@@ -1584,6 +1591,7 @@ xfs_alloc_ag_vextent_small(
int *stat) /* status: 0-freelist, 1-normal/none */
{
struct xfs_owner_info oinfo;
+ struct xfs_perag *pag;
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
@@ -1601,7 +1609,8 @@ xfs_alloc_ag_vextent_small(
* to respect minleft even when pulling from the
* freelist.
*/
- else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+ else if (args->minlen == 1 && args->alignment == 1 &&
+ args->resv != XFS_AG_RESV_AGFL &&
(be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
> args->minleft)) {
error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1630,13 +1639,18 @@ xfs_alloc_ag_vextent_small(
/*
* If we're feeding an AGFL block to something that
* doesn't live in the free space, we need to clear
- * out the OWN_AG rmap.
+ * out the OWN_AG rmap and add the block back to
+ * the AGFL per-AG reservation.
*/
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
error = xfs_rmap_free(args->tp, args->agbp, args->agno,
fbno, 1, &oinfo);
if (error)
goto error0;
+ pag = xfs_perag_get(args->mp, args->agno);
+ xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
+ args->tp, 1);
+ xfs_perag_put(pag);
*stat = 0;
return 0;
@@ -1684,7 +1698,7 @@ xfs_free_ag_extent(
xfs_agblock_t bno,
xfs_extlen_t len,
struct xfs_owner_info *oinfo,
- int isfl)
+ enum xfs_ag_resv_type type)
{
xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
@@ -1912,21 +1926,22 @@ xfs_free_ag_extent(
*/
pag = xfs_perag_get(mp, agno);
error = xfs_alloc_update_counters(tp, pag, agbp, len);
+ xfs_ag_resv_free_extent(pag, type, tp, len);
xfs_perag_put(pag);
if (error)
goto error0;
- if (!isfl)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
XFS_STATS_INC(mp, xs_freex);
XFS_STATS_ADD(mp, xs_freeb, len);
- trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+ trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+ haveleft, haveright);
return 0;
error0:
- trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+ trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+ -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -1951,21 +1966,43 @@ xfs_alloc_compute_maxlevels(
}
/*
- * Find the length of the longest extent in an AG.
+ * Find the length of the longest extent in an AG. The 'need' parameter
+ * specifies how much space we're going to need for the AGFL and the
+ * 'reserved' parameter tells us how many blocks in this AG are reserved for
+ * other callers.
*/
xfs_extlen_t
xfs_alloc_longest_free_extent(
struct xfs_mount *mp,
struct xfs_perag *pag,
- xfs_extlen_t need)
+ xfs_extlen_t need,
+ xfs_extlen_t reserved)
{
xfs_extlen_t delta = 0;
+ /*
+ * If the AGFL needs a recharge, we'll have to subtract that from the
+ * longest extent.
+ */
if (need > pag->pagf_flcount)
delta = need - pag->pagf_flcount;
+ /*
+ * If we cannot maintain others' reservations with space from the
+ * not-longest freesp extents, we'll have to subtract /that/ from
+ * the longest extent too.
+ */
+ if (pag->pagf_freeblks - pag->pagf_longest < reserved)
+ delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
+
+ /*
+ * If the longest extent is long enough to satisfy all the
+ * reservations and AGFL rules in place, we can return this extent.
+ */
if (pag->pagf_longest > delta)
return pag->pagf_longest - delta;
+
+ /* Otherwise, let the caller try for 1 block if there's space. */
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
@@ -2005,20 +2042,24 @@ xfs_alloc_space_available(
{
struct xfs_perag *pag = args->pag;
xfs_extlen_t longest;
+ xfs_extlen_t reservation; /* blocks that are still reserved */
int available;
if (flags & XFS_ALLOC_FLAG_FREEING)
return true;
+ reservation = xfs_ag_resv_needed(pag, args->resv);
+
/* do we have enough contiguous free space for the allocation? */
- longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+ longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
+ reservation);
if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
return false;
- /* do have enough free space remaining for the allocation? */
+ /* do we have enough free space remaining for the allocation? */
available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
- min_free - args->total);
- if (available < (int)args->minleft)
+ reservation - min_free - args->total);
+ if (available < (int)args->minleft || available <= 0)
return false;
return true;
@@ -2125,7 +2166,7 @@ xfs_alloc_fix_freelist(
if (error)
goto out_agbp_relse;
error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
- &targs.oinfo, 1);
+ &targs.oinfo, XFS_AG_RESV_AGFL);
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
@@ -2136,7 +2177,7 @@ xfs_alloc_fix_freelist(
targs.mp = mp;
targs.agbp = agbp;
targs.agno = args->agno;
- targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+ targs.alignment = targs.minlen = targs.prod = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
@@ -2147,6 +2188,7 @@ xfs_alloc_fix_freelist(
while (pag->pagf_flcount < need) {
targs.agbno = 0;
targs.maxlen = need - pag->pagf_flcount;
+ targs.resv = XFS_AG_RESV_AGFL;
/* Allocate as many blocks as possible at once. */
error = xfs_alloc_ag_vextent(&targs);
@@ -2826,7 +2868,8 @@ xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo) /* extent owner */
+ struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type) /* block reservation type */
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
@@ -2835,6 +2878,7 @@ xfs_free_extent(
int error;
ASSERT(len != 0);
+ ASSERT(type != XFS_AG_RESV_AGFL);
if (XFS_TEST_ERROR(false, mp,
XFS_ERRTAG_FREE_EXTENT,
@@ -2852,7 +2896,7 @@ xfs_free_extent(
agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
err);
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
if (error)
goto err;
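
A worked example of the reworked xfs_alloc_longest_free_extent() clamping may
help; the sketch below restates its logic with invented per-AG counters and is
illustration only, not part of the patch.

#include <stdio.h>

/* Invented counters; the logic mirrors xfs_alloc_longest_free_extent() above. */
static unsigned int longest_usable(unsigned int pagf_longest,
		unsigned int pagf_freeblks, unsigned int pagf_flcount,
		unsigned int need, unsigned int reserved)
{
	unsigned int delta = 0;

	/* Recharging the AGFL may eat into the longest extent. */
	if (need > pagf_flcount)
		delta = need - pagf_flcount;

	/* If the other free extents cannot cover the reservation, the
	 * longest extent has to cover the remainder too. */
	if (pagf_freeblks - pagf_longest < reserved)
		delta += reserved - (pagf_freeblks - pagf_longest);

	if (pagf_longest > delta)
		return pagf_longest - delta;

	/* Otherwise the caller may still try for a single block. */
	return pagf_flcount > 0 || pagf_longest > 0;
}

int main(void)
{
	/* longest 200, free 250, flcount 4, need 8, reserved 80:
	 * delta = 4 + (80 - 50) = 34, so 166 blocks remain usable. */
	printf("%u\n", longest_usable(200, 250, 4, 8, 80));
	return 0;
}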
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 7fd8eafd9abe..7c404a6b0ae3 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -88,9 +88,9 @@ typedef struct xfs_alloc_arg {
int datatype; /* mask defining data type treatment */
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
- char isfl; /* set if is freelist blocks - !acctg */
xfs_fsblock_t firstblock; /* io first block allocated */
struct xfs_owner_info oinfo; /* owner of blocks being allocated */
+ enum xfs_ag_resv_type resv; /* block reservation to use */
} xfs_alloc_arg_t;
/*
@@ -119,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
- struct xfs_perag *pag, xfs_extlen_t need);
+ struct xfs_perag *pag, xfs_extlen_t need,
+ xfs_extlen_t reserved);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
@@ -197,7 +198,8 @@ xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo);/* extent owner */
+ struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type); /* block reservation type */
int /* error */
xfs_alloc_lookup_ge(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 06d1201b4718..9d7f61d36645 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -47,6 +47,7 @@
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -1388,7 +1389,7 @@ xfs_bmap_search_multi_extents(
* Else, *lastxp will be set to the index of the found
* entry; *gotp will contain the entry.
*/
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmbt_rec_host_t * /* pointer to found extent entry */
xfs_bmap_search_extents(
xfs_inode_t *ip, /* incore inode pointer */
xfs_fileoff_t bno, /* block number searched for */
@@ -3502,7 +3503,8 @@ xfs_bmap_longest_free_extent(
}
longest = xfs_alloc_longest_free_extent(mp, pag,
- xfs_alloc_min_freelist(mp, pag));
+ xfs_alloc_min_freelist(mp, pag),
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (*blen < longest)
*blen = longest;
@@ -3785,7 +3787,7 @@ xfs_bmap_btalloc(
}
args.minleft = ap->minleft;
args.wasdel = ap->wasdel;
- args.isfl = 0;
+ args.resv = XFS_AG_RESV_NONE;
args.datatype = ap->datatype;
if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
args.ip = ap->ip;
@@ -4079,7 +4081,7 @@ xfs_bmapi_read(
return 0;
}
-STATIC int
+int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
xfs_fileoff_t aoff,
@@ -4175,91 +4177,6 @@ out_unreserve_quota:
return error;
}
-/*
- * Map file blocks to filesystem blocks, adding delayed allocations as needed.
- */
-int
-xfs_bmapi_delay(
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting file offs. mapped */
- xfs_filblks_t len, /* length to map in file */
- struct xfs_bmbt_irec *mval, /* output: map values */
- int *nmap, /* i/o: mval size/count */
- int flags) /* XFS_BMAPI_... */
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- struct xfs_bmbt_irec got; /* current file extent record */
- struct xfs_bmbt_irec prev; /* previous file extent record */
- xfs_fileoff_t obno; /* old block number (offset) */
- xfs_fileoff_t end; /* end of mapped file region */
- xfs_extnum_t lastx; /* last useful extent number */
- int eof; /* we've hit the end of extents */
- int n = 0; /* current extent index */
- int error = 0;
-
- ASSERT(*nmap >= 1);
- ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- XFS_STATS_INC(mp, xs_blk_mapw);
-
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- return error;
- }
-
- xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
- end = bno + len;
- obno = bno;
-
- while (bno < end && n < *nmap) {
- if (eof || got.br_startoff > bno) {
- error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
- &prev, &lastx, eof);
- if (error) {
- if (n == 0) {
- *nmap = 0;
- return error;
- }
- break;
- }
- }
-
- /* set up the extent map to return. */
- xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
- xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
- /* If we're done, stop now. */
- if (bno >= end || n >= *nmap)
- break;
-
- /* Else go on to the next record. */
- prev = got;
- if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
- else
- eof = 1;
- }
-
- *nmap = n;
- return 0;
-}
-
-
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 05576b7263f6..8395f6e8cf7d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -181,9 +181,6 @@ int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
int *nmap, int flags);
-int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
- xfs_filblks_t len, struct xfs_bmbt_irec *mval,
- int *nmap, int flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_fsblock_t *firstblock, xfs_extlen_t total,
@@ -202,5 +199,12 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_defer_ops *dfops, enum shift_direction direction,
int num_exts);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+struct xfs_bmbt_rec_host *
+ xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
+ int fork, int *eofp, xfs_extnum_t *lastxp,
+ struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
+int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff,
+ xfs_filblks_t len, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 08569792fe20..aa1752f918b8 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2070,7 +2070,7 @@ __xfs_btree_updkeys(
struct xfs_buf *bp0,
bool force_all)
{
- union xfs_btree_bigkey key; /* keys from current level */
+ union xfs_btree_key key; /* keys from current level */
union xfs_btree_key *lkey; /* keys from the next level up */
union xfs_btree_key *hkey;
union xfs_btree_key *nlkey; /* keys from the next level up */
@@ -2086,7 +2086,7 @@ __xfs_btree_updkeys(
trace_xfs_btree_updkeys(cur, level, bp0);
- lkey = (union xfs_btree_key *)&key;
+ lkey = &key;
hkey = xfs_btree_high_key_from_key(cur, lkey);
xfs_btree_get_keys(cur, block, lkey);
for (level++; level < cur->bc_nlevels; level++) {
@@ -3226,7 +3226,7 @@ xfs_btree_insrec(
struct xfs_buf *bp; /* buffer for block */
union xfs_btree_ptr nptr; /* new block ptr */
struct xfs_btree_cur *ncur; /* new btree cursor */
- union xfs_btree_bigkey nkey; /* new block key */
+ union xfs_btree_key nkey; /* new block key */
union xfs_btree_key *lkey;
int optr; /* old key/record index */
int ptr; /* key/record index */
@@ -3241,7 +3241,7 @@ xfs_btree_insrec(
XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
ncur = NULL;
- lkey = (union xfs_btree_key *)&nkey;
+ lkey = &nkey;
/*
* If we have an external root pointer, and we've made it to the
@@ -3444,14 +3444,14 @@ xfs_btree_insert(
union xfs_btree_ptr nptr; /* new block number (split result) */
struct xfs_btree_cur *ncur; /* new cursor (split result) */
struct xfs_btree_cur *pcur; /* previous level's cursor */
- union xfs_btree_bigkey bkey; /* key of block to insert */
+ union xfs_btree_key bkey; /* key of block to insert */
union xfs_btree_key *key;
union xfs_btree_rec rec; /* record to insert */
level = 0;
ncur = NULL;
pcur = cur;
- key = (union xfs_btree_key *)&bkey;
+ key = &bkey;
xfs_btree_set_ptr_null(cur, &nptr);
@@ -4797,3 +4797,50 @@ xfs_btree_query_range(
return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
fn, priv);
}
+
+/*
+ * Calculate the number of blocks needed to store a given number of records
+ * in a short-format (per-AG metadata) btree.
+ */
+xfs_extlen_t
+xfs_btree_calc_size(
+ struct xfs_mount *mp,
+ uint *limits,
+ unsigned long long len)
+{
+ int level;
+ int maxrecs;
+ xfs_extlen_t rval;
+
+ maxrecs = limits[0];
+ for (level = 0, rval = 0; len > 1; level++) {
+ len += maxrecs - 1;
+ do_div(len, maxrecs);
+ maxrecs = limits[1];
+ rval += len;
+ }
+ return rval;
+}
+
+int
+xfs_btree_count_blocks_helper(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *data)
+{
+ xfs_extlen_t *blocks = data;
+ (*blocks)++;
+
+ return 0;
+}
+
+/* Count the blocks in a btree and return the result in *blocks. */
+int
+xfs_btree_count_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_extlen_t *blocks)
+{
+ *blocks = 0;
+ return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
+ blocks);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 04d0865e5e6d..3f8556a5c2ad 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -37,30 +37,18 @@ union xfs_btree_ptr {
__be64 l; /* long form ptr */
};
-union xfs_btree_key {
- struct xfs_bmbt_key bmbt;
- xfs_bmdr_key_t bmbr; /* bmbt root block */
- xfs_alloc_key_t alloc;
- struct xfs_inobt_key inobt;
- struct xfs_rmap_key rmap;
-};
-
/*
- * In-core key that holds both low and high keys for overlapped btrees.
- * The two keys are packed next to each other on disk, so do the same
- * in memory. Preserve the existing xfs_btree_key as a single key to
- * avoid the mental model breakage that would happen if we passed a
- * bigkey into a function that operates on a single key.
+ * The in-core btree key. Overlapping btrees actually store two keys
+ * per pointer, so we reserve enough memory to hold both. The __*bigkey
+ * items should never be accessed directly.
*/
-union xfs_btree_bigkey {
+union xfs_btree_key {
struct xfs_bmbt_key bmbt;
xfs_bmdr_key_t bmbr; /* bmbt root block */
xfs_alloc_key_t alloc;
struct xfs_inobt_key inobt;
- struct {
- struct xfs_rmap_key rmap;
- struct xfs_rmap_key rmap_hi;
- };
+ struct xfs_rmap_key rmap;
+ struct xfs_rmap_key __rmap_bigkey[2];
};
union xfs_btree_rec {
@@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
unsigned long len);
+xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
+ unsigned long long len);
/* return codes */
#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
@@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
xfs_btree_visit_blocks_fn fn, void *data);
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+
#endif /* __XFS_BTREE_H__ */
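
The reworked union above folds the old xfs_btree_bigkey back into xfs_btree_key: overlapped btrees read the low key through .rmap and the packed low/high pair through __rmap_bigkey[], which share the same leading bytes. The toy below models that layout with an invented key struct, purely to make the aliasing and sizing explicit; it is not the on-disk format.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct demo_rmap_key {			/* layout invented for the demo */
	uint64_t rm_startblock;
	uint64_t rm_owner;
	uint64_t rm_offset;
};

union demo_btree_key {
	struct demo_rmap_key rmap;		/* normal (low) key */
	struct demo_rmap_key __rmap_bigkey[2];	/* low + high key for overlapped btrees */
};

int main(void)
{
	union demo_btree_key key;

	/* the union reserves room for both keys ... */
	assert(sizeof(key) == 2 * sizeof(struct demo_rmap_key));
	/* ... the low key aliases slot 0, and the high key follows it directly */
	assert((void *)&key.rmap == (void *)&key.__rmap_bigkey[0]);
	assert((char *)&key.__rmap_bigkey[1] - (char *)&key ==
	       sizeof(struct demo_rmap_key));

	printf("key is %zu bytes, half of it for the high key\n", sizeof(key));
	return 0;
}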
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index c221d0ecd52e..613c5cf19436 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -81,6 +81,10 @@
* - For each work item attached to the log intent item,
* * Perform the described action.
* * Attach the work item to the log done item.
+ * * If the result of doing the work was -EAGAIN, ->finish work
+ * wants a new transaction. See the "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" section below for
+ * details.
*
* The key here is that we must log an intent item for all pending
* work items every time we roll the transaction, and that we must log
@@ -88,6 +92,34 @@
* we can perform complex remapping operations, chaining intent items
* as needed.
*
+ * Requesting a Fresh Transaction while Finishing Deferred Work
+ *
+ * If ->finish_item decides that it needs a fresh transaction to
+ * finish the work, it must ask its caller (xfs_defer_finish) for a
+ * continuation. The most likely cause of this circumstance is the
+ * refcount adjust functions deciding that they've logged enough items
+ * to be at risk of exceeding the transaction reservation.
+ *
+ * To get a fresh transaction, we want to log the existing log done
+ * item to prevent the log intent item from replaying, immediately log
+ * a new log intent item with the unfinished work items, roll the
+ * transaction, and re-call ->finish_item wherever it left off. The
+ * log done item and the new log intent item must be in the same
+ * transaction or atomicity cannot be guaranteed; defer_finish ensures
+ * that this happens.
+ *
+ * This requires some coordination between ->finish_item and
+ * defer_finish. Upon deciding to request a new transaction,
+ * ->finish_item should update the current work item to reflect the
+ * unfinished work. Next, it should reset the log done item's list
+ * count to the number of items finished, and return -EAGAIN.
+ * defer_finish sees the -EAGAIN, logs the new log intent item
+ * with the remaining work items, and leaves the xfs_defer_pending
+ * item at the head of the dop_work queue. Then it rolls the
+ * transaction and picks up processing where it left off. It is
+ * required that ->finish_item must be careful to leave enough
+ * transaction reservation to fit the new log intent item.
+ *
* This is an example of remapping the extent (E, E+B) into file X at
* offset A and dealing with the extent (C, C+B) already being mapped
* there:
@@ -104,21 +136,26 @@
* | Intent to add rmap (X, E, A, B) |
* +-------------------------------------------------+
* | Reduce refcount for extent (C, B) | t2
- * | Done reducing refcount for extent (C, B) |
+ * | Done reducing refcount for extent (C, 9) |
+ * | Intent to reduce refcount for extent (C+9, B-9) |
+ * | (ran out of space after 9 refcount updates) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C+9, B-9) | t3
+ * | Done reducing refcount for extent (C+9, B-9) |
* | Increase refcount for extent (E, B) |
* | Done increasing refcount for extent (E, B) |
* | Intent to free extent (C, B) |
* | Intent to free extent (F, 1) (refcountbt block) |
* | Intent to remove rmap (F, 1, REFC) |
* +-------------------------------------------------+
- * | Remove rmap (X, C, A, B) | t3
+ * | Remove rmap (X, C, A, B) | t4
* | Done removing rmap (X, C, A, B) |
* | Add rmap (X, E, A, B) |
* | Done adding rmap (X, E, A, B) |
* | Remove rmap (F, 1, REFC) |
* | Done removing rmap (F, 1, REFC) |
* +-------------------------------------------------+
- * | Free extent (C, B) | t4
+ * | Free extent (C, B) | t5
* | Done freeing extent (C, B) |
* | Free extent (D, 1) |
* | Done freeing extent (D, 1) |
@@ -141,6 +178,9 @@
* - Intent to free extent (C, B)
* - Intent to free extent (F, 1) (refcountbt block)
* - Intent to remove rmap (F, 1, REFC)
+ *
+ * Note that the continuation requested between t2 and t3 is likely to
+ * reoccur.
*/
static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
@@ -323,7 +363,16 @@ xfs_defer_finish(
dfp->dfp_count--;
error = dfp->dfp_type->finish_item(*tp, dop, li,
dfp->dfp_done, &state);
- if (error) {
+ if (error == -EAGAIN) {
+ /*
+ * Caller wants a fresh transaction;
+ * put the work item back on the list
+ * and jump out.
+ */
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ break;
+ } else if (error) {
/*
* Clean up after ourselves and jump out.
* xfs_defer_cancel will take care of freeing
@@ -335,9 +384,25 @@ xfs_defer_finish(
goto out;
}
}
- /* Done with the dfp, free it. */
- list_del(&dfp->dfp_list);
- kmem_free(dfp);
+ if (error == -EAGAIN) {
+ /*
+ * Caller wants a fresh transaction, so log a
+ * new log intent item to replace the old one
+ * and roll the transaction. See "Requesting
+ * a Fresh Transaction while Finishing
+ * Deferred Work" above.
+ */
+ dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+ dfp->dfp_count);
+ dfp->dfp_done = NULL;
+ list_for_each(li, &dfp->dfp_work)
+ dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
+ li);
+ } else {
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_free(dfp);
+ }
if (cleanup_fn)
cleanup_fn(*tp, state, error);
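
The continuation protocol documented in the comment above is easiest to see as a skeleton ->finish_item. The sketch below uses invented types and a made-up per-call work budget; only the shape of the contract (trim the work item to the unfinished portion, let the done item claim only what was completed, return -EAGAIN so the caller relogs and rolls) follows this patch, and none of it is the real refcount code.

#include <errno.h>
#include <stdio.h>

struct demo_item {
	unsigned long long	next;	/* first unit of work still pending */
	unsigned long long	left;	/* units of work still pending */
};

struct demo_done {
	unsigned int		nr_claimed;	/* portion this done item covers */
};

static int demo_finish_item(struct demo_item *item, struct demo_done *done,
			    unsigned long long budget)
{
	unsigned long long n = item->left < budget ? item->left : budget;

	/* ... do n units of real work here, logging each one ... */
	item->next += n;
	item->left -= n;
	if (!item->left)
		return 0;		/* finished; caller frees the item */

	/*
	 * Out of reservation: the item now describes only the unfinished
	 * work, the done item claims only what was completed, and the
	 * caller is asked to relog the remainder and roll the transaction.
	 */
	done->nr_claimed = 1;
	return -EAGAIN;
}

int main(void)
{
	struct demo_item item = { .next = 0, .left = 25 };
	struct demo_done done = { 0 };
	int ret;

	do {
		ret = demo_finish_item(&item, &done, 9);	/* "9 updates per transaction" */
		printf("finished up to %llu, ret %d\n", item.next, ret);
		/* a real caller would relog the intent and roll the transaction here */
	} while (ret == -EAGAIN);
	return 0;
}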
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 31ca2208c03d..eab68ae2e011 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -132,7 +132,7 @@ xfs_inobt_free_block(
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
- &oinfo);
+ &oinfo, XFS_AG_RESV_NONE);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a6eed43fa7cd..fc5eef85d61e 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -647,9 +647,17 @@ struct xfs_rui_log_format {
__uint16_t rui_size; /* size of this item */
__uint32_t rui_nextents; /* # extents to free */
__uint64_t rui_id; /* rui identifier */
- struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */
+ struct xfs_map_extent rui_extents[]; /* array of extents to rmap */
};
+static inline size_t
+xfs_rui_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_rui_log_format) +
+ nr * sizeof(struct xfs_map_extent);
+}
+
/*
* This is the structure used to lay out an rud log item in the
* log. The rud_extents array is a variable size array whose
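
Switching rui_extents to a C99 flexible array member lets the new xfs_rui_log_format_sizeof() express the item size as header plus nr extents, with no "- 1" correction against a dummy first element. A minimal stand-alone sketch of the same sizing pattern, using invented stand-in structs rather than the real log format:

#include <stdio.h>
#include <stdlib.h>

struct demo_extent { unsigned long long start, len; };

struct demo_log_format {
	unsigned short type;
	unsigned short size;
	unsigned int   nextents;
	unsigned long long id;
	struct demo_extent extents[];	/* flexible array, contributes 0 to sizeof */
};

static size_t demo_log_format_sizeof(unsigned int nr)
{
	return sizeof(struct demo_log_format) + nr * sizeof(struct demo_extent);
}

int main(void)
{
	struct demo_log_format *f = malloc(demo_log_format_sizeof(3));

	if (!f)
		return 1;
	f->nextents = 3;		/* room for extents[0..2] right after the header */
	printf("%zu bytes\n", demo_log_format_sizeof(f->nextents));
	free(f);
	return 0;
}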
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7575cfc3ad15..4a28fa91e3b1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc(
* Update on-disk file size now that data has been written to disk.
*/
STATIC int
-xfs_setfilesize(
+__xfs_setfilesize(
struct xfs_inode *ip,
struct xfs_trans *tp,
xfs_off_t offset,
@@ -225,6 +225,23 @@ xfs_setfilesize(
return xfs_trans_commit(tp);
}
+int
+xfs_setfilesize(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ size_t size)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ return __xfs_setfilesize(ip, tp, offset, size);
+}
+
STATIC int
xfs_setfilesize_ioend(
struct xfs_ioend *ioend,
@@ -247,7 +264,7 @@ xfs_setfilesize_ioend(
return error;
}
- return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+ return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
/*
@@ -1336,13 +1353,12 @@ xfs_end_io_direct_write(
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
uintptr_t flags = (uintptr_t)private;
int error = 0;
trace_xfs_end_io_direct_write(ip, offset, size);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
if (size <= 0)
@@ -1380,14 +1396,9 @@ xfs_end_io_direct_write(
error = xfs_iomap_write_unwritten(ip, offset, size);
} else if (flags & XFS_DIO_FLAG_APPEND) {
- struct xfs_trans *tp;
-
trace_xfs_end_io_direct_write_append(ip, offset, size);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
- &tp);
- if (!error)
- error = xfs_setfilesize(ip, tp, offset, size);
+ error = xfs_setfilesize(ip, offset, size);
}
return error;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index bf2d9a141a73..1950e3bca2ac 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,6 +62,7 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private);
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index e455f9098d49..2975cb2319f4 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -865,7 +865,7 @@ xfs_buf_item_log_segment(
*/
if (bit) {
end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
- mask = ((1 << (end_bit - bit)) - 1) << bit;
+ mask = ((1U << (end_bit - bit)) - 1) << bit;
*wordp |= mask;
wordp++;
bits_set = end_bit - bit;
@@ -888,7 +888,7 @@ xfs_buf_item_log_segment(
*/
end_bit = bits_to_set - bits_set;
if (end_bit) {
- mask = (1 << end_bit) - 1;
+ mask = (1U << end_bit) - 1;
*wordp |= mask;
}
}
@@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error(
bp->b_last_error != bp->b_error) {
bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
bp->b_last_error = bp->b_error;
- if (cfg->retry_timeout && !bp->b_first_retry_time)
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ !bp->b_first_retry_time)
bp->b_first_retry_time = jiffies;
xfs_buf_ioerror(bp, 0);
@@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error(
if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
++bp->b_retries > cfg->max_retries)
goto permanent_error;
- if (cfg->retry_timeout &&
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
goto permanent_error;
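
The 1U changes in the xfs_buf_item_log_segment() hunks matter when the mask has to reach bit 31: shifting a plain signed 1 that far overflows int, which is undefined behaviour, while the unsigned literal is well defined. A tiny illustration of the same expression (ordinary user-space C, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int bit = 1, end_bit = 32;	/* set bits 1..31 of a 32-bit word */

	/* (1U << 31) is fine; (1 << 31) overflows a signed int, which is UB. */
	unsigned int mask = ((1U << (end_bit - bit)) - 1) << bit;

	printf("mask = 0x%08x\n", mask);	/* 0xfffffffe */
	return 0;
}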
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e612a0233710..c68517b0f248 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -269,6 +269,8 @@ xfs_file_dio_aio_read(
return -EINVAL;
}
+ file_accessed(iocb->ki_filp);
+
/*
* Locking is a bit tricky here. If we take an exclusive lock for direct
* IO, we effectively serialise all new concurrent read IO to this file
@@ -323,7 +325,6 @@ xfs_file_dio_aio_read(
}
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
- file_accessed(iocb->ki_filp);
return ret;
}
@@ -332,10 +333,7 @@ xfs_file_dax_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct iov_iter data = *to;
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
size_t count = iov_iter_count(to);
ssize_t ret = 0;
@@ -345,11 +343,7 @@ xfs_file_dax_read(
return 0; /* skip atime */
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
- if (ret > 0) {
- iocb->ki_pos += ret;
- iov_iter_advance(to, ret);
- }
+ ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
file_accessed(iocb->ki_filp);
@@ -711,70 +705,32 @@ xfs_file_dax_write(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
- struct iov_iter data;
+ int iolock = XFS_IOLOCK_EXCL;
+ ssize_t ret, error = 0;
+ size_t count;
+ loff_t pos;
- /* "unaligned" here means not aligned to a filesystem block */
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
- unaligned_io = 1;
- iolock = XFS_IOLOCK_EXCL;
- } else if (mapping->nrpages) {
- iolock = XFS_IOLOCK_EXCL;
- } else {
- iolock = XFS_IOLOCK_SHARED;
- }
xfs_rw_ilock(ip, iolock);
-
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
- /*
- * Yes, even DAX files can have page cache attached to them: A zeroed
- * page is inserted into the pagecache when we have to serve a write
- * fault on a hole. It should never be dirtied and can simply be
- * dropped from the pagecache once we get real data for the page.
- *
- * XXX: This is racy against mmap, and there's nothing we can do about
- * it. dax_do_io() should really do this invalidation internally as
- * it will know if we've allocated over a holei for this specific IO and
- * if so it needs to update the mapping tree and invalidate existing
- * PTEs over the newly allocated range. Remove this invalidation when
- * dax_do_io() is fixed up.
- */
- if (mapping->nrpages) {
- loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+ pos = iocb->ki_pos;
+ count = iov_iter_count(from);
- ret = invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- }
+ trace_xfs_file_dax_write(ip, count, pos);
- if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
+ ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+ if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+ i_size_write(inode, iocb->ki_pos);
+ error = xfs_setfilesize(ip, pos, ret);
}
- trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
-
- data = *from;
- ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
- xfs_end_io_direct_write, 0);
- if (ret > 0) {
- iocb->ki_pos += ret;
- iov_iter_advance(from, ret);
- }
out:
xfs_rw_iunlock(ip, iolock);
- return ret;
+ return error ? error : ret;
}
STATIC ssize_t
@@ -1513,7 +1469,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+ ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
} else {
ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
@@ -1547,7 +1503,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+ ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a75f7ab0581c..043ca3808ea2 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -30,6 +30,7 @@
#include "xfs_mru_cache.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
+#include "xfs_ag_resv.h"
struct xfs_fstrm_item {
struct xfs_mru_cache_elem mru;
@@ -198,7 +199,8 @@ xfs_filestream_pick_ag(
}
longest = xfs_alloc_longest_free_extent(mp, pag,
- xfs_alloc_min_freelist(mp, pag));
+ xfs_alloc_min_freelist(mp, pag),
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (((minlen && longest >= minlen) ||
(!minlen && pag->pagf_freeblks >= minfree)) &&
(!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0b7f986745c1..94ac06f3d908 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -553,7 +553,7 @@ xfs_growfs_data_private(
error = xfs_free_extent(tp,
XFS_AGB_TO_FSB(mp, agno,
be32_to_cpu(agf->agf_length) - new),
- new, &oinfo);
+ new, &oinfo, XFS_AG_RESV_NONE);
if (error)
goto error0;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index fb39a66914dd..65b2e3f85f52 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1414,6 +1414,16 @@ xfs_inode_set_eofblocks_tag(
struct xfs_perag *pag;
int tagged;
+ /*
+ * Don't bother locking the AG and looking up in the radix trees
+ * if we already know that we have the tag set.
+ */
+ if (ip->i_flags & XFS_IEOFBLOCKS)
+ return;
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags |= XFS_IEOFBLOCKS;
+ spin_unlock(&ip->i_flags_lock);
+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
trace_xfs_inode_set_eofblocks_tag(ip);
@@ -1449,6 +1459,10 @@ xfs_inode_clear_eofblocks_tag(
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~XFS_IEOFBLOCKS;
+ spin_unlock(&ip->i_flags_lock);
+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
trace_xfs_inode_clear_eofblocks_tag(ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e1a411e08f00..8f30d2533b48 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -216,6 +216,7 @@ xfs_get_initial_prid(struct xfs_inode *dp)
#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */
+#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */
/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2af0dda1c978..c08253e11545 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -42,17 +43,40 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
-STATIC int
-xfs_iomap_eof_align_last_fsb(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- xfs_extlen_t extsize,
- xfs_fileoff_t *last_fsb)
+void
+xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (imap->br_startblock == HOLESTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_DELALLOC;
+ } else {
+ iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ if (imap->br_state == XFS_EXT_UNWRITTEN)
+ iomap->type = IOMAP_UNWRITTEN;
+ else
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+ iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+static xfs_extlen_t
+xfs_eof_alignment(
+ struct xfs_inode *ip,
+ xfs_extlen_t extsize)
{
- xfs_extlen_t align = 0;
- int eof, error;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t align = 0;
if (!XFS_IS_REALTIME_INODE(ip)) {
/*
@@ -83,8 +107,21 @@ xfs_iomap_eof_align_last_fsb(
align = extsize;
}
+ return align;
+}
+
+STATIC int
+xfs_iomap_eof_align_last_fsb(
+ struct xfs_inode *ip,
+ xfs_extlen_t extsize,
+ xfs_fileoff_t *last_fsb)
+{
+ xfs_extlen_t align = xfs_eof_alignment(ip, extsize);
+
if (align) {
xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
+ int eof, error;
+
error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
if (error)
return error;
@@ -154,7 +191,7 @@ xfs_iomap_write_direct(
*/
ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
XFS_IFEXTENTS);
- error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+ error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);
if (error)
goto out_unlock;
} else {
@@ -274,130 +311,6 @@ out_trans_cancel:
goto out_unlock;
}
-/*
- * If the caller is doing a write at the end of the file, then extend the
- * allocation out to the file system's write iosize. We clean up any extra
- * space left over when the file is closed in xfs_inactive().
- *
- * If we find we already have delalloc preallocation beyond EOF, don't do more
- * preallocation as it it not needed.
- */
-STATIC int
-xfs_iomap_eof_want_preallocate(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- xfs_off_t offset,
- size_t count,
- xfs_bmbt_irec_t *imap,
- int nimaps,
- int *prealloc)
-{
- xfs_fileoff_t start_fsb;
- xfs_filblks_t count_fsb;
- int n, error, imaps;
- int found_delalloc = 0;
-
- *prealloc = 0;
- if (offset + count <= XFS_ISIZE(ip))
- return 0;
-
- /*
- * If the file is smaller than the minimum prealloc and we are using
- * dynamic preallocation, don't do any preallocation at all as it is
- * likely this is the only write to the file that is going to be done.
- */
- if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
- XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
- return 0;
-
- /*
- * If there are any real blocks past eof, then don't
- * do any speculative allocation.
- */
- start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
- count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- while (count_fsb > 0) {
- imaps = nimaps;
- error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
- 0);
- if (error)
- return error;
- for (n = 0; n < imaps; n++) {
- if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
- (imap[n].br_startblock != DELAYSTARTBLOCK))
- return 0;
- start_fsb += imap[n].br_blockcount;
- count_fsb -= imap[n].br_blockcount;
-
- if (imap[n].br_startblock == DELAYSTARTBLOCK)
- found_delalloc = 1;
- }
- }
- if (!found_delalloc)
- *prealloc = 1;
- return 0;
-}
-
-/*
- * Determine the initial size of the preallocation. We are beyond the current
- * EOF here, but we need to take into account whether this is a sparse write or
- * an extending write when determining the preallocation size. Hence we need to
- * look up the extent that ends at the current write offset and use the result
- * to determine the preallocation size.
- *
- * If the extent is a hole, then preallocation is essentially disabled.
- * Otherwise we take the size of the preceeding data extent as the basis for the
- * preallocation size. If the size of the extent is greater than half the
- * maximum extent length, then use the current offset as the basis. This ensures
- * that for large files the preallocation size always extends to MAXEXTLEN
- * rather than falling short due to things like stripe unit/width alignment of
- * real extents.
- */
-STATIC xfs_fsblock_t
-xfs_iomap_eof_prealloc_initial_size(
- struct xfs_mount *mp,
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- int nimaps)
-{
- xfs_fileoff_t start_fsb;
- int imaps = 1;
- int error;
-
- ASSERT(nimaps >= imaps);
-
- /* if we are using a specific prealloc size, return now */
- if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
- return 0;
-
- /* If the file is small, then use the minimum prealloc */
- if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
- return 0;
-
- /*
- * As we write multiple pages, the offset will always align to the
- * start of a page and hence point to a hole at EOF. i.e. if the size is
- * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
- * will return FSB 1. Hence if there are blocks in the file, we want to
- * point to the block prior to the EOF block and not the hole that maps
- * directly at @offset.
- */
- start_fsb = XFS_B_TO_FSB(mp, offset);
- if (start_fsb)
- start_fsb--;
- error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
- if (error)
- return 0;
-
- ASSERT(imaps == 1);
- if (imap[0].br_startblock == HOLESTARTBLOCK)
- return 0;
- if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
- return imap[0].br_blockcount << 1;
- return XFS_B_TO_FSB(mp, offset);
-}
-
STATIC bool
xfs_quota_need_throttle(
struct xfs_inode *ip,
@@ -459,27 +372,76 @@ xfs_quota_calc_throttle(
}
/*
+ * If we are doing a write at the end of the file and there are no allocations
+ * past this one, then extend the allocation out to the file system's write
+ * iosize.
+ *
* If we don't have a user specified preallocation size, dynamically increase
- * the preallocation size as the size of the file grows. Cap the maximum size
+ * the preallocation size as the size of the file grows. Cap the maximum size
* at a single extent or less if the filesystem is near full. The closer the
* filesystem is to full, the smaller the maximum prealocation.
+ *
+ * As an exception we don't do any preallocation at all if the file is smaller
+ * than the minimum preallocation and we are using the default dynamic
+ * preallocation scheme, as it is likely this is the only write to the file that
+ * is going to be done.
+ *
+ * We clean up any extra space left over when the file is closed in
+ * xfs_inactive().
*/
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
- struct xfs_mount *mp,
struct xfs_inode *ip,
- xfs_off_t offset,
- struct xfs_bmbt_irec *imap,
- int nimaps)
+ loff_t offset,
+ loff_t count,
+ xfs_extnum_t idx,
+ struct xfs_bmbt_irec *prev)
{
- xfs_fsblock_t alloc_blocks = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
int shift = 0;
int64_t freesp;
xfs_fsblock_t qblocks;
int qshift = 0;
+ xfs_fsblock_t alloc_blocks = 0;
+
+ if (offset + count <= XFS_ISIZE(ip))
+ return 0;
- alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
- imap, nimaps);
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+ (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)))
+ return 0;
+
+ /*
+ * If an explicit allocsize is set, the file is small, or we
+ * are writing behind a hole, then use the minimum prealloc:
+ */
+ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
+ XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
+ idx == 0 ||
+ prev->br_startoff + prev->br_blockcount < offset_fsb)
+ return mp->m_writeio_blocks;
+
+ /*
+ * Determine the initial size of the preallocation. We are beyond the
+ * current EOF here, but we need to take into account whether this is
+ * a sparse write or an extending write when determining the
+ * preallocation size. Hence we need to look up the extent that ends
+ * at the current write offset and use the result to determine the
+ * preallocation size.
+ *
+ * If the extent is a hole, then preallocation is essentially disabled.
+ * Otherwise we take the size of the preceding data extent as the basis
+ * for the preallocation size. If the size of the extent is greater than
+ * half the maximum extent length, then use the current offset as the
+ * basis. This ensures that for large files the preallocation size
+ * always extends to MAXEXTLEN rather than falling short due to things
+ * like stripe unit/width alignment of real extents.
+ */
+ if (prev->br_blockcount <= (MAXEXTLEN >> 1))
+ alloc_blocks = prev->br_blockcount << 1;
+ else
+ alloc_blocks = XFS_B_TO_FSB(mp, offset);
if (!alloc_blocks)
goto check_writeio;
qblocks = alloc_blocks;
@@ -550,120 +512,145 @@ xfs_iomap_prealloc_size(
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
-
check_writeio:
if (alloc_blocks < mp->m_writeio_blocks)
alloc_blocks = mp->m_writeio_blocks;
-
trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
mp->m_writeio_blocks);
-
return alloc_blocks;
}
-int
-xfs_iomap_write_delay(
- xfs_inode_t *ip,
- xfs_off_t offset,
- size_t count,
- xfs_bmbt_irec_t *ret_imap)
+static int
+xfs_file_iomap_begin_delay(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb;
- xfs_fileoff_t last_fsb;
- xfs_off_t aligned_offset;
- xfs_fileoff_t ioalign;
- xfs_extlen_t extsz;
- int nimaps;
- xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
- int prealloc;
- int error;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- /*
- * Make sure that the dquots are there. This doesn't hold
- * the ilock across a disk read.
- */
- error = xfs_qm_dqattach_locked(ip, 0);
- if (error)
- return error;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t maxbytes_fsb =
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ xfs_fileoff_t end_fsb, orig_end_fsb;
+ int error = 0, eof = 0;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec prev;
+ xfs_extnum_t idx;
- extsz = xfs_get_extsz_hint(ip);
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ ASSERT(!XFS_IS_REALTIME_INODE(ip));
+ ASSERT(!xfs_get_extsz_hint(ip));
- error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
- imap, XFS_WRITE_IMAPS, &prealloc);
- if (error)
- return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
-retry:
- if (prealloc) {
- xfs_fsblock_t alloc_blocks;
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
- alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
- XFS_WRITE_IMAPS);
+ XFS_STATS_INC(mp, xs_blk_mapw);
- aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
- ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
- last_fsb = ioalign + alloc_blocks;
- } else {
- last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
}
- if (prealloc || extsz) {
- error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
- if (error)
- return error;
+ xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
+ &got, &prev);
+ if (!eof && got.br_startoff <= offset_fsb) {
+ trace_xfs_iomap_found(ip, offset, count, 0, &got);
+ goto done;
}
+ error = xfs_qm_dqattach_locked(ip, 0);
+ if (error)
+ goto out_unlock;
+
/*
- * Make sure preallocation does not create extents beyond the range we
- * actually support in this filesystem.
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
+ * to keep the chunks of work done where somewhat symmetric with the
+ * work writeback does. This is a completely arbitrary number pulled
+ * out of thin air as a best guess for initial testing.
+ *
+ * Note that the values need to be less than 32-bits wide until
+ * the lower level functions are updated.
*/
- if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
- last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = orig_end_fsb =
+ min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+ if (eof) {
+ xfs_fsblock_t prealloc_blocks;
- ASSERT(last_fsb > offset_fsb);
+ prealloc_blocks =
+ xfs_iomap_prealloc_size(ip, offset, count, idx, &prev);
+ if (prealloc_blocks) {
+ xfs_extlen_t align;
+ xfs_off_t end_offset;
- nimaps = XFS_WRITE_IMAPS;
- error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
- imap, &nimaps, XFS_BMAPI_ENTIRE);
+ end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
+ end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+ prealloc_blocks;
+
+ align = xfs_eof_alignment(ip, 0);
+ if (align)
+ end_fsb = roundup_64(end_fsb, align);
+
+ end_fsb = min(end_fsb, maxbytes_fsb);
+ ASSERT(end_fsb > offset_fsb);
+ }
+ }
+
+retry:
+ error = xfs_bmapi_reserve_delalloc(ip, offset_fsb,
+ end_fsb - offset_fsb, &got,
+ &prev, &idx, eof);
switch (error) {
case 0:
+ break;
case -ENOSPC:
case -EDQUOT:
- break;
- default:
- return error;
- }
-
- /*
- * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
- * without EOF preallocation.
- */
- if (nimaps == 0) {
+ /* retry without any preallocation */
trace_xfs_delalloc_enospc(ip, offset, count);
- if (prealloc) {
- prealloc = 0;
- error = 0;
+ if (end_fsb != orig_end_fsb) {
+ end_fsb = orig_end_fsb;
goto retry;
}
- return error ? error : -ENOSPC;
+ /*FALLTHRU*/
+ default:
+ goto out_unlock;
}
- if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_alert_fsblock_zero(ip, &imap[0]);
-
/*
* Tag the inode as speculatively preallocated so we can reclaim this
* space on demand, if necessary.
*/
- if (prealloc)
+ if (end_fsb != orig_end_fsb)
xfs_inode_set_eofblocks_tag(ip);
- *ret_imap = imap[0];
- return 0;
+ trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+done:
+ if (isnullstartblock(got.br_startblock))
+ got.br_startblock = DELAYSTARTBLOCK;
+
+ if (!got.br_startblock) {
+ error = xfs_alert_fsblock_zero(ip, &got);
+ if (error)
+ goto out_unlock;
+ }
+
+ xfs_bmbt_to_iomap(ip, iomap, &got);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
}
/*
@@ -947,37 +934,13 @@ error_on_bmapi_transaction:
return error;
}
-void
-xfs_bmbt_to_iomap(
- struct xfs_inode *ip,
- struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
-{
- struct xfs_mount *mp = ip->i_mount;
-
- if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_DELALLOC;
- } else {
- iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
- if (imap->br_state == XFS_EXT_UNWRITTEN)
- iomap->type = IOMAP_UNWRITTEN;
- else
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
- iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
-}
-
-static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool imap_needs_alloc(struct inode *inode,
+ struct xfs_bmbt_irec *imap, int nimaps)
{
return !nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
- imap->br_startblock == DELAYSTARTBLOCK;
+ imap->br_startblock == DELAYSTARTBLOCK ||
+ (IS_DAX(inode) && ISUNWRITTEN(imap));
}
static int
@@ -993,11 +956,18 @@ xfs_file_iomap_begin(
struct xfs_bmbt_irec imap;
xfs_fileoff_t offset_fsb, end_fsb;
int nimaps = 1, error = 0;
+ unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if ((flags & IOMAP_WRITE) &&
+ !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
+ return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+ iomap);
+ }
+
+ lockmode = xfs_ilock_data_map_shared(ip);
ASSERT(offset <= mp->m_super->s_maxbytes);
if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -1008,11 +978,11 @@ xfs_file_iomap_begin(
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, XFS_BMAPI_ENTIRE);
if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
}
- if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+ if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
/*
* We cap the maximum length we map here to MAX_WRITEBACK_PAGES
* pages to keep the chunks of work done where somewhat symmetric
@@ -1024,27 +994,23 @@ xfs_file_iomap_begin(
* the lower level functions are updated.
*/
length = min_t(loff_t, length, 1024 * PAGE_SIZE);
- if (xfs_get_extsz_hint(ip)) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
- error = xfs_iomap_write_direct(ip, offset, length, &imap,
- nimaps);
- } else {
- error = xfs_iomap_write_delay(ip, offset, length, &imap);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
+ */
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+ error = xfs_iomap_write_direct(ip, offset, length, &imap,
+ nimaps);
if (error)
return error;
+ iomap->flags = IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
} else {
ASSERT(nimaps);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, 0, &imap);
}
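
The initial-size logic now folded into xfs_iomap_prealloc_size() picks its starting point from the extent that ends at the write offset: double it while it is still small, but once it exceeds half of MAXEXTLEN switch to the file offset so large files head straight for the maximum extent length. A toy rerun of just that decision with illustrative numbers; the free-space and quota throttling that follows in the real function is omitted, and the 21-bit limit below mirrors MAXEXTLEN.

#include <stdio.h>

#define DEMO_MAXEXTLEN	((1U << 21) - 1)	/* 2^21 - 1 blocks */

static unsigned long long demo_initial_prealloc(unsigned long long prev_blockcount,
						unsigned long long offset_fsb)
{
	if (prev_blockcount <= (DEMO_MAXEXTLEN >> 1))
		return prev_blockcount << 1;	/* keep doubling with the file */
	return offset_fsb;			/* large file: push toward MAXEXTLEN */
}

int main(void)
{
	/* small file: 64-block extent behind EOF -> 128-block prealloc */
	printf("%llu\n", demo_initial_prealloc(64, 1024));
	/* big file: extent already > MAXEXTLEN/2 -> use the offset itself */
	printf("%llu\n", demo_initial_prealloc(1200000, 5000000));
	return 0;
}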
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index fb8aca3d69ab..6498be485932 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -25,8 +25,6 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
- struct xfs_bmbt_irec *);
int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
struct xfs_bmbt_irec *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b36676cde103..041d9493e798 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -57,10 +57,16 @@ enum {
#define XFS_ERR_RETRY_FOREVER -1
+/*
+ * Although retry_timeout is in jiffies which is normally an unsigned long,
+ * we limit the retry timeout to 86400 seconds, or one day. So even a
+ * signed 32-bit long is sufficient for a HZ value up to 24855. Making it
+ * signed lets us store the special "-1" value, meaning retry forever.
+ */
struct xfs_error_cfg {
struct xfs_kobj kobj;
int max_retries;
- unsigned long retry_timeout; /* in jiffies, 0 = no timeout */
+ long retry_timeout; /* in jiffies, -1 = infinite */
};
typedef struct xfs_mount {
@@ -325,6 +331,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp)
}
#endif
+/* per-AG block reservation data structures */
+enum xfs_ag_resv_type {
+ XFS_AG_RESV_NONE = 0,
+ XFS_AG_RESV_METADATA,
+ XFS_AG_RESV_AGFL,
+};
+
+struct xfs_ag_resv {
+ /* number of blocks originally reserved here */
+ xfs_extlen_t ar_orig_reserved;
+ /* number of blocks reserved here */
+ xfs_extlen_t ar_reserved;
+ /* number of blocks originally asked for */
+ xfs_extlen_t ar_asked;
+};
+
/*
* Per-ag incore structure, copies of information in agf and agi, to improve the
* performance of allocation group selection.
@@ -372,8 +394,28 @@ typedef struct xfs_perag {
/* for rcu-safe freeing */
struct rcu_head rcu_head;
int pagb_count; /* pagb slots in use */
+
+ /* Blocks reserved for all kinds of metadata. */
+ struct xfs_ag_resv pag_meta_resv;
+ /* Blocks reserved for just AGFL-based metadata. */
+ struct xfs_ag_resv pag_agfl_resv;
} xfs_perag_t;
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ return &pag->pag_meta_resv;
+ case XFS_AG_RESV_AGFL:
+ return &pag->pag_agfl_resv;
+ default:
+ return NULL;
+ }
+}
+
extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
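
The comment above retry_timeout claims that one day of jiffies still fits a signed 32-bit long for any HZ up to 24855; the arithmetic is easy to check. The snippet below is only that sanity check, not kernel code:

#include <stdio.h>

int main(void)
{
	long long max_hz = 24855, one_day_secs = 86400;
	long long worst_case = max_hz * one_day_secs;	/* 2,147,472,000 jiffies */

	printf("%lld <= %ld ? %s\n", worst_case, 0x7fffffffL,
	       worst_case <= 0x7fffffffL ? "fits in a signed 32-bit long"
					 : "overflows");
	return 0;
}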
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 2500f28689d5..0432a459871c 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -51,28 +51,16 @@ xfs_rui_item_free(
kmem_zone_free(xfs_rui_zone, ruip);
}
-/*
- * This returns the number of iovecs needed to log the given rui item.
- * We only need 1 iovec for an rui item. It just logs the rui_log_format
- * structure.
- */
-static inline int
-xfs_rui_item_sizeof(
- struct xfs_rui_log_item *ruip)
-{
- return sizeof(struct xfs_rui_log_format) +
- (ruip->rui_format.rui_nextents - 1) *
- sizeof(struct xfs_map_extent);
-}
-
STATIC void
xfs_rui_item_size(
struct xfs_log_item *lip,
int *nvecs,
int *nbytes)
{
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+
*nvecs += 1;
- *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip));
+ *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
}
/*
@@ -97,7 +85,7 @@ xfs_rui_item_format(
ruip->rui_format.rui_size = 1;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
- xfs_rui_item_sizeof(ruip));
+ xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents));
}
/*
@@ -205,16 +193,12 @@ xfs_rui_init(
{
struct xfs_rui_log_item *ruip;
- uint size;
ASSERT(nextents > 0);
- if (nextents > XFS_RUI_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(struct xfs_rui_log_item) +
- ((nextents - 1) * sizeof(struct xfs_map_extent)));
- ruip = kmem_zalloc(size, KM_SLEEP);
- } else {
+ if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
+ ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+ else
ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
- }
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
ruip->rui_format.rui_nextents = nextents;
@@ -239,14 +223,12 @@ xfs_rui_copy_format(
uint len;
src_rui_fmt = buf->i_addr;
- len = sizeof(struct xfs_rui_log_format) +
- (src_rui_fmt->rui_nextents - 1) *
- sizeof(struct xfs_map_extent);
+ len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
if (buf->i_len != len)
return -EFSCORRUPTED;
- memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len);
+ memcpy(dst_rui_fmt, src_rui_fmt, len);
return 0;
}
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index aefcc3a318a5..340c968e1f9c 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -70,6 +70,14 @@ struct xfs_rui_log_item {
struct xfs_rui_log_format rui_format;
};
+static inline size_t
+xfs_rui_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_rui_log_item, rui_format) +
+ xfs_rui_log_format_sizeof(nr);
+}
+
/*
* This is the "rmap update done" log item. It is used to log the fact that
* some rmapbt updates mentioned in an earlier rui item have been performed.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c57c31996322..2d092f9577ca 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1782,9 +1782,8 @@ xfs_init_zones(void)
if (!xfs_rud_zone)
goto out_destroy_icreate_zone;
- xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) +
- ((XFS_RUI_MAX_FAST_EXTENTS - 1) *
- sizeof(struct xfs_map_extent))),
+ xfs_rui_zone = kmem_zone_init(
+ xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
"xfs_rui_item");
if (!xfs_rui_zone)
goto out_destroy_rud_zone;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 79cfd3fc5324..5f8d55d29a11 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -393,9 +393,15 @@ max_retries_show(
struct kobject *kobject,
char *buf)
{
+ int retries;
struct xfs_error_cfg *cfg = to_error_cfg(kobject);
- return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+ if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
+ retries = -1;
+ else
+ retries = cfg->max_retries;
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", retries);
}
static ssize_t
@@ -415,7 +421,10 @@ max_retries_store(
if (val < -1)
return -EINVAL;
- cfg->max_retries = val;
+ if (val == -1)
+ cfg->max_retries = XFS_ERR_RETRY_FOREVER;
+ else
+ cfg->max_retries = val;
return count;
}
XFS_SYSFS_ATTR_RW(max_retries);
@@ -425,10 +434,15 @@ retry_timeout_seconds_show(
struct kobject *kobject,
char *buf)
{
+ int timeout;
struct xfs_error_cfg *cfg = to_error_cfg(kobject);
- return snprintf(buf, PAGE_SIZE, "%ld\n",
- jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+ if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+ timeout = -1;
+ else
+ timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
}
static ssize_t
@@ -445,11 +459,16 @@ retry_timeout_seconds_store(
if (ret)
return ret;
- /* 1 day timeout maximum */
- if (val < 0 || val > 86400)
+ /* 1 day timeout maximum, -1 means infinite */
+ if (val < -1 || val > 86400)
return -EINVAL;
- cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+ if (val == -1)
+ cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+ else {
+ cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+ ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX);
+ }
return count;
}
XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
@@ -519,18 +538,19 @@ struct xfs_error_init {
static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
{ .name = "default",
.max_retries = XFS_ERR_RETRY_FOREVER,
- .retry_timeout = 0,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "EIO",
.max_retries = XFS_ERR_RETRY_FOREVER,
- .retry_timeout = 0,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "ENOSPC",
.max_retries = XFS_ERR_RETRY_FOREVER,
- .retry_timeout = 0,
+ .retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "ENODEV",
- .max_retries = 0,
+ .max_retries = 0, /* We can't recover from devices disappearing */
+ .retry_timeout = 0,
},
};
@@ -561,7 +581,10 @@ xfs_error_sysfs_init_class(
goto out_error;
cfg->max_retries = init[i].max_retries;
- cfg->retry_timeout = msecs_to_jiffies(
+ if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
+ cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+ else
+ cfg->retry_timeout = msecs_to_jiffies(
init[i].retry_timeout * MSEC_PER_SEC);
}
return 0;
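
The net effect of the sysfs changes is that -1 becomes a first-class value on both the store and show side of the retry knobs. A small user-space model of the retry_timeout_seconds round trip; the demo_* names and the explicit hz parameter are stand-ins for cfg->retry_timeout and the jiffies conversion helpers:

#include <stdio.h>

#define DEMO_RETRY_FOREVER	(-1)

static long demo_timeout;	/* stand-in for cfg->retry_timeout, in jiffies */

static void demo_store(int val_seconds, int hz)
{
	if (val_seconds == -1)
		demo_timeout = DEMO_RETRY_FOREVER;	/* never time out */
	else
		demo_timeout = (long)val_seconds * hz;	/* like msecs_to_jiffies(val * MSEC_PER_SEC) */
}

static int demo_show(int hz)
{
	if (demo_timeout == DEMO_RETRY_FOREVER)
		return -1;			/* report the sentinel, not a huge jiffies count */
	return (int)(demo_timeout / hz);
}

int main(void)
{
	demo_store(-1, 250);
	printf("%d\n", demo_show(250));		/* -1 */
	demo_store(300, 250);
	printf("%d\n", demo_show(250));		/* 300 */
	return 0;
}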
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 74606cba7dba..c6b2b1dcde75 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1570,14 +1570,15 @@ TRACE_EVENT(xfs_agf,
TRACE_EVENT(xfs_free_extent,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_extlen_t len, bool isfl, int haveleft, int haveright),
- TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+ xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft,
+ int haveright),
+ TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(int, isfl)
+ __field(int, resv)
__field(int, haveleft)
__field(int, haveright)
),
@@ -1586,16 +1587,16 @@ TRACE_EVENT(xfs_free_extent,
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
- __entry->isfl = isfl;
+ __entry->resv = resv;
__entry->haveleft = haveleft;
__entry->haveright = haveright;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+ TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
__entry->len,
- __entry->isfl,
+ __entry->resv,
__entry->haveleft ?
(__entry->haveright ? "both" : "left") :
(__entry->haveright ? "right" : "none"))
@@ -1622,7 +1623,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__field(short, otype)
__field(char, wasdel)
__field(char, wasfromfl)
- __field(char, isfl)
+ __field(int, resv)
__field(int, datatype)
__field(xfs_fsblock_t, firstblock)
),
@@ -1643,13 +1644,13 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->otype = args->otype;
__entry->wasdel = args->wasdel;
__entry->wasfromfl = args->wasfromfl;
- __entry->isfl = args->isfl;
+ __entry->resv = args->resv;
__entry->datatype = args->datatype;
__entry->firstblock = args->firstblock;
),
TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
"prod %u minleft %u total %u alignment %u minalignslop %u "
- "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
+ "len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
"datatype 0x%x firstblock 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
@@ -1667,7 +1668,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
__entry->wasdel,
__entry->wasfromfl,
- __entry->isfl,
+ __entry->resv,
__entry->datatype,
(unsigned long long)__entry->firstblock)
)
@@ -2585,6 +2586,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
+/* per-AG reservation */
+DECLARE_EVENT_CLASS(xfs_ag_resv_class,
+ TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
+ xfs_extlen_t len),
+ TP_ARGS(pag, resv, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, resv)
+ __field(xfs_extlen_t, freeblks)
+ __field(xfs_extlen_t, flcount)
+ __field(xfs_extlen_t, reserved)
+ __field(xfs_extlen_t, asked)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ struct xfs_ag_resv *r = xfs_perag_resv(pag, resv);
+
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->resv = resv;
+ __entry->freeblks = pag->pagf_freeblks;
+ __entry->flcount = pag->pagf_flcount;
+ __entry->reserved = r ? r->ar_reserved : 0;
+ __entry->asked = r ? r->ar_asked : 0;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->resv,
+ __entry->freeblks,
+ __entry->flcount,
+ __entry->reserved,
+ __entry->asked,
+ __entry->len)
+)
+#define DEFINE_AG_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_ag_resv_class, name, \
+ TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \
+ xfs_extlen_t len), \
+ TP_ARGS(pag, type, len))
+
+/* per-AG reservation tracepoints */
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_init);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
+
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 5f3d33d16e67..70f42ea86dfb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -217,7 +217,7 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+ xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
@@ -318,7 +318,6 @@ xfs_trans_mod_sb(
* in-core superblock's counter. This should only
* be applied to the on-disk superblock.
*/
- ASSERT(delta < 0);
tp->t_res_fdblocks_delta += delta;
if (xfs_sb_version_haslazysbcount(&mp->m_sb))
flags &= ~XFS_TRANS_SB_DIRTY;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 459ddec137a4..ab438647592a 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -79,7 +79,8 @@ xfs_trans_free_extent(
trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
- error = xfs_free_extent(tp, start_block, ext_len, oinfo);
+ error = xfs_free_extent(tp, start_block, ext_len, oinfo,
+ XFS_AG_RESV_NONE);
/*
* Mark the transaction dirty, even on error. This ensures the
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index ea62245fee26..62900938f26d 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -147,6 +147,7 @@ __xfs_xattr_put_listent(
arraytop = context->count + prefix_len + namelen + 1;
if (arraytop > context->firstu) {
context->count = -1; /* insufficient space */
+ context->seen_enough = 1;
return 0;
}
offset = (char *)context->alist + context->count;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9c6dc7704043..add6c4bc568f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -6,13 +6,19 @@
#include <linux/radix-tree.h>
#include <asm/pgtable.h>
+struct iomap_ops;
+
/* We use lowest available exceptional entry bit for locking */
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+ struct iomap_ops *ops);
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
get_block_t, dio_iodone_t, int flags);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ struct iomap_ops *ops);
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3d70ece10313..e63e288dee83 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -22,6 +22,8 @@ struct vm_fault;
* Flags for iomap mappings:
*/
#define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */
+#define IOMAP_F_SHARED 0x02 /* block shared with another file */
+#define IOMAP_F_NEW 0x04 /* blocks have been newly allocated */
/*
* Magic value for blkno:
@@ -64,6 +66,8 @@ struct iomap_ops {
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
struct iomap_ops *ops);
+int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+ struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, struct iomap_ops *ops);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
diff --git a/mm/filemap.c b/mm/filemap.c
index 8a287dfc5372..2f1175e86c77 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1910,16 +1910,18 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ struct iov_iter data = *iter;
loff_t size;
size = i_size_read(inode);
retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1);
- if (!retval) {
- struct iov_iter data = *iter;
- retval = mapping->a_ops->direct_IO(iocb, &data);
- }
+ if (retval < 0)
+ goto out;
+ file_accessed(file);
+
+ retval = mapping->a_ops->direct_IO(iocb, &data);
if (retval > 0) {
iocb->ki_pos += retval;
iov_iter_advance(iter, retval);
@@ -1935,10 +1937,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* DAX files, so don't bother trying.
*/
if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size ||
- IS_DAX(inode)) {
- file_accessed(file);
+ IS_DAX(inode))
goto out;
- }
}
retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);