From 64c23e86873ee410554d6d1c76b60da47025e96f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jan 2011 13:07:30 +0100 Subject: make the feature checks in ->fallocate future proof Instead of various home grown checks that might need updates for new flags just check for any bit outside the mask of the features supported by the filesystem. This makes the check future proof for any newly added flag. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c4068f6abf03..4bdd160854eb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3645,7 +3645,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) unsigned int credits, blkbits = inode->i_blkbits; /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode && (mode != FALLOC_FL_KEEP_SIZE)) + if (mode & ~FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; /* -- cgit v1.2.3 From 2fe17c1075836b66678ed2a305fd09b6773883aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jan 2011 13:07:43 +0100 Subject: fallocate should be a file operation Currently all filesystems except XFS implement fallocate asynchronously, while XFS forced a commit. Both of these are suboptimal - in case of O_SYNC I/O we really want our allocation on disk, especially for the !KEEP_SIZE case where we actually grow the file with user-visible zeroes. On the other hand always commiting the transaction is a bad idea for fast-path uses of fallocate like for example in recent Samba versions. Given that block allocation is a data plane operation anyway change it from an inode operation to a file operation so that we have the file structure available that lets us check for O_SYNC. This also includes moving the code around for a few of the filesystems, and remove the already unnedded S_ISDIR checks given that we only wire up fallocate for regular files. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ext4/extents.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4bdd160854eb..63a75810b7c3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3627,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode, } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -3655,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because -- cgit v1.2.3 From e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 12 Feb 2011 08:17:34 -0500 Subject: ext4: serialize unaligned asynchronous DIO ext4 has a data corruption case when doing non-block-aligned asynchronous direct IO into a sparse file, as demonstrated by xfstest 240. The root cause is that while ext4 preallocates space in the hole, mappings of that space still look "new" and dio_zero_block() will zero out the unwritten portions. When more than one AIO thread is going, they both find this "new" block and race to zero out their portion; this is uncoordinated and causes data corruption. Dave Chinner fixed this for xfs by simply serializing all unaligned asynchronous direct IO. I've done the same here. The difference is that we only wait on conversions, not all IO. This is a very big hammer, and I'm not very pleased with stuffing this into ext4_file_write(). But since ext4 is DIO_LOCKING, we need to serialize it at this high level. I tried to move this into ext4_ext_direct_IO, but by then we have the i_mutex already, and we will wait on the work queue to do conversions - which must also take the i_mutex. So that won't work. This was originally exposed by qemu-kvm installing to a raw disk image with a normal sector-63 alignment. I've tested a backport of this patch with qemu, and it does avoid the corruption. It is also quite a lot slower (14 min for package installs, vs. 8 min for well-aligned) but I'll take slow correctness over fast corruption any day. Mingming suggested that we can track outstanding conversions, and wait on those so that non-sparse files won't be affected, and I've implemented that here; unaligned AIO to nonsparse files won't take a perf hit. [tytso@mit.edu: Keep the mutex as a hashed array instead of bloating the ext4 inode] [tytso@mit.edu: Fix up namespace issues so that global variables are protected with an "ext4_" prefix.] Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 63a75810b7c3..ccce8a7e94ed 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to convertion to written when IO is * completed */ - if (io) + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { io->flag = EXT4_IO_END_UNWRITTEN; - else + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; @@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * that we need to perform convertion when IO is done. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { io->flag = EXT4_IO_END_UNWRITTEN; - else + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } -- cgit v1.2.3