summaryrefslogtreecommitdiffstats
path: root/fs/iomap/buffered-io.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-01-06 10:26:39 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-01-06 10:26:39 -0800
commitfbfd64d25c7af3b8695201ebc85efe90be28c5a3 (patch)
tree4d204ecce2cea0fdae4b5785fe9d9f03da672df4 /fs/iomap/buffered-io.c
parent13563da6ffcf49b8b45772e40b35f96926a7ee1e (diff)
parent368fcc5d3f8bf645a630a44e65f5eb008aba7082 (diff)
downloadlinux-fbfd64d25c7af3b8695201ebc85efe90be28c5a3.tar.gz
linux-fbfd64d25c7af3b8695201ebc85efe90be28c5a3.tar.bz2
linux-fbfd64d25c7af3b8695201ebc85efe90be28c5a3.zip
Merge tag 'vfs-6.13-rc7.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs fixes from Christian Brauner: - Relax assertions on failure to encode file handles The ->encode_fh() method can fail for various reasons. None of them warrant a WARN_ON(). - Fix overlayfs file handle encoding by allowing encoding an fid from an inode without an alias - Make sure fuse_dir_open() handles FOPEN_KEEP_CACHE. If it's not specified fuse needs to invaludate the directory inode page cache - Fix qnx6 so it builds with gcc-15 - Various fixes for netfslib and ceph and nfs filesystems: - Ignore silly rename files from afs and nfs when building header archives - Fix read result collection in netfslib with multiple subrequests - Handle ENOMEM for netfslib buffered reads - Fix oops in nfs_netfs_init_request() - Parse the secctx command immediately in cachefiles - Remove a redundant smp_rmb() in netfslib - Handle recursion in read retry in netfslib - Fix clearing of folio_queue - Fix missing cancellation of copy-to_cache when the cache for a file is temporarly disabled in netfslib - Sanity check the hfs root record - Fix zero padding data issues in concurrent write scenarios - Fix is_mnt_ns_file() after converting nsfs to path_from_stashed() - Fix missing declaration of init_files - Increase I/O priority when writing revoke records in jbd2 - Flush filesystem device before updating tail sequence in jbd2 * tag 'vfs-6.13-rc7.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (23 commits) ovl: support encoding fid from inode with no alias ovl: pass realinode to ovl_encode_real_fh() instead of realdentry fuse: respect FOPEN_KEEP_CACHE on opendir netfs: Fix is-caching check in read-retry netfs: Fix the (non-)cancellation of copy when cache is temporarily disabled netfs: Fix ceph copy to cache on write-begin netfs: Work around recursion by abandoning retry if nothing read netfs: Fix missing barriers by using clear_and_wake_up_bit() netfs: Remove redundant use of smp_rmb() cachefiles: Parse the "secctx" immediately nfs: Fix oops in nfs_netfs_init_request() when copying to cache netfs: Fix enomem handling in buffered reads netfs: Fix non-contiguous donation between completed reads kheaders: Ignore silly-rename files fs: relax assertions on failure to encode file handles fs: fix missing declaration of init_files fs: fix is_mnt_ns_file() iomap: fix zero padding data issue in concurrent append writes iomap: pass byte granular end position to iomap_add_to_ioend jbd2: flush filesystem device before updating tail sequence ...
Diffstat (limited to 'fs/iomap/buffered-io.c')
-rw-r--r--fs/iomap/buffered-io.c66
1 files changed, 57 insertions, 9 deletions
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 955f19e27e47..54dc27d92781 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1774,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
*/
static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, loff_t pos, unsigned len)
+ struct inode *inode, loff_t pos, loff_t end_pos,
+ unsigned len)
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
@@ -1793,15 +1794,60 @@ new_ioend:
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
+
+ /*
+ * Clamp io_offset and io_size to the incore EOF so that ondisk
+ * file size updates in the ioend completion are byte-accurate.
+ * This avoids recovering files with zeroed tail regions when
+ * writeback races with appending writes:
+ *
+ * Thread 1: Thread 2:
+ * ------------ -----------
+ * write [A, A+B]
+ * update inode size to A+B
+ * submit I/O [A, A+BS]
+ * write [A+B, A+B+C]
+ * update inode size to A+B+C
+ * <I/O completes, updates disk size to min(A+B+C, A+BS)>
+ * <power failure>
+ *
+ * After reboot:
+ * 1) with A+B+C < A+BS, the file has zero padding in range
+ * [A+B, A+B+C]
+ *
+ * |< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|
+ * ^ ^ ^
+ * A A+B A+B+C
+ * (EOF)
+ *
+ * 2) with A+B+C > A+BS, the file has zero padding in range
+ * [A+B, A+BS]
+ *
+ * |< Block Size (BS) >|< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+ * ^ ^ ^ ^
+ * A A+B A+BS A+B+C
+ * (EOF)
+ *
+ * D = Valid Data
+ * 0 = Zero Padding
+ *
+ * Note that this defeats the ability to chain the ioends of
+ * appending writes.
+ */
wpc->ioend->io_size += len;
+ if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
+ wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+
wbc_account_cgroup_owner(wbc, folio, len);
return 0;
}
static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, u64 pos, unsigned dirty_len,
- unsigned *count)
+ struct inode *inode, u64 pos, u64 end_pos,
+ unsigned dirty_len, unsigned *count)
{
int error;
@@ -1826,7 +1872,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
break;
default:
error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
- map_len);
+ end_pos, map_len);
if (!error)
(*count)++;
break;
@@ -1897,11 +1943,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
* remaining memory is zeroed when mapped, and writes to that
* region are not written out to the file.
*
- * Also adjust the writeback range to skip all blocks entirely
- * beyond i_size.
+ * Also adjust the end_pos to the end of file and skip writeback
+ * for all blocks entirely beyond i_size.
*/
folio_zero_segment(folio, poff, folio_size(folio));
- *end_pos = round_up(isize, i_blocksize(inode));
+ *end_pos = isize;
}
return true;
@@ -1914,6 +1960,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct inode *inode = folio->mapping->host;
u64 pos = folio_pos(folio);
u64 end_pos = pos + folio_size(folio);
+ u64 end_aligned = 0;
unsigned count = 0;
int error = 0;
u32 rlen;
@@ -1955,9 +2002,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
/*
* Walk through the folio to find dirty areas to write back.
*/
- while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+ end_aligned = round_up(end_pos, i_blocksize(inode));
+ while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
- pos, rlen, &count);
+ pos, end_pos, rlen, &count);
if (error)
break;
pos += rlen;