From 052a82d85a3b3eee6a386be2ba3b82278cf277ce Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 22 Aug 2019 20:17:56 +0800 Subject: f2fs: fix to writeout dirty inode during node flush As Eric reported: On xfstest generic/204 on f2fs, I'm getting a kernel BUG. allocate_segment_by_default+0x9d/0x100 [f2fs] f2fs_allocate_data_block+0x3c0/0x5c0 [f2fs] do_write_page+0x62/0x110 [f2fs] f2fs_do_write_node_page+0x2b/0xa0 [f2fs] __write_node_page+0x2ec/0x590 [f2fs] f2fs_sync_node_pages+0x756/0x7e0 [f2fs] block_operations+0x25b/0x350 [f2fs] f2fs_write_checkpoint+0x104/0x1150 [f2fs] f2fs_sync_fs+0xa2/0x120 [f2fs] f2fs_balance_fs_bg+0x33c/0x390 [f2fs] f2fs_write_node_pages+0x4c/0x1f0 [f2fs] do_writepages+0x1c/0x70 __writeback_single_inode+0x45/0x320 writeback_sb_inodes+0x273/0x5c0 wb_writeback+0xff/0x2e0 wb_workfn+0xa1/0x370 process_one_work+0x138/0x350 worker_thread+0x4d/0x3d0 kthread+0x109/0x140 The root cause of this issue is, in a very small partition, e.g. in generic/204 testcase of fstest suit, filesystem's free space is 50MB, so at most we can write 12800 inline inode with command: `echo XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX > $SCRATCH_MNT/$i`, then filesystem will have: - 12800 dirty inline data page - 12800 dirty inode page - and 12800 dirty imeta (dirty inode) When we flush node-inode's page cache, we can also flush inline data with each inode page, however it will run out-of-free-space in device, then once it triggers checkpoint, there is no room for huge number of imeta, at this time, GC is useless, as there is no dirty segment at all. In order to fix this, we try to recognize inode page during node_inode's page flushing, and update inode page from dirty inode, so that later another imeta (dirty inode) flush can be avoided. Reported-and-tested-by: Eric Biggers Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d9ba1db2d01e..e5044eec8097 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1762,6 +1762,47 @@ out: return ret ? -EIO: 0; } +static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool clean; + + if (inode->i_ino != ino) + return 0; + + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) + return 0; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + clean = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (clean) + return 0; + + inode = igrab(inode); + if (!inode) + return 0; + return 1; +} + +static bool flush_dirty_inode(struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct inode *inode; + nid_t ino = ino_of_node(page); + + inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); + if (!inode) + return false; + + f2fs_update_inode(inode, page); + unlock_page(page); + + iput(inode); + return true; +} + int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) @@ -1785,6 +1826,7 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; bool submitted = false; + bool may_dirty = true; /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[NODE]) && @@ -1832,6 +1874,13 @@ continue_unlock: goto lock_node; } + /* flush dirty inode */ + if (IS_INODE(page) && may_dirty) { + may_dirty = false; + if (flush_dirty_inode(page)) + goto lock_node; + } + f2fs_wait_on_page_writeback(page, NODE, true, true); if (!clear_page_dirty_for_io(page)) -- cgit v1.2.3