From 8625147cafaa9ba74713d682f5185eb62cb2aedb Mon Sep 17 00:00:00 2001 From: James Houghton Date: Tue, 18 Oct 2022 20:01:25 +0000 Subject: hugetlbfs: don't delete error page from pagecache This change is very similar to the change that was made for shmem [1], and it solves the same problem but for HugeTLBFS instead. Currently, when poison is found in a HugeTLB page, the page is removed from the page cache. That means that attempting to map or read that hugepage in the future will result in a new hugepage being allocated instead of notifying the user that the page was poisoned. As [1] states, this is effectively memory corruption. The fix is to leave the page in the page cache. If the user attempts to use a poisoned HugeTLB page with a syscall, the syscall will fail with EIO, the same error code that shmem uses. For attempts to map the page, the thread will get a BUS_MCEERR_AR SIGBUS. [1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens") Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com Signed-off-by: James Houghton Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Tested-by: Naoya Horiguchi Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: James Houghton Cc: Miaohe Lin Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dd54f67e47fd..df7772335dc0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + retval = -EIO; + break; + } + /* * We have the page, copy it to user space buffer. */ @@ -1111,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, static int hugetlbfs_error_remove_page(struct address_space *mapping, struct page *page) { - struct inode *inode = mapping->host; - pgoff_t index = page->index; - - hugetlb_delete_from_page_cache(page); - if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - return 0; } -- cgit v1.2.3 From 8ac932a4921a96ca52f61935dbba64ea87bbd5dc Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 29 Oct 2022 13:49:12 +0900 Subject: nilfs2: fix deadlock in nilfs_count_free_blocks() A semaphore deadlock can occur if nilfs_get_block() detects metadata corruption while locating data blocks and a superblock writeback occurs at the same time: task 1 task 2 ------ ------ * A file operation * nilfs_truncate() nilfs_get_block() down_read(rwsem A) <-- nilfs_bmap_lookup_contig() ... generic_shutdown_super() nilfs_put_super() * Prepare to write superblock * down_write(rwsem B) <-- nilfs_cleanup_super() * Detect b-tree corruption * nilfs_set_log_cursor() nilfs_bmap_convert_error() nilfs_count_free_blocks() __nilfs_error() down_read(rwsem A) <-- nilfs_set_error() down_write(rwsem B) <-- *** DEADLOCK *** Here, nilfs_get_block() readlocks rwsem A (= NILFS_MDT(dat_inode)->mi_sem) and then calls nilfs_bmap_lookup_contig(), but if it fails due to metadata corruption, __nilfs_error() is called from nilfs_bmap_convert_error() inside the lock section. Since __nilfs_error() calls nilfs_set_error() unless the filesystem is read-only and nilfs_set_error() attempts to writelock rwsem B (= nilfs->ns_sem) to write back superblock exclusively, hierarchical lock acquisition occurs in the order rwsem A -> rwsem B. Now, if another task starts updating the superblock, it may writelock rwsem B during the lock sequence above, and can deadlock trying to readlock rwsem A in nilfs_count_free_blocks(). However, there is actually no need to take rwsem A in nilfs_count_free_blocks() because it, within the lock section, only reads a single integer data on a shared struct with nilfs_sufile_get_ncleansegs(). This has been the case after commit aa474a220180 ("nilfs2: add local variable to cache the number of clean segments"), that is, even before this bug was introduced. So, this resolves the deadlock problem by just not taking the semaphore in nilfs_count_free_blocks(). Link: https://lkml.kernel.org/r/20221029044912.9139-1-konishi.ryusuke@gmail.com Fixes: e828949e5b42 ("nilfs2: call nilfs_error inside bmap routines") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+45d6ce7b7ad7ef455d03@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi Cc: [2.6.38+ Signed-off-by: Andrew Morton --- fs/nilfs2/the_nilfs.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 3b4a079c9617..c8b89b4f94e0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -690,9 +690,7 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { unsigned long ncleansegs; - down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); - up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; return 0; } -- cgit v1.2.3 From 8cccf05fe857a18ee26e20d11a8455a73ffd4efd Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 4 Nov 2022 23:29:59 +0900 Subject: nilfs2: fix use-after-free bug of ns_writer on remount If a nilfs2 filesystem is downgraded to read-only due to metadata corruption on disk and is remounted read/write, or if emergency read-only remount is performed, detaching a log writer and synchronizing the filesystem can be done at the same time. In these cases, use-after-free of the log writer (hereinafter nilfs->ns_writer) can happen as shown in the scenario below: Task1 Task2 -------------------------------- ------------------------------ nilfs_construct_segment nilfs_segctor_sync init_wait init_waitqueue_entry add_wait_queue schedule nilfs_remount (R/W remount case) nilfs_attach_log_writer nilfs_detach_log_writer nilfs_segctor_destroy kfree finish_wait _raw_spin_lock_irqsave __raw_spin_lock_irqsave do_raw_spin_lock debug_spin_lock_before <-- use-after-free While Task1 is sleeping, nilfs->ns_writer is freed by Task2. After Task1 waked up, Task1 accesses nilfs->ns_writer which is already freed. This scenario diagram is based on the Shigeru Yoshida's post [1]. This patch fixes the issue by not detaching nilfs->ns_writer on remount so that this UAF race doesn't happen. Along with this change, this patch also inserts a few necessary read-only checks with superblock instance where only the ns_writer pointer was used to check if the filesystem is read-only. Link: https://syzkaller.appspot.com/bug?id=79a4c002e960419ca173d55e863bd09e8112df8b Link: https://lkml.kernel.org/r/20221103141759.1836312-1-syoshida@redhat.com [1] Link: https://lkml.kernel.org/r/20221104142959.28296-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+f816fa82f8783f7a02bb@syzkaller.appspotmail.com Reported-by: Shigeru Yoshida Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 15 ++++++++------- fs/nilfs2/super.c | 2 -- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index b4cebad21b48..3335ef352915 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -317,7 +317,7 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; - if (!sci || !sci->sc_flush_request) + if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request) return; set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); @@ -2242,7 +2242,7 @@ int nilfs_construct_segment(struct super_block *sb) struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; - if (!sci) + if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; /* A call inside transactions causes a deadlock. */ @@ -2280,7 +2280,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, struct nilfs_transaction_info ti; int err = 0; - if (!sci) + if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; nilfs_transaction_lock(sb, &ti, 0); @@ -2776,11 +2776,12 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) if (nilfs->ns_writer) { /* - * This happens if the filesystem was remounted - * read/write after nilfs_error degenerated it into a - * read-only mount. + * This happens if the filesystem is made read-only by + * __nilfs_error or nilfs_remount and then remounted + * read/write. In these cases, reuse the existing + * writer. */ - nilfs_detach_log_writer(sb); + return 0; } nilfs->ns_writer = nilfs_segctor_new(sb, root); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index ba108f915391..6edb6e0dd61f 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1133,8 +1133,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; if (*flags & SB_RDONLY) { - /* Shutting down log writer */ - nilfs_detach_log_writer(sb); sb->s_flags |= SB_RDONLY; /* -- cgit v1.2.3 From 82e60d00b753bb5cfecce22b8e952436b14d02a3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Nov 2022 17:34:31 -0400 Subject: fs: fix leaked psi pressure state When psi annotations were added to to btrfs compression reads, the psi state tracking over add_ra_bio_pages and btrfs_submit_compressed_read was faulty. A pressure state, once entered, is never left. This results in incorrectly elevated pressure, which triggers OOM kills. pflags record the *previous* memstall state when we enter a new one. The code tried to initialize pflags to 1, and then optimize the leave call when we either didn't enter a memstall, or were already inside a nested stall. However, there can be multiple PageWorkingset pages in the bio, at which point it's that path itself that enters repeatedly and overwrites pflags. This causes us to miss the exit. Enter the stall only once if needed, then unwind correctly. erofs has the same problem, fix that up too. And move the memstall exit past submit_bio() to restore submit accounting originally added by b8e24a9300b0 ("block: annotate refault stalls from IO submission"). Link: https://lkml.kernel.org/r/Y2UHRqthNUwuIQGS@cmpxchg.org Fixes: 4088a47e78f9 ("btrfs: add manual PSI accounting for compressed reads") Fixes: 99486c511f68 ("erofs: add manual PSI accounting for the compressed address space") Fixes: 118f3663fbc6 ("block: remove PSI accounting from the bio layer") Link: https://lore.kernel.org/r/d20a0a85-e415-cf78-27f9-77dd7a94bc8d@leemhuis.info/ Signed-off-by: Johannes Weiner Reported-by: Thorsten Leemhuis Tested-by: Thorsten Leemhuis Cc: Chao Yu Cc: Chris Mason Cc: Christoph Hellwig Cc: David Sterba Cc: Gao Xiang Cc: Jens Axboe Cc: Josef Bacik Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- fs/btrfs/compression.c | 14 ++++++++------ fs/erofs/zdata.c | 18 +++++++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f1f051ad3147..e6635fe70067 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -512,7 +512,7 @@ static u64 bio_end_offset(struct bio *bio) static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, struct compressed_bio *cb, - unsigned long *pflags) + int *memstall, unsigned long *pflags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; @@ -581,8 +581,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - if (PageWorkingset(page)) + if (!*memstall && PageWorkingset(page)) { psi_memstall_enter(pflags); + *memstall = 1; + } ret = set_page_extent_mapped(page); if (ret < 0) { @@ -670,8 +672,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - /* Initialize to 1 to make skip psi_memstall_leave unless needed */ - unsigned long pflags = 1; + unsigned long pflags; + int memstall = 0; blk_status_t ret; int ret2; int i; @@ -727,7 +729,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto fail; } - add_ra_bio_pages(inode, em_start + em_len, cb, &pflags); + add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -807,7 +809,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } } - if (!pflags) + if (memstall) psi_memstall_leave(&pflags); if (refcount_dec_and_test(&cb->pending_ios)) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c7f24fc7efd5..064a166324a7 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1412,8 +1412,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; - /* initialize to 1 to make skip psi_memstall_leave unless needed */ - unsigned long pflags = 1; + unsigned long pflags; + int memstall = 0; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1463,14 +1463,18 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, if (bio && (cur != last_index + 1 || last_bdev != mdev.m_bdev)) { submit_bio_retry: - if (!pflags) - psi_memstall_leave(&pflags); submit_bio(bio); + if (memstall) { + psi_memstall_leave(&pflags); + memstall = 0; + } bio = NULL; } - if (unlikely(PageWorkingset(page))) + if (unlikely(PageWorkingset(page)) && !memstall) { psi_memstall_enter(&pflags); + memstall = 1; + } if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, @@ -1500,9 +1504,9 @@ submit_bio_retry: } while (owned_head != Z_EROFS_PCLUSTER_TAIL); if (bio) { - if (!pflags) - psi_memstall_leave(&pflags); submit_bio(bio); + if (memstall) + psi_memstall_leave(&pflags); } /* -- cgit v1.2.3