From 984a608377cb623351b8a3670b285f32ebeb2d32 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 20 Oct 2022 13:56:19 -0400 Subject: mm/kmemleak: prevent soft lockup in kmemleak_scan()'s object iteration loops Commit 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") adds cond_resched() in the first object iteration loop of kmemleak_scan(). However, it turns that the 2nd objection iteration loop can still cause soft lockup to happen in some cases. So add a cond_resched() call in the 2nd and 3rd loops as well to prevent that and for completeness. Link: https://lkml.kernel.org/r/20221020175619.366317-1-longman@redhat.com Fixes: 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") Signed-off-by: Waiman Long Cc: Catalin Marinas Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/kmemleak.c | 61 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 37af2dc8dac9..646e2979641f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1460,6 +1460,27 @@ static void scan_gray_list(void) WARN_ON(!list_empty(&gray_list)); } +/* + * Conditionally call resched() in a object iteration loop while making sure + * that the given object won't go away without RCU read lock by performing a + * get_object() if !pinned. + * + * Return: false if can't do a cond_resched() due to get_object() failure + * true otherwise + */ +static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +{ + if (!pinned && !get_object(object)) + return false; + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (!pinned) + put_object(object); + return true; +} + /* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the @@ -1471,7 +1492,7 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop1_cnt = 0; + int loop_cnt = 0; jiffies_last_scan = jiffies; @@ -1480,7 +1501,6 @@ static void kmemleak_scan(void) list_for_each_entry_rcu(object, &object_list, object_list) { bool obj_pinned = false; - loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1514,24 +1534,11 @@ static void kmemleak_scan(void) raw_spin_unlock_irq(&object->lock); /* - * Do a cond_resched() to avoid soft lockup every 64k objects. - * Make sure a reference has been taken so that the object - * won't go away without RCU read lock. + * Do a cond_resched() every 64k objects to avoid soft lockup. */ - if (!(loop1_cnt & 0xffff)) { - if (!obj_pinned && !get_object(object)) { - /* Try the next object instead */ - loop1_cnt--; - continue; - } - - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - - if (!obj_pinned) - put_object(object); - } + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, obj_pinned)) + loop_cnt--; /* Try again on next object */ } rcu_read_unlock(); @@ -1598,7 +1605,15 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in @@ -1632,7 +1647,15 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in -- cgit v1.2.3 From 27d676a1c2010450d00d514a8a6c1c780cb8d77f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 20 Oct 2022 09:51:22 +0800 Subject: memory tier, sysfs: rename attribute "nodes" to "nodelist" In sysfs, we use attribute name "cpumap" or "cpus" for cpu mask and "cpulist" or "cpus_list" for cpu list. For example, in my system, $ cat /sys/devices/system/node/node0/cpumap f,ffffffff $ cat /sys/devices/system/cpu/cpu2/topology/core_cpus 0,00100004 $ cat cat /sys/devices/system/node/node0/cpulist 0-35 $ cat /sys/devices/system/cpu/cpu2/topology/core_cpus_list 2,20 It looks reasonable to use "nodemap" for node mask and "nodelist" for node list. So, rename the attribute to follow the naming convention. Link: https://lkml.kernel.org/r/20221020015122.290097-1-ying.huang@intel.com Fixes: 9832fb87834e2b ("mm/demotion: expose memory tier details via sysfs") Signed-off-by: "Huang, Ying" Acked-by: Wei Xu Reviewed-by: Aneesh Kumar K.V Reviewed-by: Yang Shi Reviewed-by: Davidlohr Bueso Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index f116b7b6333e..fa8c9d07f9ce 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev) kfree(tier); } -static ssize_t nodes_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; @@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev, mutex_unlock(&memory_tier_lock); return ret; } -static DEVICE_ATTR_RO(nodes); +static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { - &dev_attr_nodes.attr, + &dev_attr_nodelist.attr, NULL }; -- cgit v1.2.3 From fba4eaf93164a6a6eb3cc12a3391b06f6187aa20 Mon Sep 17 00:00:00 2001 From: Maria Yu Date: Fri, 21 Oct 2022 18:15:55 +0800 Subject: mm/page_isolation: fix clang deadcode warning When !CONFIG_VM_BUG_ON, there is warning of clang-analyzer-deadcode.DeadStores: Value stored to 'mt' during its initialization is never read. Link: https://lkml.kernel.org/r/20221021101555.7992-2-quic_aiquny@quicinc.com Signed-off-by: Maria Yu Cc: David Hildenbrand Cc: Doug Berger Cc: Mike Kravetz Cc: Zi Yan Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04141a9bea70..47fbc1696466 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); VM_BUG_ON(!is_migrate_isolate(mt)); } else { -- cgit v1.2.3 From 8ebe0a5eaaeb099de03d09ad20f54ed962e2261e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 21 Oct 2022 19:28:05 -0400 Subject: mm,madvise,hugetlb: fix unexpected data loss with MADV_DONTNEED on hugetlbfs A common use case for hugetlbfs is for the application to create memory pools backed by huge pages, which then get handed over to some malloc library (eg. jemalloc) for further management. That malloc library may be doing MADV_DONTNEED calls on memory that is no longer needed, expecting those calls to happen on PAGE_SIZE boundaries. However, currently the MADV_DONTNEED code rounds up any such requests to HPAGE_PMD_SIZE boundaries. This leads to undesired outcomes when jemalloc expects a 4kB MADV_DONTNEED, but 2MB of memory get zeroed out, instead. Use of pre-built shared libraries means that user code does not always know the page size of every memory arena in use. Avoid unexpected data loss with MADV_DONTNEED by rounding up only to PAGE_SIZE (in do_madvise), and rounding down to huge page granularity. That way programs will only get as much memory zeroed out as they requested. Link: https://lkml.kernel.org/r/20221021192805.366ad573@imladris.surriel.com Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings") Signed-off-by: Rik van Riel Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/madvise.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 2baa93ca2310..c7105ec6d08c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -813,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (start & ~huge_page_mask(hstate_vma(vma))) return false; - *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed. + */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + return true; } @@ -828,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; + if (start == end) + return 0; + if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ -- cgit v1.2.3 From 5aae9265ee1a30cf716d6caf6b29fe99b9d55130 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 22 Oct 2022 00:51:06 -0700 Subject: mm: prep_compound_tail() clear page->private Although page allocation always clears page->private in the first page or head page of an allocation, it has never made a point of clearing page->private in the tails (though 0 is often what is already there). But now commit 71e2d666ef85 ("mm/huge_memory: do not clobber swp_entry_t during THP split") issues a warning when page_tail->private is found to be non-0 (unless it's swapcache). Change that warning to dump page_tail (which also dumps head), instead of just the head: so far we have seen dead000000000122, dead000000000003, dead000000000001 or 0000000000000002 in the raw output for tail private. We could just delete the warning, but today's consensus appears to want page->private to be 0, unless there's a good reason for it to be set: so now clear it in prep_compound_tail() (more general than just for THP; but not for high order allocation, which makes no pass down the tails). Link: https://lkml.kernel.org/r/1c4233bb-4e4d-5969-fbd4-96604268a285@google.com Fixes: 71e2d666ef85 ("mm/huge_memory: do not clobber swp_entry_t during THP split") Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/page_alloc.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03fc7e5edf07..561a42567477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2462,7 +2462,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * Fix up and warn once if private is unexpectedly set. */ if (!folio_test_swapcache(page_folio(head))) { - VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, head); + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); page_tail->private = 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5a6c815ae28..218b28ee49ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) -- cgit v1.2.3 From 03e5f82ea632af329e32ec03d952b2d99497eeaa Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 24 Oct 2022 16:34:21 +0800 Subject: mm: migrate: fix return value if all subpages of THPs are migrated successfully During THP migration, if THPs are not migrated but they are split and all subpages are migrated successfully, migrate_pages() will still return the number of THP pages that were not migrated. This will confuse the callers of migrate_pages(). For example, the longterm pinning will failed though all pages are migrated successfully. Thus we should return 0 to indicate that all pages are migrated in this case Link: https://lkml.kernel.org/r/de386aa864be9158d2f3b344091419ea7c38b2f7.1666599848.git.baolin.wang@linux.alibaba.com Fixes: b5bade978e9b ("mm: migrate: fix the return value of migrate_pages()") Signed-off-by: Baolin Wang Reviewed-by: Alistair Popple Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 1379e1912772..dff333593a8a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1582,6 +1582,13 @@ out: */ list_splice(&ret_pages, from); + /* + * Return 0 in case all subpages of fail-to-migrate THPs are + * migrated successfully. + */ + if (list_empty(from)) + rc = 0; + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); -- cgit v1.2.3 From f59a3ee6912997fc56ecee78613fef53aae668d9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:40 +0200 Subject: mm: kmsan: export kmsan_copy_page_meta() Certain modules call copy_user_highpage(), which calls kmsan_copy_page_meta() under KMSAN, so we need to export the latter. Link: https://lkml.kernel.org/r/20221024212144.2852069-1-glider@google.com Link: https://github.com/google/kmsan/issues/89 Fixes: b073d7f8aee4 ("mm: kmsan: maintain KMSAN metadata for page operations") Signed-off-by: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kmsan/shadow.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 21e3e196ec3c..a787c04e9583 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src) __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); kmsan_leave_runtime(); } +EXPORT_SYMBOL(kmsan_copy_page_meta); void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { -- cgit v1.2.3 From 78a498c3a227f2ac773a8234b2ce092a4403f2c3 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:44 +0200 Subject: x86: fortify: kmsan: fix KMSAN fortify builds Ensure that KMSAN builds replace memset/memcpy/memmove calls with the respective __msan_XXX functions, and that none of the macros are redefined twice. This should allow building kernel with both CONFIG_KMSAN and CONFIG_FORTIFY_SOURCE. Link: https://lkml.kernel.org/r/20221024212144.2852069-5-glider@google.com Link: https://github.com/google/kmsan/issues/89 Signed-off-by: Alexander Potapenko Reported-by: Tamas K Lengyel Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Kees Cook Signed-off-by: Andrew Morton --- mm/kmsan/instrumentation.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 280d15413268..271f135f97a1 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -14,6 +14,7 @@ #include "kmsan.h" #include +#include #include #include -- cgit v1.2.3 From 5521de7dddd211e3a9403d7bde0b614fd0936ac6 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Sun, 23 Oct 2022 21:34:52 -0700 Subject: mm/userfaultfd: replace kmap/kmap_atomic() with kmap_local_page() kmap() and kmap_atomic() are being deprecated in favor of kmap_local_page() which is appropriate for any thread local context.[1] A recent locking bug report with userfaultfd showed that the conversion of the kmap_atomic()'s in those code flows requires care with regard to the prevention of deadlock.[2] git archaeology implied that the recursion may not be an actual bug.[3] However, depending on the implementation of the mmap_lock and the condition of the call there may still be a deadlock.[4] So this is not purely a lockdep issue. Considering a single threaded call stack there are 3 options. 1) Different mm's are in play (no issue) 2) Readlock implementation is recursive and same mm is in play (no issue) 3) Readlock implementation is _not_ recursive (issue) The mmap_lock is recursive so with a single thread there is no issue. However, Matthew pointed out a deadlock scenario when you consider additional process' and threads thusly. "The readlock implementation is only recursive if nobody else has taken a write lock. If you have a multithreaded process, one of the other threads can call mmap() and that will prevent recursion (due to fairness). Even if it's a different process that you're trying to acquire the mmap read lock on, you can still get into a deadly embrace. eg: process A thread 1 takes read lock on own mmap_lock process A thread 2 calls mmap, blocks taking write lock process B thread 1 takes page fault, read lock on own mmap lock process B thread 2 calls mmap, blocks taking write lock process A thread 1 blocks taking read lock on process B process B thread 1 blocks taking read lock on process A Now all four threads are blocked waiting for each other." Regardless using pagefault_disable() ensures that no matter what locking implementation is used a deadlock will not occur. Complete kmap conversion in userfaultfd by replacing the kmap() and kmap_atomic() calls with kmap_local_page(). When replacing the kmap_atomic() call ensure page faults continue to be disabled to support the correct fall back behavior and add a comment to inform future souls of the requirement. [1] https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com/ [2] https://lore.kernel.org/all/Y1Mh2S7fUGQ%2FiKFR@iweiny-desk3/ [3] https://lore.kernel.org/all/Y1MymJ%2FINb45AdaY@iweiny-desk3/ [4] https://lore.kernel.org/lkml/Y1bXBtGTCym77%2FoD@casper.infradead.org/ [ira.weiny@intel.com: v2] Link: https://lkml.kernel.org/r/20221025220136.2366143-1-ira.weiny@intel.com Link: https://lkml.kernel.org/r/20221024043452.1491677-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Matthew Wilcox Cc: Andrew Morton Cc: Andrea Arcangeli Cc: Peter Xu Cc: Axel Rasmussen Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e24e8a47ce8a..3d0fef3980b3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -646,11 +663,11 @@ retry: mmap_read_unlock(dst_mm); BUG_ON(!page); - page_kaddr = kmap(page); + page_kaddr = kmap_local_page(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap(page); + kunmap_local(page_kaddr); if (unlikely(err)) { err = -EFAULT; goto out; -- cgit v1.2.3 From 5dc21f0c0b1c02ea2c9014cbe7cd3b28884ff306 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 25 Oct 2022 15:01:08 -0700 Subject: mm/shmem: ensure proper fallback if page faults The kernel test robot flagged a recursive lock as a result of a conversion from kmap_atomic() to kmap_local_folio()[Link] The cause was due to the code depending on the kmap_atomic() side effect of disabling page faults. In that case the code expects the fault to fail and take the fallback case. git archaeology implied that the recursion may not be an actual bug.[1] However, depending on the implementation of the mmap_lock and the condition of the call there may still be a deadlock.[2] So this is not purely a lockdep issue. Considering a single threaded call stack there are 3 options. 1) Different mm's are in play (no issue) 2) Readlock implementation is recursive and same mm is in play (no issue) 3) Readlock implementation is _not_ recursive (issue) The mmap_lock is recursive so with a single thread there is no issue. However, Matthew pointed out a deadlock scenario when you consider additional process' and threads thusly. "The readlock implementation is only recursive if nobody else has taken a write lock. If you have a multithreaded process, one of the other threads can call mmap() and that will prevent recursion (due to fairness). Even if it's a different process that you're trying to acquire the mmap read lock on, you can still get into a deadly embrace. eg: process A thread 1 takes read lock on own mmap_lock process A thread 2 calls mmap, blocks taking write lock process B thread 1 takes page fault, read lock on own mmap lock process B thread 2 calls mmap, blocks taking write lock process A thread 1 blocks taking read lock on process B process B thread 1 blocks taking read lock on process A Now all four threads are blocked waiting for each other." Regardless using pagefault_disable() ensures that no matter what locking implementation is used a deadlock will not occur. Add an explicit pagefault_disable() and a big comment to explain this for future souls looking at this code. [1] https://lore.kernel.org/all/Y1MymJ%2FINb45AdaY@iweiny-desk3/ [2] https://lore.kernel.org/lkml/Y1bXBtGTCym77%2FoD@casper.infradead.org/ Link: https://lkml.kernel.org/r/20221025220108.2366043-1-ira.weiny@intel.com Link: https://lore.kernel.org/r/202210211215.9dc6efb5-yujie.liu@intel.com Fixes: 7a7256d5f512 ("shmem: convert shmem_mfill_atomic_pte() to use a folio") Signed-off-by: Ira Weiny Reported-by: Matthew Wilcox (Oracle) Reported-by: kernel test robot Cc: Randy Dunlap Cc: Peter Xu Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- mm/shmem.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 8280a5cb48df..c1d8b8a1aa3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2424,9 +2424,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!zeropage) { /* COPY */ page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); + pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ -- cgit v1.2.3 From 1db43d3f3733351849ddca4b573c037c7821bfd8 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 25 Oct 2022 16:12:49 +0000 Subject: mmap: fix remap_file_pages() regression When using the VMA iterator, the final execution will set the variable 'next' to NULL which causes the function to fail out. Restore the break in the loop to exit the VMA iterator early without clearing NULL fixes the issue. Link: https://lore.kernel.org/lkml/29344.1666681759@jrobl/ Link: https://lkml.kernel.org/r/20221025161222.2634030-1-Liam.Howlett@oracle.com Fixes: 763ecb035029 (mm: remove the vma linked list) Signed-off-by: Liam R. Howlett Reported-by: "J. R. Okajima" Tested-by: "J. R. Okajima" Signed-off-by: Andrew Morton --- mm/mmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index e270057ed04e..2def55555e05 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2852,6 +2852,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; + if (start + size <= next->vm_end) + break; + prev = next; } -- cgit v1.2.3