summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/userfaultfd.c11
-rw-r--r--mm/filemap.c5
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/memcontrol.c8
-rw-r--r--mm/memory.c23
-rw-r--r--mm/mincore.c3
-rw-r--r--mm/mprotect.c3
7 files changed, 47 insertions, 8 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index aa0c47cb0d16..78b68e0f9774 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,6 +29,7 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/swapops.h>
int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us.
+ * changes under us. PTE markers should be handled the same as none
+ * ptes here.
*/
- if (huge_pte_none(pte))
+ if (huge_pte_none_mostly(pte))
ret = true;
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
ret = true;
@@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte = pte_offset_map(pmd, address);
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us.
+ * changes under us. PTE markers should be handled the same as none
+ * ptes here.
*/
- if (pte_none(*pte))
+ if (pte_none_mostly(*pte))
ret = true;
if (!pte_write(*pte) && (reason & VM_UFFD_WP))
ret = true;
diff --git a/mm/filemap.c b/mm/filemap.c
index 9a1eef6c5d35..de34c6c10384 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3376,6 +3376,11 @@ again:
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
+ /*
+ * NOTE: If there're PTE markers, we'll leave them to be
+ * handled in the specific fault path, and it'll prohibit the
+ * fault-around logic.
+ */
if (!pte_none(*vmf->pte))
goto unlock;
diff --git a/mm/hmm.c b/mm/hmm.c
index af71aac3140e..3fd3242c5e50 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -239,7 +239,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_t pte = *ptep;
uint64_t pfn_req_flags = *hmm_pfn;
- if (pte_none(pte)) {
+ if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 235c85e17c2d..6e74ca98c862 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5640,10 +5640,14 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
+ else if (pte_none_mostly(ptent))
+ /*
+ * PTE markers should be treated as a none pte here, separated
+ * from other swap handling below.
+ */
+ page = mc_handle_file_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
- else if (pte_none(ptent))
- page = mc_handle_file_pte(vma, addr, ptent);
if (!page && !ent.val)
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 90212057a546..9743c8b74bf2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -100,6 +100,8 @@ struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
+static vm_fault_t do_fault(struct vm_fault *vmf);
+
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
@@ -1415,6 +1417,8 @@ again:
if (!should_zap_page(details, page))
continue;
rss[mm_counter(page)]--;
+ } else if (is_pte_marker_entry(entry)) {
+ /* By default, simply drop all pte markers when zap */
} else if (is_hwpoison_entry(entry)) {
if (!should_zap_cows(details))
continue;
@@ -3555,6 +3559,23 @@ static inline bool should_try_to_free_swap(struct page *page,
page_count(page) == 2;
}
+static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
+{
+ swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
+ unsigned long marker = pte_marker_get(entry);
+
+ /*
+ * PTE markers should always be with file-backed memories, and the
+ * marker should never be empty. If anything weird happened, the best
+ * thing to do is to kill the process along with its mm.
+ */
+ if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
+ return VM_FAULT_SIGBUS;
+
+ /* TODO: handle pte markers */
+ return 0;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3592,6 +3613,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
+ } else if (is_pte_marker_entry(entry)) {
+ ret = handle_pte_marker(vmf);
} else {
print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
diff --git a/mm/mincore.c b/mm/mincore.c
index f4f627325e12..fa200c14185f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -122,7 +122,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (; addr != end; ptep++, addr += PAGE_SIZE) {
pte_t pte = *ptep;
- if (pte_none(pte))
+ /* We need to do cache lookup too for pte markers */
+ if (pte_none_mostly(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
else if (pte_present(pte))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 20a46f21cca8..e84694267b0f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -193,6 +193,9 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
newpte = pte_swp_mksoft_dirty(newpte);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_pte_marker_entry(entry)) {
+ /* Skip it, the same as none pte */
+ continue;
} else {
newpte = oldpte;
}