summaryrefslogtreecommitdiffstats
path: root/mm/shmem.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-27 19:42:02 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-27 19:42:02 -0700
commit7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
treecc8fd6b4f936ec01e73238643757451e20478c07 /mm/shmem.c
parent91ec4b0d11fe115581ce2835300558802ce55e6c (diff)
parent4d4b6d66db63ceed399f1fb1a4b24081d2590eb1 (diff)
downloadlinux-stable-7fa8a8ee9400fe8ec188426e40e481717bc5e924.tar.gz
linux-stable-7fa8a8ee9400fe8ec188426e40e481717bc5e924.tar.bz2
linux-stable-7fa8a8ee9400fe8ec188426e40e481717bc5e924.zip
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of switching from a user process to a kernel thread. - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav. - zsmalloc performance improvements from Sergey Senozhatsky. - Yue Zhao has found and fixed some data race issues around the alteration of memcg userspace tunables. - VFS rationalizations from Christoph Hellwig: - removal of most of the callers of write_one_page() - make __filemap_get_folio()'s return value more useful - Luis Chamberlain has changed tmpfs so it no longer requires swap backing. Use `mount -o noswap'. - Qi Zheng has made the slab shrinkers operate locklessly, providing some scalability benefits. - Keith Busch has improved dmapool's performance, making part of its operations O(1) rather than O(n). - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd, permitting userspace to wr-protect anon memory unpopulated ptes. - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather than exclusive, and has fixed a bunch of errors which were caused by its unintuitive meaning. - Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature, which causes minor faults to install a write-protected pte. - Vlastimil Babka has done some maintenance work on vma_merge(): cleanups to the kernel code and improvements to our userspace test harness. - Cleanups to do_fault_around() by Lorenzo Stoakes. - Mike Rapoport has moved a lot of initialization code out of various mm/ files and into mm/mm_init.c. - Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for DRM, but DRM doesn't use it any more. - Lorenzo has also coverted read_kcore() and vread() to use iterators and has thereby removed the use of bounce buffers in some cases. - Lorenzo has also contributed further cleanups of vma_merge(). - Chaitanya Prakash provides some fixes to the mmap selftesting code. - Matthew Wilcox changes xfs and afs so they no longer take sleeping locks in ->map_page(), a step towards RCUification of pagefaults. - Suren Baghdasaryan has improved mmap_lock scalability by switching to per-VMA locking. - Frederic Weisbecker has reworked the percpu cache draining so that it no longer causes latency glitches on cpu isolated workloads. - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig logic. - Liu Shixin has changed zswap's initialization so we no longer waste a chunk of memory if zswap is not being used. - Yosry Ahmed has improved the performance of memcg statistics flushing. - David Stevens has fixed several issues involving khugepaged, userfaultfd and shmem. - Christoph Hellwig has provided some cleanup work to zram's IO-related code paths. - David Hildenbrand has fixed up some issues in the selftest code's testing of our pte state changing. - Pankaj Raghav has made page_endio() unneeded and has removed it. - Peter Xu contributed some rationalizations of the userfaultfd selftests. - Yosry Ahmed has fixed an issue around memcg's page recalim accounting. - Chaitanya Prakash has fixed some arm-related issues in the selftests/mm code. - Longlong Xia has improved the way in which KSM handles hwpoisoned pages. - Peter Xu fixes a few issues with uffd-wp at fork() time. - Stefan Roesch has changed KSM so that it may now be used on a per-process and per-cgroup basis. * tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits) mm,unmap: avoid flushing TLB in batch if PTE is inaccessible shmem: restrict noswap option to initial user namespace mm/khugepaged: fix conflicting mods to collapse_file() sparse: remove unnecessary 0 values from rc mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area() hugetlb: pte_alloc_huge() to replace huge pte_alloc_map() maple_tree: fix allocation in mas_sparse_area() mm: do not increment pgfault stats when page fault handler retries zsmalloc: allow only one active pool compaction context selftests/mm: add new selftests for KSM mm: add new KSM process and sysfs knobs mm: add new api to enable ksm per process mm: shrinkers: fix debugfs file permissions mm: don't check VMA write permissions if the PTE/PMD indicates write permissions migrate_pages_batch: fix statistics for longterm pin retry userfaultfd: use helper function range_in_vma() lib/show_mem.c: use for_each_populated_zone() simplify code mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list() fs/buffer: convert create_page_buffers to folio_create_buffers fs/buffer: add folio_create_empty_buffers helper ...
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--mm/shmem.c137
1 files changed, 87 insertions, 50 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index b76521ed372d..e40a08c5c6d7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -76,7 +76,6 @@ static struct vfsmount *shm_mnt;
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
-#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
@@ -116,10 +115,12 @@ struct shmem_options {
bool full_inums;
int huge;
int seen;
+ bool noswap;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
+#define SHMEM_SEEN_NOSWAP 16
};
#ifdef CONFIG_TMPFS
@@ -603,7 +604,7 @@ next:
index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
folio = filemap_get_folio(inode->i_mapping, index);
- if (!folio)
+ if (IS_ERR(folio))
goto drop;
/* No huge page at the end of the file: nothing to split */
@@ -883,14 +884,21 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
/*
* At first avoid shmem_get_folio(,,,SGP_READ): that fails
- * beyond i_size, and reports fallocated pages as holes.
+ * beyond i_size, and reports fallocated folios as holes.
*/
- folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_ENTRY | FGP_LOCK, 0);
- if (!xa_is_value(folio))
+ folio = filemap_get_entry(inode->i_mapping, index);
+ if (!folio)
return folio;
+ if (!xa_is_value(folio)) {
+ folio_lock(folio);
+ if (folio->mapping == inode->i_mapping)
+ return folio;
+ /* The folio has been swapped out */
+ folio_unlock(folio);
+ folio_put(folio);
+ }
/*
- * But read a page back from swap if any of it is within i_size
+ * But read a folio back from swap if any of it is within i_size
* (although in some cases this is just a waste of time).
*/
folio = NULL;
@@ -1331,13 +1339,30 @@ int shmem_unuse(unsigned int type)
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
struct folio *folio = page_folio(page);
- struct shmem_inode_info *info;
- struct address_space *mapping;
- struct inode *inode;
+ struct address_space *mapping = folio->mapping;
+ struct inode *inode = mapping->host;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
swp_entry_t swap;
pgoff_t index;
/*
+ * Our capabilities prevent regular writeback or sync from ever calling
+ * shmem_writepage; but a stacking filesystem might use ->writepage of
+ * its underlying filesystem, in which case tmpfs should write out to
+ * swap only in response to memory pressure, and not for the writeback
+ * threads or sync.
+ */
+ if (WARN_ON_ONCE(!wbc->for_reclaim))
+ goto redirty;
+
+ if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ goto redirty;
+
+ if (!total_swap_pages)
+ goto redirty;
+
+ /*
* If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
* "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
* and its shmem_writeback() needs them to be split when swapping.
@@ -1351,27 +1376,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
folio_clear_dirty(folio);
}
- BUG_ON(!folio_test_locked(folio));
- mapping = folio->mapping;
index = folio->index;
- inode = mapping->host;
- info = SHMEM_I(inode);
- if (info->flags & VM_LOCKED)
- goto redirty;
- if (!total_swap_pages)
- goto redirty;
-
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
- if (!wbc->for_reclaim) {
- WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
- goto redirty;
- }
/*
* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
@@ -1874,12 +1879,10 @@ repeat:
sbinfo = SHMEM_SB(inode->i_sb);
charge_mm = vma ? vma->vm_mm : NULL;
- folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
+ folio = filemap_get_entry(mapping, index);
if (folio && vma && userfaultfd_minor(vma)) {
- if (!xa_is_value(folio)) {
- folio_unlock(folio);
+ if (!xa_is_value(folio))
folio_put(folio);
- }
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
return 0;
}
@@ -1895,6 +1898,14 @@ repeat:
}
if (folio) {
+ folio_lock(folio);
+
+ /* Has the folio been truncated or swapped out? */
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto repeat;
+ }
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
if (folio_test_uptodate(folio))
@@ -2376,6 +2387,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
shmem_set_inode_flags(inode, info->fsflags);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
+ if (sbinfo->noswap)
+ mapping_set_unevictable(inode->i_mapping);
simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
mapping_set_large_folios(inode->i_mapping);
@@ -2415,13 +2428,12 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
}
#ifdef CONFIG_USERFAULTFD
-int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- bool zeropage, bool wp_copy,
- struct page **pagep)
+ uffd_flags_t flags,
+ struct folio **foliop)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2439,20 +2451,20 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
* and now we find ourselves with -ENOMEM. Release the page, to
* avoid a BUG_ON in our caller.
*/
- if (unlikely(*pagep)) {
- put_page(*pagep);
- *pagep = NULL;
+ if (unlikely(*foliop)) {
+ folio_put(*foliop);
+ *foliop = NULL;
}
return -ENOMEM;
}
- if (!*pagep) {
+ if (!*foliop) {
ret = -ENOMEM;
folio = shmem_alloc_folio(gfp, info, pgoff);
if (!folio)
goto out_unacct_blocks;
- if (!zeropage) { /* COPY */
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
page_kaddr = kmap_local_folio(folio, 0);
/*
* The read mmap_lock is held here. Despite the
@@ -2478,7 +2490,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
- *pagep = &folio->page;
+ *foliop = folio;
ret = -ENOENT;
/* don't free the page */
goto out_unacct_blocks;
@@ -2489,9 +2501,9 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
clear_user_highpage(&folio->page, dst_addr);
}
} else {
- folio = page_folio(*pagep);
+ folio = *foliop;
VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
- *pagep = NULL;
+ *foliop = NULL;
}
VM_BUG_ON(folio_test_locked(folio));
@@ -2506,12 +2518,12 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
goto out_release;
ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_mm);
+ gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
if (ret)
goto out_release;
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- &folio->page, true, wp_copy);
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, flags);
if (ret)
goto out_delete_from_cache;
@@ -3200,7 +3212,7 @@ static const char *shmem_get_link(struct dentry *dentry,
if (!dentry) {
folio = filemap_get_folio(inode->i_mapping, 0);
- if (!folio)
+ if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
if (PageHWPoison(folio_page(folio, 0)) ||
!folio_test_uptodate(folio)) {
@@ -3459,6 +3471,7 @@ enum shmem_param {
Opt_uid,
Opt_inode32,
Opt_inode64,
+ Opt_noswap,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3480,6 +3493,7 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_u32 ("uid", Opt_uid),
fsparam_flag ("inode32", Opt_inode32),
fsparam_flag ("inode64", Opt_inode64),
+ fsparam_flag ("noswap", Opt_noswap),
{}
};
@@ -3563,6 +3577,14 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->full_inums = true;
ctx->seen |= SHMEM_SEEN_INUMS;
break;
+ case Opt_noswap:
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
+ return invalfc(fc,
+ "Turning off swap in unprivileged tmpfs mounts unsupported");
+ }
+ ctx->noswap = true;
+ ctx->seen |= SHMEM_SEEN_NOSWAP;
+ break;
}
return 0;
@@ -3661,6 +3683,14 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Current inum too high to switch to 32-bit inums";
goto out;
}
+ if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+ err = "Cannot disable swap on remount";
+ goto out;
+ }
+ if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
+ err = "Cannot enable swap on remount if it was disabled on first mount";
+ goto out;
+ }
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
@@ -3681,6 +3711,10 @@ static int shmem_reconfigure(struct fs_context *fc)
sbinfo->mpol = ctx->mpol; /* transfers initial ref */
ctx->mpol = NULL;
}
+
+ if (ctx->noswap)
+ sbinfo->noswap = true;
+
raw_spin_unlock(&sbinfo->stat_lock);
mpol_put(mpol);
return 0;
@@ -3735,6 +3769,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
shmem_show_mpol(seq, sbinfo->mpol);
+ if (sbinfo->noswap)
+ seq_printf(seq, ",noswap");
return 0;
}
@@ -3778,6 +3814,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
ctx->inodes = shmem_default_max_inodes();
if (!(ctx->seen & SHMEM_SEEN_INUMS))
ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
+ sbinfo->noswap = ctx->noswap;
} else {
sb->s_flags |= SB_NOUSER;
}