summary refs log tree commit diff stats
path: root/include/linux/mm_types.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/mm_types.h')
-rw-r--r-- include/linux/mm_types.h 203
1 files changed, 148 insertions, 55 deletions
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 75e8850cec3a..56d07edd01f9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -19,6 +19,7 @@
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
+#include <linux/types.h>
#include <asm/mmu.h>
@@ -133,8 +134,11 @@ struct page {
unsigned long compound_head; /* Bit zero is set */
};
struct { /* ZONE_DEVICE pages */
- /** @pgmap: Points to the hosting device page map. */
- struct dev_pagemap *pgmap;
+ /*
+ * The first word is used for compound_head or folio
+ * pgmap
+ */
+ void *_unused_pgmap_compound_head;
void *zone_device_data;
/*
* ZONE_DEVICE private pages are counted as being
@@ -287,6 +291,49 @@ typedef struct {
unsigned long val;
} swp_entry_t;
+#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT)
+/* We have some extra room after the refcount in tail pages. */
+#define NR_PAGES_IN_LARGE_FOLIO
+#endif
+
+/*
+ * On 32-bit, we can cut the required metadata in half, because:
+ * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have,
+ * so we can limit MM IDs to 15 bits (32767).
+ * (b) We don't expect folios where even a single complete PTE mapping by
+ * one MM would exceed 15 bits (order-15).
+ */
+#ifdef CONFIG_64BIT
+typedef int mm_id_mapcount_t;
+#define MM_ID_MAPCOUNT_MAX INT_MAX
+typedef unsigned int mm_id_t;
+#else /* !CONFIG_64BIT */
+typedef short mm_id_mapcount_t;
+#define MM_ID_MAPCOUNT_MAX SHRT_MAX
+typedef unsigned short mm_id_t;
+#endif /* CONFIG_64BIT */
+
+/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */
+#define MM_ID_DUMMY 0
+#define MM_ID_MIN (MM_ID_DUMMY + 1)
+
+/*
+ * We leave the highest bit of each MM id unused, so we can store a flag
+ * in the highest bit of each folio->_mm_id[].
+ */
+#define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1)
+#define MM_ID_MASK ((1U << MM_ID_BITS) - 1)
+#define MM_ID_MAX MM_ID_MASK
+
+/*
+ * In order to use bit_spin_lock(), which requires an unsigned long, we
+ * operate on folio->_mm_ids when working on flags.
+ */
+#define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS
+#define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM)
+#define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1)
+#define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM)
+
/**
* struct folio - Represents a contiguous set of bytes.
* @flags: Identical to the page flags.
@@ -296,6 +343,8 @@ typedef struct {
* anonymous memory.
* @index: Offset within the file, in units of pages. For anonymous memory,
* this is the index from the beginning of the mmap.
+ * @share: number of DAX mappings that reference this folio. See
+ * dax_associate_entry.
* @private: Filesystem per-folio data (see folio_attach_private()).
* @swap: Used for swp_entry_t if folio_test_swapcache().
* @_mapcount: Do not access this member directly. Use folio_mapcount() to
@@ -303,13 +352,17 @@ typedef struct {
* @_refcount: Do not access this member directly. Use folio_ref_count()
* to find how many references there are to this folio.
* @memcg_data: Memory Control Group data.
+ * @pgmap: Metadata for ZONE_DEVICE mappings.
* @virtual: Virtual address in the kernel direct map.
* @_last_cpupid: IDs of last CPU and last process that accessed the folio.
* @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
* @_large_mapcount: Do not use directly, call folio_mapcount().
* @_nr_pages_mapped: Do not use outside of rmap and debug code.
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
- * @_folio_nr_pages: Do not use directly, call folio_nr_pages().
+ * @_nr_pages: Do not use directly, call folio_nr_pages().
+ * @_mm_id: Do not use outside of rmap code.
+ * @_mm_ids: Do not use outside of rmap code.
+ * @_mm_id_mapcount: Do not use outside of rmap code.
* @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
* @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h.
* @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
@@ -341,9 +394,13 @@ struct folio {
/* private: */
};
/* public: */
+ struct dev_pagemap *pgmap;
};
struct address_space *mapping;
- pgoff_t index;
+ union {
+ pgoff_t index;
+ unsigned long share;
+ };
union {
void *private;
swp_entry_t swap;
@@ -369,14 +426,30 @@ struct folio {
struct {
unsigned long _flags_1;
unsigned long _head_1;
+ union {
+ struct {
/* public: */
- atomic_t _large_mapcount;
- atomic_t _entire_mapcount;
- atomic_t _nr_pages_mapped;
- atomic_t _pincount;
+ atomic_t _large_mapcount;
+ atomic_t _nr_pages_mapped;
#ifdef CONFIG_64BIT
- unsigned int _folio_nr_pages;
-#endif
+ atomic_t _entire_mapcount;
+ atomic_t _pincount;
+#endif /* CONFIG_64BIT */
+ mm_id_mapcount_t _mm_id_mapcount[2];
+ union {
+ mm_id_t _mm_id[2];
+ unsigned long _mm_ids;
+ };
+ /* private: the union with struct page is transitional */
+ };
+ unsigned long _usable_1[4];
+ };
+ atomic_t _mapcount_1;
+ atomic_t _refcount_1;
+ /* public: */
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ unsigned int _nr_pages;
+#endif /* NR_PAGES_IN_LARGE_FOLIO */
/* private: the union with struct page is transitional */
};
struct page __page_1;
@@ -386,20 +459,27 @@ struct folio {
unsigned long _flags_2;
unsigned long _head_2;
/* public: */
- void *_hugetlb_subpool;
- void *_hugetlb_cgroup;
- void *_hugetlb_cgroup_rsvd;
- void *_hugetlb_hwpoison;
+ struct list_head _deferred_list;
+#ifndef CONFIG_64BIT
+ atomic_t _entire_mapcount;
+ atomic_t _pincount;
+#endif /* !CONFIG_64BIT */
/* private: the union with struct page is transitional */
};
+ struct page __page_2;
+ };
+ union {
struct {
- unsigned long _flags_2a;
- unsigned long _head_2a;
+ unsigned long _flags_3;
+ unsigned long _head_3;
/* public: */
- struct list_head _deferred_list;
+ void *_hugetlb_subpool;
+ void *_hugetlb_cgroup;
+ void *_hugetlb_cgroup_rsvd;
+ void *_hugetlb_hwpoison;
/* private: the union with struct page is transitional */
};
- struct page __page_2;
+ struct page __page_3;
};
};
@@ -428,14 +508,20 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid);
offsetof(struct page, pg) + sizeof(struct page))
FOLIO_MATCH(flags, _flags_1);
FOLIO_MATCH(compound_head, _head_1);
+FOLIO_MATCH(_mapcount, _mapcount_1);
+FOLIO_MATCH(_refcount, _refcount_1);
#undef FOLIO_MATCH
#define FOLIO_MATCH(pg, fl) \
static_assert(offsetof(struct folio, fl) == \
offsetof(struct page, pg) + 2 * sizeof(struct page))
FOLIO_MATCH(flags, _flags_2);
FOLIO_MATCH(compound_head, _head_2);
-FOLIO_MATCH(flags, _flags_2a);
-FOLIO_MATCH(compound_head, _head_2a);
+#undef FOLIO_MATCH
+#define FOLIO_MATCH(pg, fl) \
+ static_assert(offsetof(struct folio, fl) == \
+ offsetof(struct page, pg) + 3 * sizeof(struct page))
+FOLIO_MATCH(flags, _flags_3);
+FOLIO_MATCH(compound_head, _head_3);
#undef FOLIO_MATCH
/**
@@ -578,6 +664,12 @@ static inline void *folio_get_private(struct folio *folio)
typedef unsigned long vm_flags_t;
/*
+ * freeptr_t represents a SLUB freelist pointer, which might be encoded
+ * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
+ */
+typedef struct { unsigned long v; } freeptr_t;
+
+/*
* A region containing a mapping of a non-memory backed file under NOMMU
* conditions. These are held in a global tree and are pinned by the VMAs that
* map parts of them.
@@ -633,9 +725,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
}
#endif
-struct vma_lock {
- struct rw_semaphore lock;
-};
+#define VMA_LOCK_OFFSET 0x40000000
+#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1)
struct vma_numab_state {
/*
@@ -681,6 +772,9 @@ struct vma_numab_state {
*
* Only explicitly marked struct members may be accessed by RCU readers before
* getting a stable reference.
+ *
+ * WARNING: when adding new members, please update vm_area_init_from() to copy
+ * them during vm_area_struct content duplication.
*/
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
@@ -691,9 +785,7 @@ struct vm_area_struct {
unsigned long vm_start;
unsigned long vm_end;
};
-#ifdef CONFIG_PER_VMA_LOCK
- struct rcu_head vm_rcu; /* Used for deferred freeing. */
-#endif
+ freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
};
/*
@@ -714,18 +806,12 @@ struct vm_area_struct {
#ifdef CONFIG_PER_VMA_LOCK
/*
- * Flag to indicate areas detached from the mm->mm_mt tree.
- * Unstable RCU readers are allowed to read this.
- */
- bool detached;
-
- /*
* Can only be written (using WRITE_ONCE()) while holding both:
* - mmap_lock (in write mode)
- * - vm_lock->lock (in write mode)
+ * - vm_refcnt bit at VMA_LOCK_OFFSET is set
* Can be read reliably while holding one of:
* - mmap_lock (in read or write mode)
- * - vm_lock->lock (in read or write mode)
+ * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
* Can be read unreliably (using READ_ONCE()) for pessimistic bailout
* while holding nothing (except RCU to keep the VMA struct allocated).
*
@@ -734,20 +820,7 @@ struct vm_area_struct {
* slowpath.
*/
unsigned int vm_lock_seq;
- /* Unstable RCU readers are allowed to read this. */
- struct vma_lock *vm_lock;
#endif
-
- /*
- * For areas with an address space and backing store,
- * linkage into the address_space->i_mmap interval tree.
- *
- */
- struct {
- struct rb_node rb;
- unsigned long rb_subtree_last;
- } shared;
-
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
@@ -767,14 +840,6 @@ struct vm_area_struct {
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */
-#ifdef CONFIG_ANON_VMA_NAME
- /*
- * For private and shared anonymous mappings, a pointer to a null
- * terminated string containing the name given to the vma, or NULL if
- * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
- */
- struct anon_vma_name *anon_name;
-#endif
#ifdef CONFIG_SWAP
atomic_long_t swap_readahead_info;
#endif
@@ -787,6 +852,30 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA_BALANCING
struct vma_numab_state *numab_state; /* NUMA Balancing state */
#endif
+#ifdef CONFIG_PER_VMA_LOCK
+ /* Unstable RCU readers are allowed to read this. */
+ refcount_t vm_refcnt ____cacheline_aligned_in_smp;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map vmlock_dep_map;
+#endif
+#endif
+ /*
+ * For areas with an address space and backing store,
+ * linkage into the address_space->i_mmap interval tree.
+ *
+ */
+ struct {
+ struct rb_node rb;
+ unsigned long rb_subtree_last;
+ } shared;
+#ifdef CONFIG_ANON_VMA_NAME
+ /*
+ * For private and shared anonymous mappings, a pointer to a null
+ * terminated string containing the name given to the vma, or NULL if
+ * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
+ */
+ struct anon_vma_name *anon_name;
+#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
@@ -922,6 +1011,7 @@ struct mm_struct {
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
+ struct rcuwait vma_writer_wait;
/*
* This field has lock-like semantics, meaning it is sometimes
* accessed with ACQUIRE/RELEASE semantics.
@@ -1074,6 +1164,9 @@ struct mm_struct {
#endif
} lru_gen;
#endif /* CONFIG_LRU_GEN_WALKS_MMU */
+#ifdef CONFIG_MM_ID
+ mm_id_t mm_id;
+#endif /* CONFIG_MM_ID */
} __randomize_layout;
/*