summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/Makefile4
-rw-r--r--mm/backing-dev.c19
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/hugetlb.c11
-rw-r--r--mm/kasan/quarantine.c7
-rw-r--r--mm/memblock.c9
-rw-r--r--mm/memcontrol.c82
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c76
-rw-r--r--mm/page_io.c5
-rw-r--r--mm/rmap.c7
-rw-r--r--mm/shmem.c7
-rw-r--r--mm/slab.c30
-rw-r--r--mm/slub.c48
-rw-r--r--mm/usercopy.c268
17 files changed, 518 insertions, 66 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c0837845c17c..78a23c5c302d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -187,6 +187,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on !KASAN
config MEMORY_HOTPLUG_SPARSE
def_bool y
diff --git a/mm/Makefile b/mm/Makefile
index fc059666c760..2ca1faf3fa09 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
+# Since __builtin_frame_address does work as used, disable the warning.
+CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
+obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index efe237742074..8fde443f36d7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -825,6 +825,20 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
}
EXPORT_SYMBOL(bdi_register_dev);
+int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
+{
+ int rc;
+
+ rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
+ MINOR(owner->devt));
+ if (rc)
+ return rc;
+ bdi->owner = owner;
+ get_device(owner);
+ return 0;
+}
+EXPORT_SYMBOL(bdi_register_owner);
+
/*
* Remove bdi from bdi_list, and ensure that it is no longer visible
*/
@@ -849,6 +863,11 @@ void bdi_unregister(struct backing_dev_info *bdi)
device_unregister(bdi->dev);
bdi->dev = NULL;
}
+
+ if (bdi->owner) {
+ put_device(bdi->owner);
+ bdi->owner = NULL;
+ }
}
void bdi_exit(struct backing_dev_info *bdi)
diff --git a/mm/filemap.c b/mm/filemap.c
index 3083ded98b15..8a287dfc5372 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -887,9 +887,9 @@ EXPORT_SYMBOL(end_page_writeback);
* After completing I/O on a page, call this routine to update the page
* flags appropriately
*/
-void page_endio(struct page *page, int rw, int err)
+void page_endio(struct page *page, bool is_write, int err)
{
- if (rw == READ) {
+ if (!is_write) {
if (!err) {
SetPageUptodate(page);
} else {
@@ -897,7 +897,7 @@ void page_endio(struct page *page, int rw, int err)
SetPageError(page);
}
unlock_page(page);
- } else { /* rw == WRITE */
+ } else {
if (err) {
SetPageError(page);
if (page->mapping)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ef968306fd5b..87e11d8ad536 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1448,6 +1448,7 @@ static void dissolve_free_huge_page(struct page *page)
list_del(&page->lru);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
+ h->max_huge_pages--;
update_and_free_page(h, page);
}
spin_unlock(&hugetlb_lock);
@@ -3942,6 +3943,14 @@ same_page:
return i ? i : -EFAULT;
}
+#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+/*
+ * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
+ * implement this.
+ */
+#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
+#endif
+
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
@@ -4002,7 +4011,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* once we release i_mmap_rwsem, another task can do the final put_page
* and that page table be reused and filled with junk.
*/
- flush_tlb_range(vma, start, end);
+ flush_hugetlb_tlb_range(vma, start, end);
mmu_notifier_invalidate_range(mm, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(mm, start, end);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index b6728a33a4ac..baabaad4a4aa 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -217,11 +217,8 @@ void quarantine_reduce(void)
new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
QUARANTINE_FRACTION;
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
- if (WARN_ONCE(new_quarantine_size < percpu_quarantines,
- "Too little memory, disabling global KASAN quarantine.\n"))
- new_quarantine_size = 0;
- else
- new_quarantine_size -= percpu_quarantines;
+ new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
+ 0 : new_quarantine_size - percpu_quarantines;
WRITE_ONCE(quarantine_size, new_quarantine_size);
last = global_quarantine.head;
diff --git a/mm/memblock.c b/mm/memblock.c
index ff5ff3b5f1ea..483197ef613f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -482,7 +482,7 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
* @flags: flags of the new region
*
* Insert new memblock region [@base,@base+@size) into @type at @idx.
- * @type must already have extra room to accomodate the new region.
+ * @type must already have extra room to accommodate the new region.
*/
static void __init_memblock memblock_insert_region(struct memblock_type *type,
int idx, phys_addr_t base,
@@ -544,7 +544,7 @@ repeat:
/*
* The following is executed twice. Once with %false @insert and
* then with %true. The first counts the number of regions needed
- * to accomodate the new area. The second actually inserts them.
+ * to accommodate the new area. The second actually inserts them.
*/
base = obase;
nr_new = 0;
@@ -994,7 +994,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
if (*idx == (u64)ULLONG_MAX) {
idx_a = type_a->cnt - 1;
- idx_b = type_b->cnt;
+ if (type_b != NULL)
+ idx_b = type_b->cnt;
+ else
+ idx_b = 0;
}
for (; idx_a >= 0; idx_a--) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 66beca1ad92f..2ff0289ad061 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2337,8 +2337,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
return 0;
memcg = get_mem_cgroup_from_mm(current->mm);
- if (!mem_cgroup_is_root(memcg))
+ if (!mem_cgroup_is_root(memcg)) {
ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (!ret)
+ __SetPageKmemcg(page);
+ }
css_put(&memcg->css);
return ret;
}
@@ -2365,6 +2368,11 @@ void memcg_kmem_uncharge(struct page *page, int order)
page_counter_uncharge(&memcg->memsw, nr_pages);
page->mem_cgroup = NULL;
+
+ /* slab pages do not have PageKmemcg flag set */
+ if (PageKmemcg(page))
+ __ClearPageKmemcg(page);
+
css_put_many(&memcg->css, nr_pages);
}
#endif /* !CONFIG_SLOB */
@@ -4069,14 +4077,32 @@ static struct cftype mem_cgroup_legacy_files[] = {
static DEFINE_IDR(mem_cgroup_idr);
-static void mem_cgroup_id_get(struct mem_cgroup *memcg)
+static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
- atomic_inc(&memcg->id.ref);
+ atomic_add(n, &memcg->id.ref);
}
-static void mem_cgroup_id_put(struct mem_cgroup *memcg)
+static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
- if (atomic_dec_and_test(&memcg->id.ref)) {
+ while (!atomic_inc_not_zero(&memcg->id.ref)) {
+ /*
+ * The root cgroup cannot be destroyed, so it's refcount must
+ * always be >= 1.
+ */
+ if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+ VM_BUG_ON(1);
+ break;
+ }
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ }
+ return memcg;
+}
+
+static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
+{
+ if (atomic_sub_and_test(n, &memcg->id.ref)) {
idr_remove(&mem_cgroup_idr, memcg->id.id);
memcg->id.id = 0;
@@ -4085,6 +4111,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg)
}
}
+static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
+{
+ mem_cgroup_id_get_many(memcg, 1);
+}
+
+static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
+{
+ mem_cgroup_id_put_many(memcg, 1);
+}
+
/**
* mem_cgroup_from_id - look up a memcg from a memcg id
* @id: the memcg id to look up
@@ -4719,6 +4755,8 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.from))
page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
+ mem_cgroup_id_put_many(mc.from, mc.moved_swap);
+
/*
* we charged both to->memory and to->memsw, so we
* should uncharge to->memory.
@@ -4726,9 +4764,9 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
- css_put_many(&mc.from->css, mc.moved_swap);
+ mem_cgroup_id_get_many(mc.to, mc.moved_swap);
+ css_put_many(&mc.to->css, mc.moved_swap);
- /* we've already done css_get(mc.to) */
mc.moved_swap = 0;
}
memcg_oom_recover(from);
@@ -5537,8 +5575,10 @@ static void uncharge_list(struct list_head *page_list)
else
nr_file += nr_pages;
pgpgout++;
- } else
+ } else {
nr_kmem += 1 << compound_order(page);
+ __ClearPageKmemcg(page);
+ }
page->mem_cgroup = NULL;
} while (next != page_list);
@@ -5790,7 +5830,7 @@ subsys_initcall(mem_cgroup_init);
*/
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *swap_memcg;
unsigned short oldid;
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5805,16 +5845,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!memcg)
return;
- mem_cgroup_id_get(memcg);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ /*
+ * In case the memcg owning these pages has been offlined and doesn't
+ * have an ID allocated to it anymore, charge the closest online
+ * ancestor for the swap instead and transfer the memory+swap charge.
+ */
+ swap_memcg = mem_cgroup_id_get_online(memcg);
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(memcg, true);
+ mem_cgroup_swap_statistics(swap_memcg, true);
page->mem_cgroup = NULL;
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, 1);
+ if (memcg != swap_memcg) {
+ if (!mem_cgroup_is_root(swap_memcg))
+ page_counter_charge(&swap_memcg->memsw, 1);
+ page_counter_uncharge(&memcg->memsw, 1);
+ }
+
/*
* Interrupts should be disabled here because the caller holds the
* mapping->tree_lock lock which is taken with interrupts-off. It is
@@ -5853,11 +5904,14 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (!memcg)
return 0;
+ memcg = mem_cgroup_id_get_online(memcg);
+
if (!mem_cgroup_is_root(memcg) &&
- !page_counter_try_charge(&memcg->swap, 1, &counter))
+ !page_counter_try_charge(&memcg->swap, 1, &counter)) {
+ mem_cgroup_id_put(memcg);
return -ENOMEM;
+ }
- mem_cgroup_id_get(memcg);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
VM_BUG_ON_PAGE(oldid, page);
mem_cgroup_swap_statistics(memcg, true);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3894b65b1555..41266dc29f33 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1219,6 +1219,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+ pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
* The node we allocated has no zone fallback lists. For avoiding
@@ -1249,6 +1250,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
arch_refresh_nodedata(nid, NULL);
+ free_percpu(pgdat->per_cpu_nodestats);
arch_free_nodedata(pgdat);
return;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7d0a275df822..d53a9aa00977 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -764,7 +764,7 @@ bool task_will_free_mem(struct task_struct *task)
{
struct mm_struct *mm = task->mm;
struct task_struct *p;
- bool ret;
+ bool ret = true;
/*
* Skip tasks without mm because it might have passed its exit_mm and
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 39a372a2a1d6..3fbe73a6fe4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1008,10 +1008,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
}
if (PageMappingFlags(page))
page->mapping = NULL;
- if (memcg_kmem_enabled() && PageKmemcg(page)) {
+ if (memcg_kmem_enabled() && PageKmemcg(page))
memcg_kmem_uncharge(page, order);
- __ClearPageKmemcg(page);
- }
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -3756,12 +3754,10 @@ no_zone:
}
out:
- if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) {
- if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) {
- __free_pages(page, order);
- page = NULL;
- } else
- __SetPageKmemcg(page);
+ if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
+ unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ __free_pages(page, order);
+ page = NULL;
}
if (kmemcheck_enabled && page)
@@ -4064,7 +4060,7 @@ long si_mem_available(void)
int lru;
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_page_state(NR_LRU_BASE + lru);
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
wmark_low += zone->watermark[WMARK_LOW];
@@ -4761,6 +4757,8 @@ int local_memory_node(int node)
}
#endif
+static void setup_min_unmapped_ratio(void);
+static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
static void set_zonelist_order(void)
@@ -5257,11 +5255,6 @@ static void __meminit setup_zone_pageset(struct zone *zone)
zone->pageset = alloc_percpu(struct per_cpu_pageset);
for_each_possible_cpu(cpu)
zone_pageset_init(zone, cpu);
-
- if (!zone->zone_pgdat->per_cpu_nodestats) {
- zone->zone_pgdat->per_cpu_nodestats =
- alloc_percpu(struct per_cpu_nodestat);
- }
}
/*
@@ -5270,10 +5263,15 @@ static void __meminit setup_zone_pageset(struct zone *zone)
*/
void __init setup_per_cpu_pageset(void)
{
+ struct pglist_data *pgdat;
struct zone *zone;
for_each_populated_zone(zone)
setup_zone_pageset(zone);
+
+ for_each_online_pgdat(pgdat)
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
}
static noinline __ref
@@ -5882,9 +5880,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
- / 100;
- pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
zone->zone_pgdat = pgdat;
@@ -6805,6 +6800,12 @@ int __meminit init_per_zone_wmark_min(void)
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
+
+#ifdef CONFIG_NUMA
+ setup_min_unmapped_ratio();
+ setup_min_slab_ratio();
+#endif
+
return 0;
}
core_initcall(init_per_zone_wmark_min)
@@ -6846,43 +6847,58 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
}
#ifdef CONFIG_NUMA
+static void setup_min_unmapped_ratio(void)
+{
+ pg_data_t *pgdat;
+ struct zone *zone;
+
+ for_each_online_pgdat(pgdat)
+ pgdat->min_unmapped_pages = 0;
+
+ for_each_zone(zone)
+ zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
+ sysctl_min_unmapped_ratio) / 100;
+}
+
+
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- struct pglist_data *pgdat;
- struct zone *zone;
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
+ setup_min_unmapped_ratio();
+
+ return 0;
+}
+
+static void setup_min_slab_ratio(void)
+{
+ pg_data_t *pgdat;
+ struct zone *zone;
+
for_each_online_pgdat(pgdat)
pgdat->min_slab_pages = 0;
for_each_zone(zone)
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
- sysctl_min_unmapped_ratio) / 100;
- return 0;
+ zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
+ sysctl_min_slab_ratio) / 100;
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- struct pglist_data *pgdat;
- struct zone *zone;
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
- for_each_online_pgdat(pgdat)
- pgdat->min_slab_pages = 0;
+ setup_min_slab_ratio();
- for_each_zone(zone)
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
- sysctl_min_slab_ratio) / 100;
return 0;
}
#endif
diff --git a/mm/page_io.c b/mm/page_io.c
index fb1fa269d3a0..16bd82fad38c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -319,9 +319,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = -ENOMEM;
goto out;
}
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
if (wbc->sync_mode == WB_SYNC_ALL)
- bio->bi_rw |= REQ_SYNC;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC);
+ else
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 709bc83703b1..1ef36404e7b2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1284,8 +1284,9 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
} else {
- if (PageTransCompound(page)) {
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (PageTransCompound(page) && page_mapping(page)) {
+ VM_WARN_ON_ONCE(!PageLocked(page));
+
SetPageDoubleMap(compound_head(page));
if (PageMlocked(page))
clear_page_mlock(compound_head(page));
@@ -1303,7 +1304,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
{
int i, nr = 1;
- VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
+ VM_BUG_ON_PAGE(compound && !PageHead(page), page);
lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
diff --git a/mm/shmem.c b/mm/shmem.c
index 2ac19a61d565..fd8b2b5741b1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1362,13 +1362,14 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
struct vm_area_struct pvma;
struct inode *inode = &info->vfs_inode;
struct address_space *mapping = inode->i_mapping;
- pgoff_t idx, hindex = round_down(index, HPAGE_PMD_NR);
+ pgoff_t idx, hindex;
void __rcu **results;
struct page *page;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return NULL;
+ hindex = round_down(index, HPAGE_PMD_NR);
rcu_read_lock();
if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx,
hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
@@ -3974,7 +3975,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
struct kobj_attribute shmem_enabled_attr =
__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
+#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
bool shmem_huge_enabled(struct vm_area_struct *vma)
{
struct inode *inode = file_inode(vma->vm_file);
@@ -4005,7 +4008,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
return false;
}
}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
+#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
#else /* !CONFIG_SHMEM */
diff --git a/mm/slab.c b/mm/slab.c
index 261147ba156f..b67271024135 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4441,6 +4441,36 @@ static int __init slab_proc_init(void)
module_init(slab_proc_init);
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+ struct page *page)
+{
+ struct kmem_cache *cachep;
+ unsigned int objnr;
+ unsigned long offset;
+
+ /* Find and validate object. */
+ cachep = page->slab_cache;
+ objnr = obj_to_index(cachep, page, (void *)ptr);
+ BUG_ON(objnr >= cachep->num);
+
+ /* Find offset within object. */
+ offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+
+ /* Allow address range falling entirely within object size. */
+ if (offset <= cachep->object_size && n <= cachep->object_size - offset)
+ return NULL;
+
+ return cachep->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
/**
* ksize - get the actual amount of memory allocated for a given object
* @objp: Pointer to the object
diff --git a/mm/slub.c b/mm/slub.c
index 26eb6a99540e..9adae58462f8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
-inline void *fixup_red_left(struct kmem_cache *s, void *p)
+void *fixup_red_left(struct kmem_cache *s, void *p)
{
if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
p += s->red_left_pad;
@@ -3629,6 +3629,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
*/
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
+ LIST_HEAD(discard);
struct page *page, *h;
BUG_ON(irqs_disabled());
@@ -3636,13 +3637,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
remove_partial(n, page);
- discard_slab(s, page);
+ list_add(&page->lru, &discard);
} else {
list_slab_objects(s, page,
"Objects remaining in %s on __kmem_cache_shutdown()");
}
}
spin_unlock_irq(&n->list_lock);
+
+ list_for_each_entry_safe(page, h, &discard, lru)
+ discard_slab(s, page);
}
/*
@@ -3764,6 +3768,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
EXPORT_SYMBOL(__kmalloc_node);
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+ struct page *page)
+{
+ struct kmem_cache *s;
+ unsigned long offset;
+ size_t object_size;
+
+ /* Find object and usable object size. */
+ s = page->slab_cache;
+ object_size = slab_ksize(s);
+
+ /* Reject impossible pointers. */
+ if (ptr < page_address(page))
+ return s->name;
+
+ /* Find offset within object. */
+ offset = (ptr - page_address(page)) % s->size;
+
+ /* Adjust for redzone and reject if within the redzone. */
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+ if (offset < s->red_left_pad)
+ return s->name;
+ offset -= s->red_left_pad;
+ }
+
+ /* Allow address range falling entirely within object size. */
+ if (offset <= object_size && n <= object_size - offset)
+ return NULL;
+
+ return s->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
static size_t __ksize(const void *object)
{
struct page *page;
diff --git a/mm/usercopy.c b/mm/usercopy.c
new file mode 100644
index 000000000000..8ebae91a6b55
--- /dev/null
+++ b/mm/usercopy.c
@@ -0,0 +1,268 @@
+/*
+ * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
+ * which are designed to protect kernel memory from needless exposure
+ * and overwrite under many unintended conditions. This code is based
+ * on PAX_USERCOPY, which is:
+ *
+ * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
+ * Security Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+
+enum {
+ BAD_STACK = -1,
+ NOT_STACK = 0,
+ GOOD_FRAME,
+ GOOD_STACK,
+};
+
+/*
+ * Checks if a given pointer and length is contained by the current
+ * stack frame (if possible).
+ *
+ * Returns:
+ * NOT_STACK: not at all on the stack
+ * GOOD_FRAME: fully within a valid stack frame
+ * GOOD_STACK: fully on the stack (when can't do frame-checking)
+ * BAD_STACK: error condition (invalid stack position or bad stack frame)
+ */
+static noinline int check_stack_object(const void *obj, unsigned long len)
+{
+ const void * const stack = task_stack_page(current);
+ const void * const stackend = stack + THREAD_SIZE;
+ int ret;
+
+ /* Object is not on the stack at all. */
+ if (obj + len <= stack || stackend <= obj)
+ return NOT_STACK;
+
+ /*
+ * Reject: object partially overlaps the stack (passing the
+ * the check above means at least one end is within the stack,
+ * so if this check fails, the other end is outside the stack).
+ */
+ if (obj < stack || stackend < obj + len)
+ return BAD_STACK;
+
+ /* Check if object is safely within a valid frame. */
+ ret = arch_within_stack_frames(stack, stackend, obj, len);
+ if (ret)
+ return ret;
+
+ return GOOD_STACK;
+}
+
+static void report_usercopy(const void *ptr, unsigned long len,
+ bool to_user, const char *type)
+{
+ pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
+ to_user ? "exposure" : "overwrite",
+ to_user ? "from" : "to", ptr, type ? : "unknown", len);
+ /*
+ * For greater effect, it would be nice to do do_group_exit(),
+ * but BUG() actually hooks all the lock-breaking and per-arch
+ * Oops code, so that is used here instead.
+ */
+ BUG();
+}
+
+/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */
+static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
+ unsigned long high)
+{
+ unsigned long check_low = (uintptr_t)ptr;
+ unsigned long check_high = check_low + n;
+
+ /* Does not overlap if entirely above or entirely below. */
+ if (check_low >= high || check_high < low)
+ return false;
+
+ return true;
+}
+
+/* Is this address range in the kernel text area? */
+static inline const char *check_kernel_text_object(const void *ptr,
+ unsigned long n)
+{
+ unsigned long textlow = (unsigned long)_stext;
+ unsigned long texthigh = (unsigned long)_etext;
+ unsigned long textlow_linear, texthigh_linear;
+
+ if (overlaps(ptr, n, textlow, texthigh))
+ return "<kernel text>";
+
+ /*
+ * Some architectures have virtual memory mappings with a secondary
+ * mapping of the kernel text, i.e. there is more than one virtual
+ * kernel address that points to the kernel image. It is usually
+ * when there is a separate linear physical memory mapping, in that
+ * __pa() is not just the reverse of __va(). This can be detected
+ * and checked:
+ */
+ textlow_linear = (unsigned long)__va(__pa(textlow));
+ /* No different mapping: we're done. */
+ if (textlow_linear == textlow)
+ return NULL;
+
+ /* Check the secondary mapping... */
+ texthigh_linear = (unsigned long)__va(__pa(texthigh));
+ if (overlaps(ptr, n, textlow_linear, texthigh_linear))
+ return "<linear kernel text>";
+
+ return NULL;
+}
+
+static inline const char *check_bogus_address(const void *ptr, unsigned long n)
+{
+ /* Reject if object wraps past end of memory. */
+ if (ptr + n < ptr)
+ return "<wrapped address>";
+
+ /* Reject if NULL or ZERO-allocation. */
+ if (ZERO_OR_NULL_PTR(ptr))
+ return "<null>";
+
+ return NULL;
+}
+
+static inline const char *check_heap_object(const void *ptr, unsigned long n,
+ bool to_user)
+{
+ struct page *page, *endpage;
+ const void *end = ptr + n - 1;
+ bool is_reserved, is_cma;
+
+ /*
+ * Some architectures (arm64) return true for virt_addr_valid() on
+ * vmalloced addresses. Work around this by checking for vmalloc
+ * first.
+ */
+ if (is_vmalloc_addr(ptr))
+ return NULL;
+
+ if (!virt_addr_valid(ptr))
+ return NULL;
+
+ page = virt_to_head_page(ptr);
+
+ /* Check slab allocator for flags and size. */
+ if (PageSlab(page))
+ return __check_heap_object(ptr, n, page);
+
+ /*
+ * Sometimes the kernel data regions are not marked Reserved (see
+ * check below). And sometimes [_sdata,_edata) does not cover
+ * rodata and/or bss, so check each range explicitly.
+ */
+
+ /* Allow reads of kernel rodata region (if not marked as Reserved). */
+ if (ptr >= (const void *)__start_rodata &&
+ end <= (const void *)__end_rodata) {
+ if (!to_user)
+ return "<rodata>";
+ return NULL;
+ }
+
+ /* Allow kernel data region (if not marked as Reserved). */
+ if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
+ return NULL;
+
+ /* Allow kernel bss region (if not marked as Reserved). */
+ if (ptr >= (const void *)__bss_start &&
+ end <= (const void *)__bss_stop)
+ return NULL;
+
+ /* Is the object wholly within one base page? */
+ if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
+ ((unsigned long)end & (unsigned long)PAGE_MASK)))
+ return NULL;
+
+ /* Allow if start and end are inside the same compound page. */
+ endpage = virt_to_head_page(end);
+ if (likely(endpage == page))
+ return NULL;
+
+ /*
+ * Reject if range is entirely either Reserved (i.e. special or
+ * device memory), or CMA. Otherwise, reject since the object spans
+ * several independently allocated pages.
+ */
+ is_reserved = PageReserved(page);
+ is_cma = is_migrate_cma_page(page);
+ if (!is_reserved && !is_cma)
+ goto reject;
+
+ for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
+ page = virt_to_head_page(ptr);
+ if (is_reserved && !PageReserved(page))
+ goto reject;
+ if (is_cma && !is_migrate_cma_page(page))
+ goto reject;
+ }
+
+ return NULL;
+
+reject:
+ return "<spans multiple pages>";
+}
+
+/*
+ * Validates that the given object is:
+ * - not bogus address
+ * - known-safe heap or stack object
+ * - not in kernel text
+ */
+void __check_object_size(const void *ptr, unsigned long n, bool to_user)
+{
+ const char *err;
+
+ /* Skip all tests if size is zero. */
+ if (!n)
+ return;
+
+ /* Check for invalid addresses. */
+ err = check_bogus_address(ptr, n);
+ if (err)
+ goto report;
+
+ /* Check for bad heap object. */
+ err = check_heap_object(ptr, n, to_user);
+ if (err)
+ goto report;
+
+ /* Check for bad stack object. */
+ switch (check_stack_object(ptr, n)) {
+ case NOT_STACK:
+ /* Object is not touching the current process stack. */
+ break;
+ case GOOD_FRAME:
+ case GOOD_STACK:
+ /*
+ * Object is either in the correct frame (when it
+ * is possible to check) or just generally on the
+ * process stack (when frame checking not available).
+ */
+ return;
+ default:
+ err = "<process stack>";
+ goto report;
+ }
+
+ /* Check for object in kernel to avoid text exposure. */
+ err = check_kernel_text_object(ptr, n);
+ if (!err)
+ return;
+
+report:
+ report_usercopy(ptr, n, to_user, err);
+}
+EXPORT_SYMBOL(__check_object_size);