summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/memory.txt70
-rw-r--r--Documentation/sysctl/vm.txt50
-rw-r--r--Documentation/vm/overcommit-accounting8
-rw-r--r--arch/alpha/kernel/sys_nautilus.c5
-rw-r--r--arch/alpha/mm/init.c24
-rw-r--r--arch/alpha/mm/numa.c3
-rw-r--r--arch/arc/mm/init.c23
-rw-r--r--arch/arm/include/asm/pgtable.h9
-rw-r--r--arch/arm/mm/init.c50
-rw-r--r--arch/arm64/mm/init.c26
-rw-r--r--arch/arm64/mm/mmu.c13
-rw-r--r--arch/avr32/mm/init.c24
-rw-r--r--arch/blackfin/mm/init.c22
-rw-r--r--arch/c6x/mm/init.c30
-rw-r--r--arch/cris/mm/init.c16
-rw-r--r--arch/frv/mm/init.c38
-rw-r--r--arch/h8300/mm/init.c30
-rw-r--r--arch/ia64/include/asm/hugetlb.h1
-rw-r--r--arch/ia64/mm/contig.c2
-rw-r--r--arch/ia64/mm/discontig.c9
-rw-r--r--arch/ia64/mm/init.c23
-rw-r--r--arch/ia64/mm/numa.c15
-rw-r--r--arch/m32r/mm/init.c26
-rw-r--r--arch/m68k/mm/init.c24
-rw-r--r--arch/metag/mm/init.c31
-rw-r--r--arch/microblaze/include/asm/setup.h1
-rw-r--r--arch/microblaze/mm/init.c34
-rw-r--r--arch/mips/include/asm/hugetlb.h1
-rw-r--r--arch/mips/mm/init.c37
-rw-r--r--arch/mips/sgi-ip27/ip27-memory.c4
-rw-r--r--arch/mn10300/mm/init.c23
-rw-r--r--arch/openrisc/mm/init.c27
-rw-r--r--arch/parisc/mm/init.c25
-rw-r--r--arch/powerpc/include/asm/hugetlb.h1
-rw-r--r--arch/powerpc/kernel/crash_dump.c5
-rw-r--r--arch/powerpc/kernel/fadump.c5
-rw-r--r--arch/powerpc/kernel/kvm.c7
-rw-r--r--arch/powerpc/mm/init_64.c11
-rw-r--r--arch/powerpc/mm/mem.c35
-rw-r--r--arch/powerpc/mm/numa.c9
-rw-r--r--arch/powerpc/platforms/512x/mpc512x_shared.c5
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c12
-rw-r--r--arch/s390/include/asm/hugetlb.h56
-rw-r--r--arch/s390/include/asm/pgtable.h95
-rw-r--r--arch/s390/mm/hugetlbpage.c2
-rw-r--r--arch/s390/mm/init.c35
-rw-r--r--arch/s390/mm/vmem.c15
-rw-r--r--arch/score/mm/init.c33
-rw-r--r--arch/sh/include/asm/hugetlb.h1
-rw-r--r--arch/sh/mm/init.c26
-rw-r--r--arch/sparc/include/asm/hugetlb.h1
-rw-r--r--arch/sparc/mm/init_32.c12
-rw-r--r--arch/sparc/mm/init_64.c7
-rw-r--r--arch/tile/include/asm/hugetlb.h1
-rw-r--r--arch/tile/mm/pgtable.c7
-rw-r--r--arch/um/kernel/mem.c26
-rw-r--r--arch/unicore32/mm/init.c31
-rw-r--r--arch/unicore32/mm/ioremap.c17
-rw-r--r--arch/x86/include/asm/hugetlb.h1
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl48
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh41
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init.c5
-rw-r--r--arch/x86/mm/init_32.c10
-rw-r--r--arch/x86/mm/init_64.c73
-rw-r--r--arch/x86/mm/ioremap.c7
-rw-r--r--arch/x86/mm/numa.c9
-rw-r--r--arch/xtensa/mm/init.c21
-rw-r--r--drivers/base/cpu.c25
-rw-r--r--drivers/base/memory.c49
-rw-r--r--drivers/base/node.c8
-rw-r--r--drivers/firmware/memmap.c9
-rw-r--r--drivers/video/Kconfig9
-rw-r--r--drivers/video/Makefile1
-rw-r--r--drivers/video/console/fbcon_cw.c3
-rw-r--r--drivers/video/ep93xx-fb.c18
-rw-r--r--drivers/video/exynos/exynos_mipi_dsi.c8
-rw-r--r--drivers/video/hyperv_fb.c829
-rw-r--r--drivers/video/matrox/matroxfb_maven.c16
-rw-r--r--drivers/video/mmp/hw/mmp_ctrl.h479
-rw-r--r--fs/Makefile3
-rw-r--r--fs/buffer.c11
-rw-r--r--fs/direct-io.c31
-rw-r--r--fs/exec.c4
-rw-r--r--fs/ext3/super.c1
-rw-r--r--fs/fscache/stats.c2
-rw-r--r--fs/jbd/commit.c25
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c6
-rw-r--r--fs/ocfs2/ioctl.c22
-rw-r--r--fs/ocfs2/move_extents.c53
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/internal.h18
-rw-r--r--fs/proc/kcore.c9
-rw-r--r--fs/proc/meminfo.c1
-rw-r--r--fs/proc/mmu.c60
-rw-r--r--fs/read_write.c2
-rw-r--r--include/asm-generic/hugetlb.h40
-rw-r--r--include/asm-generic/pgtable.h10
-rw-r--r--include/linux/blk_types.h3
-rw-r--r--include/linux/bootmem.h1
-rw-r--r--include/linux/buffer_head.h1
-rw-r--r--include/linux/cgroup.h7
-rw-r--r--include/linux/debug_locks.h2
-rw-r--r--include/linux/huge_mm.h11
-rw-r--r--include/linux/hugetlb.h4
-rw-r--r--include/linux/hyperv.h11
-rw-r--r--include/linux/ioport.h4
-rw-r--r--include/linux/memory.h17
-rw-r--r--include/linux/memory_hotplug.h4
-rw-r--r--include/linux/mm.h79
-rw-r--r--include/linux/notifier.h5
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--include/linux/ramfs.h8
-rw-r--r--include/linux/swap.h10
-rw-r--r--include/linux/vmalloc.h34
-rw-r--r--include/linux/vmpressure.h47
-rw-r--r--include/linux/vmstat.h7
-rw-r--r--include/trace/events/filemap.h58
-rw-r--r--include/uapi/linux/fs.h1
-rw-r--r--ipc/util.c15
-rw-r--r--kernel/audit.c12
-rw-r--r--kernel/audit.h3
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/auditfilter.c4
-rw-r--r--kernel/auditsc.c14
-rw-r--r--kernel/cgroup.c49
-rw-r--r--kernel/cpuset.c12
-rw-r--r--kernel/kexec.c13
-rw-r--r--kernel/kthread.c44
-rw-r--r--kernel/resource.c198
-rw-r--r--kernel/sysctl.c15
-rw-r--r--lib/show_mem.c3
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bounce.c21
-rw-r--r--mm/filemap.c34
-rw-r--r--mm/huge_memory.c77
-rw-r--r--mm/hugetlb.c39
-rw-r--r--mm/madvise.c31
-rw-r--r--mm/memblock.c12
-rw-r--r--mm/memcontrol.c258
-rw-r--r--mm/memory-failure.c4
-rw-r--r--mm/memory.c5
-rw-r--r--mm/memory_hotplug.c93
-rw-r--r--mm/migrate.c24
-rw-r--r--mm/mmap.c196
-rw-r--r--mm/nobootmem.c6
-rw-r--r--mm/nommu.c68
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c77
-rw-r--r--mm/page_io.c36
-rw-r--r--mm/rmap.c3
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/slub.c9
-rw-r--r--mm/sparse-vmemmap.c27
-rw-r--r--mm/sparse.c82
-rw-r--r--mm/swap.c11
-rw-r--r--mm/swap_state.c6
-rw-r--r--mm/vmalloc.c218
-rw-r--r--mm/vmpressure.c374
-rw-r--r--mm/vmscan.c14
-rw-r--r--mm/vmstat.c6
-rwxr-xr-xscripts/decodecode8
166 files changed, 3438 insertions, 2177 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 8b8c28b9864c..f336ede58e62 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -40,6 +40,7 @@ Features:
- soft limit
- moving (recharging) account at moving a task is selectable.
- usage threshold notifier
+ - memory pressure notifier
- oom-killer disable knob and oom-notifier
- Root cgroup has no limit controls.
@@ -65,6 +66,7 @@ Brief summary of control files.
memory.stat # show various statistics
memory.use_hierarchy # set/show hierarchical account enabled
memory.force_empty # trigger forced move charge to parent
+ memory.pressure_level # set memory pressure notifications
memory.swappiness # set/show swappiness parameter of vmscan
(See sysctl's vm.swappiness)
memory.move_charge_at_immigrate # set/show controls of moving charges
@@ -762,7 +764,73 @@ At reading, current status of OOM is shown.
under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
be stopped.)
-11. TODO
+11. Memory Pressure
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as following:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining cache level. Upon notification, the program (typically
+"Activity Manager") might analyze vmstat and act in advance (i.e.
+prematurely shutdown unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure, the system might be making swap, paging out active file caches,
+etc. Upon this event applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from a disk.
+
+The "critical" level means that the system is actively thrashing, it is
+about to out of memory (OOM) or even the in-kernel OOM killer is on its
+way to trigger. Applications should do whatever they can to help the
+system. It might be too late to consult with vmstat or any other
+statistics, so it's advisable to take an immediate action.
+
+The events are propagated upward until the event is handled, i.e. the
+events are not pass-through. Here is what this means: for example you have
+three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
+and C, and suppose group C experiences some pressure. In this situation,
+only group C will receive the notification, i.e. groups A and B will not
+receive it. This is done to avoid excessive "broadcasting" of messages,
+which disturbs the system and which is especially bad if we are low on
+memory or thrashing. So, organize the cgroups wisely, or propagate the
+events manually (or, ask us to implement the pass-through events,
+explaining why would you need them.)
+
+The file memory.pressure_level is only used to setup an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write string like "<event_fd> <fd of memory.pressure_level> <level>"
+ to cgroup.event_control.
+
+Application will be notified through eventfd when memory pressure is at
+the specific level (or higher). Read/write operations to
+memory.pressure_level are no implemented.
+
+Test:
+
+ Here is a small script example that makes a new cgroup, sets up a
+ memory limit, sets up a notification in the cgroup and then makes child
+ cgroup experience a critical pressure:
+
+ # cd /sys/fs/cgroup/memory/
+ # mkdir foo
+ # cd foo
+ # cgroup_event_listener memory.pressure_level low &
+ # echo 8000000 > memory.limit_in_bytes
+ # echo 8000000 > memory.memsw.limit_in_bytes
+ # echo $$ > tasks
+ # dd if=/dev/zero | read x
+
+ (Expect a bunch of notifications, and eventually, the oom-killer will
+ trigger.)
+
+12. TODO
1. Add support for accounting huge pages (as a separate controller)
2. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 078701fdbd4d..dcc75a9ed919 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -18,6 +18,7 @@ files can be found in mm/swap.c.
Currently, these files are in /proc/sys/vm:
+- admin_reserve_kbytes
- block_dump
- compact_memory
- dirty_background_bytes
@@ -53,11 +54,41 @@ Currently, these files are in /proc/sys/vm:
- percpu_pagelist_fraction
- stat_interval
- swappiness
+- user_reserve_kbytes
- vfs_cache_pressure
- zone_reclaim_mode
==============================================================
+admin_reserve_kbytes
+
+The amount of free memory in the system that should be reserved for users
+with the capability cap_sys_admin.
+
+admin_reserve_kbytes defaults to min(3% of free pages, 8MB)
+
+That should provide enough for the admin to log in and kill a process,
+if necessary, under the default overcommit 'guess' mode.
+
+Systems running under overcommit 'never' should increase this to account
+for the full Virtual Memory Size of programs used to recover. Otherwise,
+root may not be able to log in to recover the system.
+
+How do you calculate a minimum useful reserve?
+
+sshd or login + bash (or some other shell) + top (or ps, kill, etc.)
+
+For overcommit 'guess', we can sum resident set sizes (RSS).
+On x86_64 this is about 8MB.
+
+For overcommit 'never', we can take the max of their virtual sizes (VSZ)
+and add the sum of their RSS.
+On x86_64 this is about 128MB.
+
+Changing this takes effect whenever an application requests memory.
+
+==============================================================
+
block_dump
block_dump enables block I/O debugging when set to a nonzero value. More
@@ -542,6 +573,7 @@ memory until it actually runs out.
When this flag is 2, the kernel uses a "never overcommit"
policy that attempts to prevent any overcommit of memory.
+Note that user_reserve_kbytes affects this policy.
This feature can be very useful because there are a lot of
programs that malloc() huge amounts of memory "just-in-case"
@@ -645,6 +677,24 @@ The default value is 60.
==============================================================
+- user_reserve_kbytes
+
+When overcommit_memory is set to 2, "never overommit" mode, reserve
+min(3% of current process size, user_reserve_kbytes) of free memory.
+This is intended to prevent a user from starting a single memory hogging
+process, such that they cannot recover (kill the hog).
+
+user_reserve_kbytes defaults to min(3% of the current process size, 128MB).
+
+If this is reduced to zero, then the user will be allowed to allocate
+all free memory with a single process, minus admin_reserve_kbytes.
+Any subsequent attempts to execute a command will result in
+"fork: Cannot allocate memory".
+
+Changing this takes effect whenever an application requests memory.
+
+==============================================================
+
vfs_cache_pressure
------------------
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
index 706d7ed9d8d2..8eaa2fc4b8fa 100644
--- a/Documentation/vm/overcommit-accounting
+++ b/Documentation/vm/overcommit-accounting
@@ -8,7 +8,9 @@ The Linux kernel supports the following overcommit handling modes
default.
1 - Always overcommit. Appropriate for some scientific
- applications.
+ applications. Classic example is code using sparse arrays
+ and just relying on the virtual memory consisting almost
+ entirely of zero pages.
2 - Don't overcommit. The total address space commit
for the system is not permitted to exceed swap + a
@@ -18,6 +20,10 @@ The Linux kernel supports the following overcommit handling modes
pages but will receive errors on memory allocation as
appropriate.
+ Useful for applications that want to guarantee their
+ memory allocations will be available in the future
+ without having to initialize every page.
+
The overcommit policy is set via the sysctl `vm.overcommit_memory'.
The overcommit percentage is set via `vm.overcommit_ratio'.
diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c
index 1383f8601a93..1d4aabfcf9a1 100644
--- a/arch/alpha/kernel/sys_nautilus.c
+++ b/arch/alpha/kernel/sys_nautilus.c
@@ -185,7 +185,6 @@ nautilus_machine_check(unsigned long vector, unsigned long la_ptr)
mb();
}
-extern void free_reserved_mem(void *, void *);
extern void pcibios_claim_one_bus(struct pci_bus *);
static struct resource irongate_io = {
@@ -239,8 +238,8 @@ nautilus_init_pci(void)
if (pci_mem < memtop)
memtop = pci_mem;
if (memtop > alpha_mv.min_mem_address) {
- free_reserved_mem(__va(alpha_mv.min_mem_address),
- __va(memtop));
+ free_reserved_area((unsigned long)__va(alpha_mv.min_mem_address),
+ (unsigned long)__va(memtop), 0, NULL);
printk("nautilus_init_pci: %ldk freed\n",
(memtop - alpha_mv.min_mem_address) >> 10);
}
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 1ad6ca74bed2..0ba85ee4a466 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -31,6 +31,7 @@
#include <asm/console.h>
#include <asm/tlb.h>
#include <asm/setup.h>
+#include <asm/sections.h>
extern void die_if_kernel(char *,struct pt_regs *,long);
@@ -281,8 +282,6 @@ printk_memory_info(void)
{
unsigned long codesize, reservedpages, datasize, initsize, tmp;
extern int page_is_ram(unsigned long) __init;
- extern char _text, _etext, _data, _edata;
- extern char __init_begin, __init_end;
/* printk all informations */
reservedpages = 0;
@@ -318,32 +317,15 @@ mem_init(void)
#endif /* CONFIG_DISCONTIGMEM */
void
-free_reserved_mem(void *start, void *end)
-{
- void *__start = start;
- for (; __start < end; __start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(__start));
- init_page_count(virt_to_page(__start));
- free_page((long)__start);
- totalram_pages++;
- }
-}
-
-void
free_initmem(void)
{
- extern char __init_begin, __init_end;
-
- free_reserved_mem(&__init_begin, &__init_end);
- printk ("Freeing unused kernel memory: %ldk freed\n",
- (&__init_end - &__init_begin) >> 10);
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
void
free_initrd_mem(unsigned long start, unsigned long end)
{
- free_reserved_mem((void *)start, (void *)end);
- printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index 3973ae395772..33885048fa36 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -17,6 +17,7 @@
#include <asm/hwrpb.h>
#include <asm/pgalloc.h>
+#include <asm/sections.h>
pg_data_t node_data[MAX_NUMNODES];
EXPORT_SYMBOL(node_data);
@@ -325,8 +326,6 @@ void __init mem_init(void)
{
unsigned long codesize, reservedpages, datasize, initsize, pfn;
extern int page_is_ram(unsigned long) __init;
- extern char _text, _etext, _data, _edata;
- extern char __init_begin, __init_end;
unsigned long nid, i;
high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index caf797de23fc..727d4794ea0f 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -144,37 +144,18 @@ void __init mem_init(void)
PAGES_TO_KB(reserved_pages));
}
-static void __init free_init_pages(const char *what, unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
-
- pr_info("Freeing %s: %ldk [%lx] to [%lx]\n",
- what, TO_KB(end - begin), begin, end);
-
- /* need to check that the page we free is not a partial page */
- for (addr = begin; addr + PAGE_SIZE <= end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
-}
-
/*
* free_initmem: Free all the __init memory.
*/
void __init_refok free_initmem(void)
{
- free_init_pages("unused kernel memory",
- (unsigned long)__init_begin,
- (unsigned long)__init_end);
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- free_init_pages("initrd memory", start, end);
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 80d6fc4dbe4a..9bcd262a9008 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -61,6 +61,15 @@ extern void __pgd_error(const char *file, int line, pgd_t);
#define FIRST_USER_ADDRESS PAGE_SIZE
/*
+ * Use TASK_SIZE as the ceiling argument for free_pgtables() and
+ * free_pgd_range() to avoid freeing the modules pmd when LPAE is enabled (pmd
+ * page shared between user and kernel).
+ */
+#ifdef CONFIG_ARM_LPAE
+#define USER_PGTABLES_CEILING TASK_SIZE
+#endif
+
+/*
* The pgprot_* and protection_map entries will be fixed up in runtime
* to include the cachable and bufferable bits based on memory policy,
* as well as any architecture dependent bits like global/ASID and SMP
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index ad722f1208a5..9a5cdc01fcdf 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -99,6 +99,9 @@ void show_mem(unsigned int filter)
printk("Mem-info:\n");
show_free_areas(filter);
+ if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
+ return;
+
for_each_bank (i, mi) {
struct membank *bank = &mi->bank[i];
unsigned int pfn1, pfn2;
@@ -424,24 +427,6 @@ void __init bootmem_init(void)
max_pfn = max_high - PHYS_PFN_OFFSET;
}
-static inline int free_area(unsigned long pfn, unsigned long end, char *s)
-{
- unsigned int pages = 0, size = (end - pfn) << (PAGE_SHIFT - 10);
-
- for (; pfn < end; pfn++) {
- struct page *page = pfn_to_page(pfn);
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- pages++;
- }
-
- if (size && s)
- printk(KERN_INFO "Freeing %s memory: %dK\n", s, size);
-
- return pages;
-}
-
/*
* Poison init memory with an undefined instruction (ARM) or a branch to an
* undefined instruction (Thumb).
@@ -534,6 +519,14 @@ static void __init free_unused_memmap(struct meminfo *mi)
#endif
}
+#ifdef CONFIG_HIGHMEM
+static inline void free_area_high(unsigned long pfn, unsigned long end)
+{
+ for (; pfn < end; pfn++)
+ free_highmem_page(pfn_to_page(pfn));
+}
+#endif
+
static void __init free_highpages(void)
{
#ifdef CONFIG_HIGHMEM
@@ -569,8 +562,7 @@ static void __init free_highpages(void)
if (res_end > end)
res_end = end;
if (res_start != start)
- totalhigh_pages += free_area(start, res_start,
- NULL);
+ free_area_high(start, res_start);
start = res_end;
if (start == end)
break;
@@ -578,9 +570,8 @@ static void __init free_highpages(void)
/* And now free anything which remains */
if (start < end)
- totalhigh_pages += free_area(start, end, NULL);
+ free_area_high(start, end);
}
- totalram_pages += totalhigh_pages;
#endif
}
@@ -609,8 +600,7 @@ void __init mem_init(void)
#ifdef CONFIG_SA1111
/* now that our DMA memory is actually so designated, we can free it */
- totalram_pages += free_area(PHYS_PFN_OFFSET,
- __phys_to_pfn(__pa(swapper_pg_dir)), NULL);
+ free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL);
#endif
free_highpages();
@@ -738,16 +728,12 @@ void free_initmem(void)
extern char __tcm_start, __tcm_end;
poison_init_mem(&__tcm_start, &__tcm_end - &__tcm_start);
- totalram_pages += free_area(__phys_to_pfn(__pa(&__tcm_start)),
- __phys_to_pfn(__pa(&__tcm_end)),
- "TCM link");
+ free_reserved_area(&__tcm_start, &__tcm_end, 0, "TCM link");
#endif
poison_init_mem(__init_begin, __init_end - __init_begin);
if (!machine_is_integrator() && !machine_is_cintegrator())
- totalram_pages += free_area(__phys_to_pfn(__pa(__init_begin)),
- __phys_to_pfn(__pa(__init_end)),
- "init");
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -758,9 +744,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
{
if (!keep_initrd) {
poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
- totalram_pages += free_area(__phys_to_pfn(__pa(start)),
- __phys_to_pfn(__pa(end)),
- "initrd");
+ free_reserved_area(start, end, 0, "initrd");
}
}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 800aac306a08..f497ca77925a 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -197,24 +197,6 @@ void __init bootmem_init(void)
max_pfn = max_low_pfn = max;
}
-static inline int free_area(unsigned long pfn, unsigned long end, char *s)
-{
- unsigned int pages = 0, size = (end - pfn) << (PAGE_SHIFT - 10);
-
- for (; pfn < end; pfn++) {
- struct page *page = pfn_to_page(pfn);
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- pages++;
- }
-
- if (size && s)
- pr_info("Freeing %s memory: %dK\n", s, size);
-
- return pages;
-}
-
/*
* Poison init memory with an undefined instruction (0x0).
*/
@@ -405,9 +387,7 @@ void __init mem_init(void)
void free_initmem(void)
{
poison_init_mem(__init_begin, __init_end - __init_begin);
- totalram_pages += free_area(__phys_to_pfn(__pa(__init_begin)),
- __phys_to_pfn(__pa(__init_end)),
- "init");
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -418,9 +398,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
{
if (!keep_initrd) {
poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
- totalram_pages += free_area(__phys_to_pfn(__pa(start)),
- __phys_to_pfn(__pa(end)),
- "initrd");
+ free_reserved_area(start, end, 0, "initrd");
}
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 70b8cd4021c4..eeecc9c8ed68 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -391,17 +391,14 @@ int kern_addr_valid(unsigned long addr)
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#ifdef CONFIG_ARM64_64K_PAGES
-int __meminit vmemmap_populate(struct page *start_page,
- unsigned long size, int node)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
- return vmemmap_populate_basepages(start_page, size, node);
+ return vmemmap_populate_basepages(start, end, node);
}
#else /* !CONFIG_ARM64_64K_PAGES */
-int __meminit vmemmap_populate(struct page *start_page,
- unsigned long size, int node)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
- unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long addr = start;
unsigned long next;
pgd_t *pgd;
pud_t *pud;
@@ -434,7 +431,7 @@ int __meminit vmemmap_populate(struct page *start_page,
return 0;
}
#endif /* CONFIG_ARM64_64K_PAGES */
-void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+void vmemmap_free(unsigned long start, unsigned long end)
{
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c
index 2798c2d4a1cf..e66e8406f992 100644
--- a/arch/avr32/mm/init.c
+++ b/arch/avr32/mm/init.c
@@ -146,34 +146,14 @@ void __init mem_init(void)
initsize >> 10);
}
-static inline void free_area(unsigned long addr, unsigned long end, char *s)
-{
- unsigned int size = (end - addr) >> 10;
-
- for (; addr < end; addr += PAGE_SIZE) {
- struct page *page = virt_to_page(addr);
- ClearPageReserved(page);
- init_page_count(page);
- free_page(addr);
- totalram_pages++;
- }
-
- if (size && s)
- printk(KERN_INFO "Freeing %s memory: %dK (%lx - %lx)\n",
- s, size, end - (size << 10), end);
-}
-
void free_initmem(void)
{
- free_area((unsigned long)__init_begin, (unsigned long)__init_end,
- "init");
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
-
void free_initrd_mem(unsigned long start, unsigned long end)
{
- free_area(start, end, "initrd");
+ free_reserved_area(start, end, 0, "initrd");
}
-
#endif
diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c
index 9cb85537bd2b..82d01a71207f 100644
--- a/arch/blackfin/mm/init.c
+++ b/arch/blackfin/mm/init.c
@@ -103,7 +103,7 @@ void __init mem_init(void)
max_mapnr = num_physpages = MAP_NR(high_memory);
printk(KERN_DEBUG "Kernel managed physical pages: %lu\n", num_physpages);
- /* This will put all memory onto the freelists. */
+ /* This will put all low memory onto the freelists. */
totalram_pages = free_all_bootmem();
reservedpages = 0;
@@ -129,24 +129,11 @@ void __init mem_init(void)
initk, codek, datak, DMA_UNCACHED_REGION >> 10, (reservedpages << (PAGE_SHIFT-10)));
}
-static void __init free_init_pages(const char *what, unsigned long begin, unsigned long end)
-{
- unsigned long addr;
- /* next to check that the page we free is not a partial page */
- for (addr = begin; addr + PAGE_SIZE <= end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
-}
-
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
#ifndef CONFIG_MPU
- free_init_pages("initrd memory", start, end);
+ free_reserved_area(start, end, 0, "initrd");
#endif
}
#endif
@@ -154,10 +141,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
void __init_refok free_initmem(void)
{
#if defined CONFIG_RAMKERNEL && !defined CONFIG_MPU
- free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
-
+ free_initmem_default(0);
if (memory_start == (unsigned long)(&__init_end))
memory_start = (unsigned long)(&__init_begin);
#endif
diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c
index 89395f09648a..a9fcd89b251b 100644
--- a/arch/c6x/mm/init.c
+++ b/arch/c6x/mm/init.c
@@ -77,37 +77,11 @@ void __init mem_init(void)
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- int pages = 0;
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- pages++;
- }
- printk(KERN_INFO "Freeing initrd memory: %luk freed\n",
- (pages * PAGE_SIZE) >> 10);
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
void __init free_initmem(void)
{
- unsigned long addr;
-
- /*
- * The following code should be cool even if these sections
- * are not page aligned.
- */
- addr = PAGE_ALIGN((unsigned long)(__init_begin));
-
- /* next to check that the page we free is not a partial page */
- for (; addr + PAGE_SIZE < (unsigned long)(__init_end);
- addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing unused kernel memory: %dK freed\n",
- (int) ((addr - PAGE_ALIGN((long) &__init_begin)) >> 10));
+ free_initmem_default(0);
}
diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c
index d72ab58fd83e..9ac80946dada 100644
--- a/arch/cris/mm/init.c
+++ b/arch/cris/mm/init.c
@@ -12,12 +12,10 @@
#include <linux/init.h>
#include <linux/bootmem.h>
#include <asm/tlb.h>
+#include <asm/sections.h>
unsigned long empty_zero_page;
-extern char _stext, _edata, _etext; /* From linkerscript */
-extern char __init_begin, __init_end;
-
void __init
mem_init(void)
{
@@ -67,15 +65,5 @@ mem_init(void)
void
free_initmem(void)
{
- unsigned long addr;
-
- addr = (unsigned long)(&__init_begin);
- for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- printk (KERN_INFO "Freeing unused kernel memory: %luk freed\n",
- (unsigned long)((&__init_end - &__init_begin) >> 10));
+ free_initmem_default(0);
}
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index 92e97b0894a6..dee354fa6b64 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -122,7 +122,7 @@ void __init mem_init(void)
#endif
int codek = 0, datak = 0;
- /* this will put all memory onto the freelists */
+ /* this will put all low memory onto the freelists */
totalram_pages = free_all_bootmem();
#ifdef CONFIG_MMU
@@ -131,14 +131,8 @@ void __init mem_init(void)
datapages++;
#ifdef CONFIG_HIGHMEM
- for (pfn = num_physpages - 1; pfn >= num_mappedpages; pfn--) {
- struct page *page = &mem_map[pfn];
-
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalram_pages++;
- }
+ for (pfn = num_physpages - 1; pfn >= num_mappedpages; pfn--)
+ free_highmem_page(&mem_map[pfn]);
#endif
codek = ((unsigned long) &_etext - (unsigned long) &_stext) >> 10;
@@ -168,21 +162,7 @@ void __init mem_init(void)
void free_initmem(void)
{
#if defined(CONFIG_RAMKERNEL) && !defined(CONFIG_PROTECT_KERNEL)
- unsigned long start, end, addr;
-
- start = PAGE_ALIGN((unsigned long) &__init_begin); /* round up */
- end = ((unsigned long) &__init_end) & PAGE_MASK; /* round down */
-
- /* next to check that the page we free is not a partial page */
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
-
- printk("Freeing unused kernel memory: %ldKiB freed (0x%lx - 0x%lx)\n",
- (end - start) >> 10, start, end);
+ free_initmem_default(0);
#endif
} /* end free_initmem() */
@@ -193,14 +173,6 @@ void free_initmem(void)
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- int pages = 0;
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- pages++;
- }
- printk("Freeing initrd memory: %dKiB freed\n", (pages * PAGE_SIZE) >> 10);
+ free_reserved_area(start, end, 0, "initrd");
} /* end free_initrd_mem() */
#endif
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 981e25094b1a..ff349d70a29b 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -139,7 +139,7 @@ void __init mem_init(void)
start_mem = PAGE_ALIGN(start_mem);
max_mapnr = num_physpages = MAP_NR(high_memory);
- /* this will put all memory onto the freelists */
+ /* this will put all low memory onto the freelists */
totalram_pages = free_all_bootmem();
codek = (_etext - _stext) >> 10;
@@ -161,15 +161,7 @@ void __init mem_init(void)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- int pages = 0;
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- pages++;
- }
- printk ("Freeing initrd memory: %dk freed\n", pages);
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
@@ -177,23 +169,7 @@ void
free_initmem(void)
{
#ifdef CONFIG_RAMKERNEL
- unsigned long addr;
-/*
- * the following code should be cool even if these sections
- * are not page aligned.
- */
- addr = PAGE_ALIGN((unsigned long)(__init_begin));
- /* next to check that the page we free is not a partial page */
- for (; addr + PAGE_SIZE < (unsigned long)__init_end; addr +=PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing unused kernel memory: %ldk freed (0x%x - 0x%x)\n",
- (addr - PAGE_ALIGN((long) __init_begin)) >> 10,
- (int)(PAGE_ALIGN((unsigned long)__init_begin)),
- (int)(addr - PAGE_SIZE));
+ free_initmem_default(0);
#endif
}
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index 94eaa5bd5d0c..aa910054b8e7 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -2,6 +2,7 @@
#define _ASM_IA64_HUGETLB_H
#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index 80dab509dfb0..67c59ebec899 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -47,6 +47,8 @@ void show_mem(unsigned int filter)
printk(KERN_INFO "Mem-info:\n");
show_free_areas(filter);
printk(KERN_INFO "Node memory in pages:\n");
+ if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
+ return;
for_each_online_pgdat(pgdat) {
unsigned long present;
unsigned long flags;
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c2e955ee79a8..ae4db4bd6d97 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -623,6 +623,8 @@ void show_mem(unsigned int filter)
printk(KERN_INFO "Mem-info:\n");
show_free_areas(filter);
+ if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
+ return;
printk(KERN_INFO "Node memory in pages:\n");
for_each_online_pgdat(pgdat) {
unsigned long present;
@@ -817,13 +819,12 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int __meminit vmemmap_populate(struct page *start_page,
- unsigned long size, int node)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
- return vmemmap_populate_basepages(start_page, size, node);
+ return vmemmap_populate_basepages(start, end, node);
}
-void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+void vmemmap_free(unsigned long start, unsigned long end)
{
}
#endif
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 20bc967c7209..d1fe4b402601 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -154,25 +154,14 @@ ia64_init_addr_space (void)
void
free_initmem (void)
{
- unsigned long addr, eaddr;
-
- addr = (unsigned long) ia64_imva(__init_begin);
- eaddr = (unsigned long) ia64_imva(__init_end);
- while (addr < eaddr) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- ++totalram_pages;
- addr += PAGE_SIZE;
- }
- printk(KERN_INFO "Freeing unused kernel memory: %ldkB freed\n",
- (__init_end - __init_begin) >> 10);
+ free_reserved_area((unsigned long)ia64_imva(__init_begin),
+ (unsigned long)ia64_imva(__init_end),
+ 0, "unused kernel");
}
void __init
free_initrd_mem (unsigned long start, unsigned long end)
{
- struct page *page;
/*
* EFI uses 4KB pages while the kernel can use 4KB or bigger.
* Thus EFI and the kernel may have different page sizes. It is
@@ -213,11 +202,7 @@ free_initrd_mem (unsigned long start, unsigned long end)
for (; start < end; start += PAGE_SIZE) {
if (!virt_addr_valid(start))
continue;
- page = virt_to_page(start);
- ClearPageReserved(page);
- init_page_count(page);
- free_page(start);
- ++totalram_pages;
+ free_reserved_page(virt_to_page(start));
}
}
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
index def782e31aac..4248492b9321 100644
--- a/arch/ia64/mm/numa.c
+++ b/arch/ia64/mm/numa.c
@@ -61,13 +61,26 @@ paddr_to_nid(unsigned long paddr)
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec;
+ /*
+ * NOTE: The following SMP-unsafe globals are only used early in boot
+ * when the kernel is running single-threaded.
+ */
+ static int __meminitdata last_ssec, last_esec;
+ static int __meminitdata last_nid;
+
+ if (section >= last_ssec && section < last_esec)
+ return last_nid;
for (i = 0; i < num_node_memblks; i++) {
ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT;
esec = (node_memblk[i].start_paddr + node_memblk[i].size +
((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT;
- if (section >= ssec && section < esec)
+ if (section >= ssec && section < esec) {
+ last_ssec = ssec;
+ last_esec = esec;
+ last_nid = node_memblk[i].nid;
return node_memblk[i].nid;
+ }
}
return -1;
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index 78b660e903da..ab4cbce91a9b 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -28,10 +28,7 @@
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/tlb.h>
-
-/* References to section boundaries */
-extern char _text, _etext, _edata;
-extern char __init_begin, __init_end;
+#include <asm/sections.h>
pgd_t swapper_pg_dir[1024];
@@ -184,17 +181,7 @@ void __init mem_init(void)
*======================================================================*/
void free_initmem(void)
{
- unsigned long addr;
-
- addr = (unsigned long)(&__init_begin);
- for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", \
- (int)(&__init_end - &__init_begin) >> 10);
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -204,13 +191,6 @@ void free_initmem(void)
*======================================================================*/
void free_initrd_mem(unsigned long start, unsigned long end)
{
- unsigned long p;
- for (p = start; p < end; p += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(p));
- init_page_count(virt_to_page(p));
- free_page(p);
- totalram_pages++;
- }
- printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index 519aad8fa812..1af2ca3411f6 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -110,18 +110,7 @@ void __init paging_init(void)
void free_initmem(void)
{
#ifndef CONFIG_MMU_SUN3
- unsigned long addr;
-
- addr = (unsigned long) __init_begin;
- for (; addr < ((unsigned long) __init_end); addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- pr_notice("Freeing unused kernel memory: %luk freed (0x%x - 0x%x)\n",
- (addr - (unsigned long) __init_begin) >> 10,
- (unsigned int) __init_begin, (unsigned int) __init_end);
+ free_initmem_default(0);
#endif /* CONFIG_MMU_SUN3 */
}
@@ -213,15 +202,6 @@ void __init mem_init(void)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- int pages = 0;
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- pages++;
- }
- pr_notice("Freeing initrd memory: %dk freed\n",
- pages << (PAGE_SHIFT - 10));
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index 504a398d5f8b..d05b8455c44c 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -380,14 +380,8 @@ void __init mem_init(void)
#ifdef CONFIG_HIGHMEM
unsigned long tmp;
- for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) {
- struct page *page = pfn_to_page(tmp);
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
- }
- totalram_pages += totalhigh_pages;
+ for (tmp = highstart_pfn; tmp < highend_pfn; tmp++)
+ free_highmem_page(pfn_to_page(tmp));
num_physpages += totalhigh_pages;
#endif /* CONFIG_HIGHMEM */
@@ -412,32 +406,15 @@ void __init mem_init(void)
return;
}
-static void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
- }
- pr_info("Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-}
-
void free_initmem(void)
{
- free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
+ free_initmem_default(POISON_FREE_INITMEM);
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- end = end & PAGE_MASK;
- free_init_pages("initrd memory", start, end);
+ free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
}
#endif
diff --git a/arch/microblaze/include/asm/setup.h b/arch/microblaze/include/asm/setup.h
index 0e0b0a5ec756..f05df5630c84 100644
--- a/arch/microblaze/include/asm/setup.h
+++ b/arch/microblaze/include/asm/setup.h
@@ -46,7 +46,6 @@ void machine_shutdown(void);
void machine_halt(void);
void machine_power_off(void);
-void free_init_pages(char *what, unsigned long begin, unsigned long end);
extern void *alloc_maybe_bootmem(size_t size, gfp_t mask);
extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 8f8b367c079e..4ec137d13ad7 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -82,13 +82,9 @@ static unsigned long highmem_setup(void)
/* FIXME not sure about */
if (memblock_is_reserved(pfn << PAGE_SHIFT))
continue;
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
+ free_highmem_page(page);
reservedpages++;
}
- totalram_pages += totalhigh_pages;
pr_info("High memory: %luk\n",
totalhigh_pages << (PAGE_SHIFT-10));
@@ -236,40 +232,16 @@ void __init setup_memory(void)
paging_init();
}
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- pr_info("Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
-}
-
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- int pages = 0;
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- pages++;
- }
- pr_notice("Freeing initrd memory: %dk freed\n",
- (int)(pages * (PAGE_SIZE / 1024)));
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
void free_initmem(void)
{
- free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
+ free_initmem_default(0);
}
void __init mem_init(void)
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index ef99db994c2f..fe0d15d32660 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -10,6 +10,7 @@
#define __ASM_HUGETLB_H
#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
static inline int is_hugepage_only_range(struct mm_struct *mm,
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 67929251286c..3d0346dbccf4 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -77,10 +77,9 @@ EXPORT_SYMBOL_GPL(empty_zero_page);
/*
* Not static inline because used by IP27 special magic initialization code
*/
-unsigned long setup_zero_pages(void)
+void setup_zero_pages(void)
{
- unsigned int order;
- unsigned long size;
+ unsigned int order, i;
struct page *page;
if (cpu_has_vce)
@@ -94,15 +93,10 @@ unsigned long setup_zero_pages(void)
page = virt_to_page((void *)empty_zero_page);
split_page(page, order);
- while (page < virt_to_page((void *)(empty_zero_page + (PAGE_SIZE << order)))) {
- SetPageReserved(page);
- page++;
- }
-
- size = PAGE_SIZE << order;
- zero_page_mask = (size - 1) & PAGE_MASK;
+ for (i = 0; i < (1 << order); i++, page++)
+ mark_page_reserved(page);
- return 1UL << order;
+ zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;
}
#ifdef CONFIG_MIPS_MT_SMTC
@@ -380,7 +374,7 @@ void __init mem_init(void)
high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
totalram_pages += free_all_bootmem();
- totalram_pages -= setup_zero_pages(); /* Setup zeroed pages. */
+ setup_zero_pages(); /* Setup zeroed pages. */
reservedpages = ram = 0;
for (tmp = 0; tmp < max_low_pfn; tmp++)
@@ -399,12 +393,8 @@ void __init mem_init(void)
SetPageReserved(page);
continue;
}
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
+ free_highmem_page(page);
}
- totalram_pages += totalhigh_pages;
num_physpages += totalhigh_pages;
#endif
@@ -440,11 +430,8 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end)
struct page *page = pfn_to_page(pfn);
void *addr = phys_to_virt(PFN_PHYS(pfn));
- ClearPageReserved(page);
- init_page_count(page);
memset(addr, POISON_FREE_INITMEM, PAGE_SIZE);
- __free_page(page);
- totalram_pages++;
+ free_reserved_page(page);
}
printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
}
@@ -452,18 +439,14 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- free_init_pages("initrd memory",
- virt_to_phys((void *)start),
- virt_to_phys((void *)end));
+ free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
}
#endif
void __init_refok free_initmem(void)
{
prom_free_prom_memory();
- free_init_pages("unused kernel memory",
- __pa_symbol(&__init_begin),
- __pa_symbol(&__init_end));
+ free_initmem_default(POISON_FREE_INITMEM);
}
#ifndef CONFIG_MIPS_PGD_C0_CONTEXT
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index 3505d08ff2fd..5f2bddb1860e 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -457,7 +457,7 @@ void __init prom_free_prom_memory(void)
/* We got nothing to free here ... */
}
-extern unsigned long setup_zero_pages(void);
+extern void setup_zero_pages(void);
void __init paging_init(void)
{
@@ -492,7 +492,7 @@ void __init mem_init(void)
totalram_pages += free_all_bootmem_node(NODE_DATA(node));
}
- totalram_pages -= setup_zero_pages(); /* This comes from node 0 */
+ setup_zero_pages(); /* This comes from node 0 */
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c
index e57e5bc23562..5a8ace63a6b4 100644
--- a/arch/mn10300/mm/init.c
+++ b/arch/mn10300/mm/init.c
@@ -139,30 +139,11 @@ void __init mem_init(void)
}
/*
- *
- */
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *) addr, 0xcc, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
-}
-
-/*
* recycle memory containing stuff only required for initialisation
*/
void free_initmem(void)
{
- free_init_pages("unused kernel memory",
- (unsigned long) &__init_begin,
- (unsigned long) &__init_end);
+ free_initmem_default(POISON_FREE_INITMEM);
}
/*
@@ -171,6 +152,6 @@ void free_initmem(void)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- free_init_pages("initrd memory", start, end);
+ free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
}
#endif
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index e7fdc50c4bf0..b3cbc6703837 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -43,6 +43,7 @@
#include <asm/kmap_types.h>
#include <asm/fixmap.h>
#include <asm/tlbflush.h>
+#include <asm/sections.h>
int mem_init_done;
@@ -201,9 +202,6 @@ void __init paging_init(void)
/* References to section boundaries */
-extern char _stext, _etext, _edata, __bss_start, _end;
-extern char __init_begin, __init_end;
-
static int __init free_pages_init(void)
{
int reservedpages, pfn;
@@ -263,30 +261,11 @@ void __init mem_init(void)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
- (end - start) >> 10);
-
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
void free_initmem(void)
{
- unsigned long addr;
-
- addr = (unsigned long)(&__init_begin);
- for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing unused kernel memory: %luk freed\n",
- ((unsigned long)&__init_end -
- (unsigned long)&__init_begin) >> 10);
+ free_initmem_default(0);
}
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 3ac462de53a4..157b931e7b09 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -505,7 +505,6 @@ static void __init map_pages(unsigned long start_vaddr,
void free_initmem(void)
{
- unsigned long addr;
unsigned long init_begin = (unsigned long)__init_begin;
unsigned long init_end = (unsigned long)__init_end;
@@ -533,19 +532,10 @@ void free_initmem(void)
* pages are no-longer executable */
flush_icache_range(init_begin, init_end);
- for (addr = init_begin; addr < init_end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- num_physpages++;
- totalram_pages++;
- }
+ num_physpages += free_initmem_default(0);
/* set up a new led state on systems shipped LED State panel */
pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE);
-
- printk(KERN_INFO "Freeing unused kernel memory: %luk freed\n",
- (init_end - init_begin) >> 10);
}
@@ -697,6 +687,8 @@ void show_mem(unsigned int filter)
printk(KERN_INFO "Mem-info:\n");
show_free_areas(filter);
+ if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
+ return;
#ifndef CONFIG_DISCONTIGMEM
i = max_mapnr;
while (i-- > 0) {
@@ -1107,15 +1099,6 @@ void flush_tlb_all(void)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start >= end)
- return;
- printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- num_physpages++;
- totalram_pages++;
- }
+ num_physpages += free_reserved_area(start, end, 0, "initrd");
}
#endif
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 62e11a32c4c2..4fcbd6b14a3a 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -3,6 +3,7 @@
#ifdef CONFIG_HUGETLB_PAGE
#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
extern struct kmem_cache *hugepte_cache;
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index b3ba5163eae2..9ec3fe174cba 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -150,10 +150,7 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
if (addr <= rtas_end && ((addr + PAGE_SIZE) > rtas_start))
continue;
- ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
- init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
- free_page((unsigned long)__va(addr));
- totalram_pages++;
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}
}
#endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 06c8202a69cf..2230fd0ca3e4 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1045,10 +1045,7 @@ static void fadump_release_memory(unsigned long begin, unsigned long end)
if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start))
continue;
- ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
- init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
- free_page((unsigned long)__va(addr));
- totalram_pages++;
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}
}
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index a61b133c4f99..6782221d49bd 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -756,12 +756,7 @@ static __init void kvm_free_tmp(void)
end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
/* Free the tmp space we don't need */
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
+ free_reserved_area(start, end, 0, NULL);
}
static int __init kvm_guest_init(void)
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 7e2246fb2f31..5a535b73ea18 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -263,19 +263,14 @@ static __meminit void vmemmap_list_populate(unsigned long phys,
vmemmap_list = vmem_back;
}
-int __meminit vmemmap_populate(struct page *start_page,
- unsigned long nr_pages, int node)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
- unsigned long start = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
/* Align to the page size of the linear mapping. */
start = _ALIGN_DOWN(start, page_size);
- pr_debug("vmemmap_populate page %p, %ld pages, node %d\n",
- start_page, nr_pages, node);
- pr_debug(" -> map %lx..%lx\n", start, end);
+ pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
for (; start < end; start += page_size) {
void *p;
@@ -298,7 +293,7 @@ int __meminit vmemmap_populate(struct page *start_page,
return 0;
}
-void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+void vmemmap_free(unsigned long start, unsigned long end)
{
}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index f1f7409a4183..cd76c454942f 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -352,13 +352,9 @@ void __init mem_init(void)
struct page *page = pfn_to_page(pfn);
if (memblock_is_reserved(paddr))
continue;
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
+ free_highmem_page(page);
reservedpages--;
}
- totalram_pages += totalhigh_pages;
printk(KERN_DEBUG "High memory: %luk\n",
totalhigh_pages << (PAGE_SHIFT-10));
}
@@ -405,39 +401,14 @@ void __init mem_init(void)
void free_initmem(void)
{
- unsigned long addr;
-
ppc_md.progress = ppc_printk_progress;
-
- addr = (unsigned long)__init_begin;
- for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- pr_info("Freeing unused kernel memory: %luk freed\n",
- ((unsigned long)__init_end -
- (unsigned long)__init_begin) >> 10);
+ free_initmem_default(POISON_FREE_INITMEM);
}
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start >= end)
- return;
-
- start = _ALIGN_DOWN(start, PAGE_SIZE);
- end = _ALIGN_UP(end, PAGE_SIZE);
- pr_info("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index bba87ca2b4d7..b8020dc7b71e 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -62,14 +62,11 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
*/
static void __init setup_node_to_cpumask_map(void)
{
- unsigned int node, num = 0;
+ unsigned int node;
/* setup nr_node_ids if not done yet */
- if (nr_node_ids == MAX_NUMNODES) {
- for_each_node_mask(node, node_possible_map)
- num = node;
- nr_node_ids = num + 1;
- }
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
/* allocate the map */
for (node = 0; node < nr_node_ids; node++)
diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index d30235b7e3f7..db6ac389ef8c 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -172,12 +172,9 @@ static struct fsl_diu_shared_fb __attribute__ ((__aligned__(8))) diu_shared_fb;
static inline void mpc512x_free_bootmem(struct page *page)
{
- __ClearPageReserved(page);
BUG_ON(PageTail(page));
BUG_ON(atomic_read(&page->_count) > 1);
- atomic_set(&page->_count, 1);
- __free_page(page);
- totalram_pages++;
+ free_reserved_page(page);
}
void mpc512x_release_bootmem(void)
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2372c609fa2b..9a432de363b8 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -72,6 +72,7 @@ unsigned long memory_block_size_bytes(void)
return get_memblock_size();
}
+#ifdef CONFIG_MEMORY_HOTREMOVE
static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
{
unsigned long start, start_pfn;
@@ -153,6 +154,17 @@ static int pseries_remove_memory(struct device_node *np)
ret = pseries_remove_memblock(base, lmb_size);
return ret;
}
+#else
+static inline int pseries_remove_memblock(unsigned long base,
+ unsigned int memblock_size)
+{
+ return -EOPNOTSUPP;
+}
+static inline int pseries_remove_memory(struct device_node *np)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
static int pseries_add_memory(struct device_node *np)
{
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 593753ee07f3..bd90359d6d22 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -114,7 +114,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
#define huge_ptep_set_wrprotect(__mm, __addr, __ptep) \
({ \
pte_t __pte = huge_ptep_get(__ptep); \
- if (pte_write(__pte)) { \
+ if (huge_pte_write(__pte)) { \
huge_ptep_invalidate(__mm, __addr, __ptep); \
set_huge_pte_at(__mm, __addr, __ptep, \
huge_pte_wrprotect(__pte)); \
@@ -127,4 +127,58 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
huge_ptep_invalidate(vma->vm_mm, address, ptep);
}
+static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
+{
+ pte_t pte;
+ pmd_t pmd;
+
+ pmd = mk_pmd_phys(page_to_phys(page), pgprot);
+ pte_val(pte) = pmd_val(pmd);
+ return pte;
+}
+
+static inline int huge_pte_write(pte_t pte)
+{
+ pmd_t pmd;
+
+ pmd_val(pmd) = pte_val(pte);
+ return pmd_write(pmd);
+}
+
+static inline int huge_pte_dirty(pte_t pte)
+{
+ /* No dirty bit in the segment table entry. */
+ return 0;
+}
+
+static inline pte_t huge_pte_mkwrite(pte_t pte)
+{
+ pmd_t pmd;
+
+ pmd_val(pmd) = pte_val(pte);
+ pte_val(pte) = pmd_val(pmd_mkwrite(pmd));
+ return pte;
+}
+
+static inline pte_t huge_pte_mkdirty(pte_t pte)
+{
+ /* No dirty bit in the segment table entry. */
+ return pte;
+}
+
+static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
+{
+ pmd_t pmd;
+
+ pmd_val(pmd) = pte_val(pte);
+ pte_val(pte) = pmd_val(pmd_modify(pmd, newprot));
+ return pte;
+}
+
+static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pmd_clear((pmd_t *) ptep);
+}
+
#endif /* _ASM_S390_HUGETLB_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 4a64c0e5428f..b4622915bd15 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -424,6 +424,13 @@ extern unsigned long MODULES_END;
#define __S110 PAGE_RW
#define __S111 PAGE_RW
+/*
+ * Segment entry (large page) protection definitions.
+ */
+#define SEGMENT_NONE __pgprot(_HPAGE_TYPE_NONE)
+#define SEGMENT_RO __pgprot(_HPAGE_TYPE_RO)
+#define SEGMENT_RW __pgprot(_HPAGE_TYPE_RW)
+
static inline int mm_exclusive(struct mm_struct *mm)
{
return likely(mm == current->active_mm &&
@@ -914,26 +921,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
#ifdef CONFIG_HUGETLB_PAGE
static inline pte_t pte_mkhuge(pte_t pte)
{
- /*
- * PROT_NONE needs to be remapped from the pte type to the ste type.
- * The HW invalid bit is also different for pte and ste. The pte
- * invalid bit happens to be the same as the ste _SEGMENT_ENTRY_LARGE
- * bit, so we don't have to clear it.
- */
- if (pte_val(pte) & _PAGE_INVALID) {
- if (pte_val(pte) & _PAGE_SWT)
- pte_val(pte) |= _HPAGE_TYPE_NONE;
- pte_val(pte) |= _SEGMENT_ENTRY_INV;
- }
- /*
- * Clear SW pte bits, there are no SW bits in a segment table entry.
- */
- pte_val(pte) &= ~(_PAGE_SWT | _PAGE_SWX | _PAGE_SWC |
- _PAGE_SWR | _PAGE_SWW);
- /*
- * Also set the change-override bit because we don't need dirty bit
- * tracking for hugetlbfs pages.
- */
pte_val(pte) |= (_SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_CO);
return pte;
}
@@ -1278,31 +1265,7 @@ static inline void __pmd_idte(unsigned long address, pmd_t *pmdp)
}
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-#define SEGMENT_NONE __pgprot(_HPAGE_TYPE_NONE)
-#define SEGMENT_RO __pgprot(_HPAGE_TYPE_RO)
-#define SEGMENT_RW __pgprot(_HPAGE_TYPE_RW)
-
-#define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
-
-#define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT;
-}
-
-static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t entry)
-{
- if (!(pmd_val(entry) & _SEGMENT_ENTRY_INV) && MACHINE_HAS_EDAT1)
- pmd_val(entry) |= _SEGMENT_ENTRY_CO;
- *pmdp = entry;
-}
-
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
{
/*
@@ -1323,10 +1286,11 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
return pmd;
}
-static inline pmd_t pmd_mkhuge(pmd_t pmd)
+static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
- return pmd;
+ pmd_t __pmd;
+ pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
+ return __pmd;
}
static inline pmd_t pmd_mkwrite(pmd_t pmd)
@@ -1336,6 +1300,34 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
pmd_val(pmd) &= ~_SEGMENT_ENTRY_RO;
return pmd;
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define __HAVE_ARCH_PGTABLE_DEPOSIT
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+
+#define __HAVE_ARCH_PGTABLE_WITHDRAW
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT;
+}
+
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t entry)
+{
+ if (!(pmd_val(entry) & _SEGMENT_ENTRY_INV) && MACHINE_HAS_EDAT1)
+ pmd_val(entry) |= _SEGMENT_ENTRY_CO;
+ *pmdp = entry;
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
+ return pmd;
+}
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
@@ -1432,13 +1424,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
}
}
-static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot)
-{
- pmd_t __pmd;
- pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
- return __pmd;
-}
-
#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 532525ec88c1..121089d57802 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -39,7 +39,7 @@ int arch_prepare_hugepage(struct page *page)
if (!ptep)
return -ENOMEM;
- pte = mk_pte(page, PAGE_RW);
+ pte_val(pte) = addr;
for (i = 0; i < PTRS_PER_PTE; i++) {
set_pte_at(&init_mm, addr + i * PAGE_SIZE, ptep + i, pte);
pte_val(pte) += PAGE_SIZE;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 9f9c315b4c07..0b09b2342302 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -42,11 +42,10 @@ pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
-static unsigned long __init setup_zero_pages(void)
+static void __init setup_zero_pages(void)
{
struct cpuid cpu_id;
unsigned int order;
- unsigned long size;
struct page *page;
int i;
@@ -83,14 +82,11 @@ static unsigned long __init setup_zero_pages(void)
page = virt_to_page((void *) empty_zero_page);
split_page(page, order);
for (i = 1 << order; i > 0; i--) {
- SetPageReserved(page);
+ mark_page_reserved(page);
page++;
}
- size = PAGE_SIZE << order;
- zero_page_mask = (size - 1) & PAGE_MASK;
-
- return 1UL << order;
+ zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;
}
/*
@@ -147,7 +143,7 @@ void __init mem_init(void)
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
- totalram_pages -= setup_zero_pages(); /* Setup zeroed pages. */
+ setup_zero_pages(); /* Setup zeroed pages. */
reservedpages = 0;
@@ -166,34 +162,15 @@ void __init mem_init(void)
PFN_ALIGN((unsigned long)&_eshared) - 1);
}
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
- unsigned long addr = begin;
-
- if (begin >= end)
- return;
- for (; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)(addr & PAGE_MASK), POISON_FREE_INITMEM,
- PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-}
-
void free_initmem(void)
{
- free_init_pages("unused kernel memory",
- (unsigned long)&__init_begin,
- (unsigned long)&__init_end);
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- free_init_pages("initrd memory", start, end);
+ free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
}
#endif
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index ffab84db6907..35837054f734 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -191,19 +191,16 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
/*
* Add a backed mem_map array to the virtual mem_map array.
*/
-int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
- unsigned long address, start_addr, end_addr;
+ unsigned long address = start;
pgd_t *pg_dir;
pud_t *pu_dir;
pmd_t *pm_dir;
pte_t *pt_dir;
int ret = -ENOMEM;
- start_addr = (unsigned long) start;
- end_addr = (unsigned long) (start + nr);
-
- for (address = start_addr; address < end_addr;) {
+ for (address = start; address < end;) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
pu_dir = vmem_pud_alloc();
@@ -262,14 +259,14 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
}
address += PAGE_SIZE;
}
- memset(start, 0, nr * sizeof(struct page));
+ memset((void *)start, 0, end - start);
ret = 0;
out:
- flush_tlb_kernel_range(start_addr, end_addr);
+ flush_tlb_kernel_range(start, end);
return ret;
}
-void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+void vmemmap_free(unsigned long start, unsigned long end)
{
}
diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c
index cee6bce1e30c..1592aad7dbc4 100644
--- a/arch/score/mm/init.c
+++ b/arch/score/mm/init.c
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(empty_zero_page);
static struct kcore_list kcore_mem, kcore_vmalloc;
-static unsigned long setup_zero_page(void)
+static void setup_zero_page(void)
{
struct page *page;
@@ -52,9 +52,7 @@ static unsigned long setup_zero_page(void)
panic("Oh boy, that early out of memory?");
page = virt_to_page((void *) empty_zero_page);
- SetPageReserved(page);
-
- return 1UL;
+ mark_page_reserved(page);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -84,7 +82,7 @@ void __init mem_init(void)
high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
totalram_pages += free_all_bootmem();
- totalram_pages -= setup_zero_page(); /* Setup zeroed pages. */
+ setup_zero_page(); /* Setup zeroed pages. */
reservedpages = 0;
for (tmp = 0; tmp < max_low_pfn; tmp++)
@@ -109,37 +107,16 @@ void __init mem_init(void)
}
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-static void free_init_pages(const char *what, unsigned long begin, unsigned long end)
-{
- unsigned long pfn;
-
- for (pfn = PFN_UP(begin); pfn < PFN_DOWN(end); pfn++) {
- struct page *page = pfn_to_page(pfn);
- void *addr = phys_to_virt(PFN_PHYS(pfn));
-
- ClearPageReserved(page);
- init_page_count(page);
- memset(addr, POISON_FREE_INITMEM, PAGE_SIZE);
- __free_page(page);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
-}
-
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- free_init_pages("initrd memory",
- virt_to_phys((void *) start),
- virt_to_phys((void *) end));
+ free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
}
#endif
void __init_refok free_initmem(void)
{
- free_init_pages("unused kernel memory",
- __pa(&__init_begin),
- __pa(&__init_end));
+ free_initmem_default(POISON_FREE_INITMEM);
}
unsigned long pgd_current;
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index b3808c7d67b2..699255d6d1c6 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -3,6 +3,7 @@
#include <asm/cacheflush.h>
#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
static inline int is_hugepage_only_range(struct mm_struct *mm,
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 105794037143..20f9ead650d3 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -417,15 +417,13 @@ void __init mem_init(void)
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long node_pages = 0;
void *node_high_memory;
num_physpages += pgdat->node_present_pages;
if (pgdat->node_spanned_pages)
- node_pages = free_all_bootmem_node(pgdat);
+ totalram_pages += free_all_bootmem_node(pgdat);
- totalram_pages += node_pages;
node_high_memory = (void *)__va((pgdat->node_start_pfn +
pgdat->node_spanned_pages) <<
@@ -501,31 +499,13 @@ void __init mem_init(void)
void free_initmem(void)
{
- unsigned long addr;
-
- addr = (unsigned long)(&__init_begin);
- for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- printk("Freeing unused kernel memory: %ldk freed\n",
- ((unsigned long)&__init_end -
- (unsigned long)&__init_begin) >> 10);
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- unsigned long p;
- for (p = start; p < end; p += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(p));
- init_page_count(virt_to_page(p));
- free_page(p);
- totalram_pages++;
- }
- printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index 7eb57d245044..e4cab465b81f 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -2,6 +2,7 @@
#define _ASM_SPARC64_HUGETLB_H
#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 48e0c030e8f5..4490c397bb5b 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -282,14 +282,8 @@ static void map_high_region(unsigned long start_pfn, unsigned long end_pfn)
printk("mapping high region %08lx - %08lx\n", start_pfn, end_pfn);
#endif
- for (tmp = start_pfn; tmp < end_pfn; tmp++) {
- struct page *page = pfn_to_page(tmp);
-
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
- }
+ for (tmp = start_pfn; tmp < end_pfn; tmp++)
+ free_highmem_page(pfn_to_page(tmp));
}
void __init mem_init(void)
@@ -347,8 +341,6 @@ void __init mem_init(void)
map_high_region(start_pfn, end_pfn);
}
- totalram_pages += totalhigh_pages;
-
codepages = (((unsigned long) &_etext) - ((unsigned long)&_start));
codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
datapages = (((unsigned long) &_edata) - ((unsigned long)&_etext));
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 1588d33d5492..6ac99d64a13c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2181,10 +2181,9 @@ unsigned long vmemmap_table[VMEMMAP_SIZE];
static long __meminitdata addr_start, addr_end;
static int __meminitdata node_start;
-int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
+int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
+ int node)
{
- unsigned long vstart = (unsigned long) start;
- unsigned long vend = (unsigned long) (start + nr);
unsigned long phys_start = (vstart - VMEMMAP_BASE);
unsigned long phys_end = (vend - VMEMMAP_BASE);
unsigned long addr = phys_start & VMEMMAP_CHUNK_MASK;
@@ -2236,7 +2235,7 @@ void __meminit vmemmap_populate_print_last(void)
}
}
-void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+void vmemmap_free(unsigned long start, unsigned long end)
{
}
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index 0f885af2b621..3257733003f8 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -16,6 +16,7 @@
#define _ASM_TILE_HUGETLB_H
#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
static inline int is_hugepage_only_range(struct mm_struct *mm,
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index b3b4972c2451..dfd63ce87327 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -592,12 +592,7 @@ void iounmap(volatile void __iomem *addr_in)
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == addr)
- break;
- }
- read_unlock(&vmlist_lock);
+ p = find_vm_area((void *)addr);
if (!p) {
pr_err("iounmap: bad address %p\n", addr);
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 5abcbfbe7e25..9df292b270a8 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -42,17 +42,12 @@ static unsigned long brk_end;
static void setup_highmem(unsigned long highmem_start,
unsigned long highmem_len)
{
- struct page *page;
unsigned long highmem_pfn;
int i;
highmem_pfn = __pa(highmem_start) >> PAGE_SHIFT;
- for (i = 0; i < highmem_len >> PAGE_SHIFT; i++) {
- page = &mem_map[highmem_pfn + i];
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- }
+ for (i = 0; i < highmem_len >> PAGE_SHIFT; i++)
+ free_highmem_page(&mem_map[highmem_pfn + i]);
}
#endif
@@ -73,18 +68,13 @@ void __init mem_init(void)
totalram_pages = free_all_bootmem();
max_low_pfn = totalram_pages;
#ifdef CONFIG_HIGHMEM
- totalhigh_pages = highmem >> PAGE_SHIFT;
- totalram_pages += totalhigh_pages;
+ setup_highmem(end_iomem, highmem);
#endif
num_physpages = totalram_pages;
max_pfn = totalram_pages;
printk(KERN_INFO "Memory: %luk available\n",
nr_free_pages() << (PAGE_SHIFT-10));
kmalloc_ok = 1;
-
-#ifdef CONFIG_HIGHMEM
- setup_highmem(end_iomem, highmem);
-#endif
}
/*
@@ -254,15 +244,7 @@ void free_initmem(void)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start < end)
- printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
- (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index de186bde8975..63df12d71ce3 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -66,6 +66,9 @@ void show_mem(unsigned int filter)
printk(KERN_DEFAULT "Mem-info:\n");
show_free_areas(filter);
+ if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
+ return;
+
for_each_bank(i, mi) {
struct membank *bank = &mi->bank[i];
unsigned int pfn1, pfn2;
@@ -313,24 +316,6 @@ void __init bootmem_init(void)
max_pfn = max_high - PHYS_PFN_OFFSET;
}
-static inline int free_area(unsigned long pfn, unsigned long end, char *s)
-{
- unsigned int pages = 0, size = (end - pfn) << (PAGE_SHIFT - 10);
-
- for (; pfn < end; pfn++) {
- struct page *page = pfn_to_page(pfn);
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- pages++;
- }
-
- if (size && s)
- printk(KERN_INFO "Freeing %s memory: %dK\n", s, size);
-
- return pages;
-}
-
static inline void
free_memmap(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -404,9 +389,9 @@ void __init mem_init(void)
max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
- /* this will put all unused low memory onto the freelists */
free_unused_memmap(&meminfo);
+ /* this will put all unused low memory onto the freelists */
totalram_pages += free_all_bootmem();
reserved_pages = free_pages = 0;
@@ -491,9 +476,7 @@ void __init mem_init(void)
void free_initmem(void)
{
- totalram_pages += free_area(__phys_to_pfn(__pa(__init_begin)),
- __phys_to_pfn(__pa(__init_end)),
- "init");
+ free_initmem_default(0);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -503,9 +486,7 @@ static int keep_initrd;
void free_initrd_mem(unsigned long start, unsigned long end)
{
if (!keep_initrd)
- totalram_pages += free_area(__phys_to_pfn(__pa(start)),
- __phys_to_pfn(__pa(end)),
- "initrd");
+ free_reserved_area(start, end, 0, "initrd");
}
static int __init keepinitrd_setup(char *__unused)
diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c
index b7a605597b08..13068ee22f33 100644
--- a/arch/unicore32/mm/ioremap.c
+++ b/arch/unicore32/mm/ioremap.c
@@ -235,7 +235,7 @@ EXPORT_SYMBOL(__uc32_ioremap_cached);
void __uc32_iounmap(volatile void __iomem *io_addr)
{
void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr);
- struct vm_struct **p, *tmp;
+ struct vm_struct *vm;
/*
* If this is a section based mapping we need to handle it
@@ -244,17 +244,10 @@ void __uc32_iounmap(volatile void __iomem *io_addr)
* all the mappings before the area can be reclaimed
* by someone else.
*/
- write_lock(&vmlist_lock);
- for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
- if ((tmp->flags & VM_IOREMAP) && (tmp->addr == addr)) {
- if (tmp->flags & VM_UNICORE_SECTION_MAPPING) {
- unmap_area_sections((unsigned long)tmp->addr,
- tmp->size);
- }
- break;
- }
- }
- write_unlock(&vmlist_lock);
+ vm = find_vm_area(addr);
+ if (vm && (vm->flags & VM_IOREMAP) &&
+ (vm->flags & VM_UNICORE_SECTION_MAPPING))
+ unmap_area_sections((unsigned long)vm->addr, vm->size);
vunmap(addr);
}
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index bdd35dbd0605..a8091216963b 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -2,6 +2,7 @@
#define _ASM_X86_HUGETLB_H
#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
static inline int is_hugepage_only_range(struct mm_struct *mm,
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0e067d3d96c..c9496313843b 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -43,10 +43,10 @@ obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
cpufeature = $(src)/../../include/asm/cpufeature.h
targets += capflags.c
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
+$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
deleted file mode 100644
index 091972ef49de..000000000000
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/perl -w
-#
-# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
-#
-
-($in, $out) = @ARGV;
-
-open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
-open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
-
-print OUT "#ifndef _ASM_X86_CPUFEATURE_H\n";
-print OUT "#include <asm/cpufeature.h>\n";
-print OUT "#endif\n";
-print OUT "\n";
-print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
-
-%features = ();
-$err = 0;
-
-while (defined($line = <IN>)) {
- if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
- $macro = $1;
- $feature = "\L$2";
- $tail = $3;
- if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
- $feature = "\L$1";
- }
-
- next if ($feature eq '');
-
- if ($features{$feature}++) {
- print STDERR "$in: duplicate feature name: $feature\n";
- $err++;
- }
- printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature;
- }
-}
-print OUT "};\n";
-
-close(IN);
-close(OUT);
-
-if ($err) {
- unlink($out);
- exit(1);
-}
-
-exit(0);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
new file mode 100644
index 000000000000..2bf616505499
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Generate the x86_cap_flags[] array from include/asm/cpufeature.h
+#
+
+IN=$1
+OUT=$2
+
+TABS="$(printf '\t\t\t\t\t')"
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURE_H"
+ echo "#include <asm/cpufeature.h>"
+ echo "#endif"
+ echo ""
+ echo "const char * const x86_cap_flags[NCAPINTS*32] = {"
+
+ # Iterate through any input lines starting with #define X86_FEATURE_
+ sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN |
+ while read i
+ do
+ # Name is everything up to the first whitespace
+ NAME="$(echo "$i" | sed 's/ .*//')"
+
+ # If the /* comment */ starts with a quote string, grab that.
+ VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
+ [ -z "$VALUE" ] && VALUE="\"$NAME\""
+ [ "$VALUE" == '""' ] && continue
+
+ # Name is uppercase, VALUE is all lowercase
+ VALUE="$(echo "$VALUE" | tr A-Z a-z)"
+
+ TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" \
+ "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ done
+ echo "};"
+) > $OUT
+
+trap - EXIT
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 6f31ee56c008..252b8f5489ba 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -137,5 +137,4 @@ void __init set_highmem_pages_init(void)
add_highpages_with_active_regions(nid, zone_start_pfn,
zone_end_pfn);
}
- totalram_pages += totalhigh_pages;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 59b7fc453277..fdc5dca14fb3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -515,11 +515,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
for (; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
+ free_reserved_page(virt_to_page(addr));
}
#endif
}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2d19001151d5..3ac7e319918d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -427,14 +427,6 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = pte;
}
-static void __init add_one_highpage_init(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
-}
-
void __init add_highpages_with_active_regions(int nid,
unsigned long start_pfn, unsigned long end_pfn)
{
@@ -448,7 +440,7 @@ void __init add_highpages_with_active_regions(int nid,
start_pfn, end_pfn);
for ( ; pfn < e_pfn; pfn++)
if (pfn_valid(pfn))
- add_one_highpage_init(pfn_to_page(pfn));
+ free_highmem_page(pfn_to_page(pfn));
}
}
#else
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 474e28f10815..71ff55a1b287 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1011,11 +1011,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
flush_tlb_all();
}
-void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
+void __ref vmemmap_free(unsigned long start, unsigned long end)
{
- unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + nr_pages);
-
remove_pagetable(start, end, false);
}
@@ -1067,10 +1064,9 @@ void __init mem_init(void)
/* clear_bss() already clear the empty_zero_page */
- reservedpages = 0;
-
- /* this will put all low memory onto the freelists */
register_page_bootmem_info();
+
+ /* this will put all memory onto the freelists */
totalram_pages = free_all_bootmem();
absent_pages = absent_pages_in_range(0, max_pfn);
@@ -1285,18 +1281,17 @@ static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
-int __meminit
-vmemmap_populate(struct page *start_page, unsigned long size, int node)
+static int __meminit vmemmap_populate_hugepages(unsigned long start,
+ unsigned long end, int node)
{
- unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long addr;
unsigned long next;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- for (; addr < end; addr = next) {
- void *p = NULL;
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
@@ -1306,31 +1301,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
if (!pud)
return -ENOMEM;
- if (!cpu_has_pse) {
- next = (addr + PAGE_SIZE) & PAGE_MASK;
- pmd = vmemmap_pmd_populate(pud, addr, node);
-
- if (!pmd)
- return -ENOMEM;
-
- p = vmemmap_pte_populate(pmd, addr, node);
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ void *p;
- if (!p)
- return -ENOMEM;
-
- addr_end = addr + PAGE_SIZE;
- p_end = p + PAGE_SIZE;
- } else {
- next = pmd_addr_end(addr, end);
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ if (p) {
pte_t entry;
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
- if (!p)
- return -ENOMEM;
-
entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
PAGE_KERNEL_LARGE);
set_pmd(pmd, __pmd(pte_val(entry)));
@@ -1347,15 +1325,32 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
- } else
- vmemmap_verify((pte_t *)pmd, node, addr, next);
+ continue;
+ }
+ } else if (pmd_large(*pmd)) {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ continue;
}
-
+ pr_warn_once("vmemmap: falling back to regular page backing\n");
+ if (vmemmap_populate_basepages(addr, next, node))
+ return -ENOMEM;
}
- sync_global_pgds((unsigned long)start_page, end - 1);
return 0;
}
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+{
+ int err;
+
+ if (cpu_has_pse)
+ err = vmemmap_populate_hugepages(start, end, node);
+ else
+ err = vmemmap_populate_basepages(start, end, node);
+ if (!err)
+ sync_global_pgds(start, end - 1);
+ return err;
+}
+
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 78fe3f1ac49f..9a1e6583910c 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -282,12 +282,7 @@ void iounmap(volatile void __iomem *addr)
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == (void __force *)addr)
- break;
- }
- read_unlock(&vmlist_lock);
+ p = find_vm_area((void __force *)addr);
if (!p) {
printk(KERN_ERR "iounmap: bad address %p\n", addr);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 72fe01e9e414..a71c4e207679 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -114,14 +114,11 @@ void numa_clear_node(int cpu)
*/
void __init setup_node_to_cpumask_map(void)
{
- unsigned int node, num = 0;
+ unsigned int node;
/* setup nr_node_ids if not done yet */
- if (nr_node_ids == MAX_NUMNODES) {
- for_each_node_mask(node, node_possible_map)
- num = node;
- nr_node_ids = num + 1;
- }
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
/* allocate the map */
for (node = 0; node < nr_node_ids; node++)
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 7a5156ffebb6..bba125b4bb06 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -208,32 +208,17 @@ void __init mem_init(void)
highmemsize >> 10);
}
-void
-free_reserved_mem(void *start, void *end)
-{
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page((unsigned long)start);
- totalram_pages++;
- }
-}
-
#ifdef CONFIG_BLK_DEV_INITRD
extern int initrd_is_mapped;
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (initrd_is_mapped) {
- free_reserved_mem((void*)start, (void*)end);
- printk ("Freeing initrd memory: %ldk freed\n",(end-start)>>10);
- }
+ if (initrd_is_mapped)
+ free_reserved_area(start, end, 0, "initrd");
}
#endif
void free_initmem(void)
{
- free_reserved_mem(__init_begin, __init_end);
- printk("Freeing unused kernel memory: %zuk freed\n",
- (__init_end - __init_begin) >> 10);
+ free_initmem_default(0);
}
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index d8c7f3ee6e19..3d48fc887ef4 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -25,6 +25,15 @@ EXPORT_SYMBOL_GPL(cpu_subsys);
static DEFINE_PER_CPU(struct device *, cpu_sys_devices);
#ifdef CONFIG_HOTPLUG_CPU
+static void change_cpu_under_node(struct cpu *cpu,
+ unsigned int from_nid, unsigned int to_nid)
+{
+ int cpuid = cpu->dev.id;
+ unregister_cpu_under_node(cpuid, from_nid);
+ register_cpu_under_node(cpuid, to_nid);
+ cpu->node_id = to_nid;
+}
+
static ssize_t show_online(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -39,17 +48,29 @@ static ssize_t __ref store_online(struct device *dev,
const char *buf, size_t count)
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
+ int cpuid = cpu->dev.id;
+ int from_nid, to_nid;
ssize_t ret;
cpu_hotplug_driver_lock();
switch (buf[0]) {
case '0':
- ret = cpu_down(cpu->dev.id);
+ ret = cpu_down(cpuid);
if (!ret)
kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
break;
case '1':
- ret = cpu_up(cpu->dev.id);
+ from_nid = cpu_to_node(cpuid);
+ ret = cpu_up(cpuid);
+
+ /*
+ * When hot adding memory to memoryless node and enabling a cpu
+ * on the node, node number of the cpu may internally change.
+ */
+ to_nid = cpu_to_node(cpuid);
+ if (from_nid != to_nid)
+ change_cpu_under_node(cpu, from_nid, to_nid);
+
if (!ret)
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
break;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index a51007b79032..14f8a6954da0 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -93,16 +93,6 @@ int register_memory(struct memory_block *memory)
return error;
}
-static void
-unregister_memory(struct memory_block *memory)
-{
- BUG_ON(memory->dev.bus != &memory_subsys);
-
- /* drop the ref. we got in remove_memory_block() */
- kobject_put(&memory->dev.kobj);
- device_unregister(&memory->dev);
-}
-
unsigned long __weak memory_block_size_bytes(void)
{
return MIN_MEMORY_BLOCK_SIZE;
@@ -217,8 +207,7 @@ int memory_isolate_notify(unsigned long val, void *v)
* The probe routines leave the pages reserved, just as the bootmem code does.
* Make sure they're still that way.
*/
-static bool pages_correctly_reserved(unsigned long start_pfn,
- unsigned long nr_pages)
+static bool pages_correctly_reserved(unsigned long start_pfn)
{
int i, j;
struct page *page;
@@ -266,7 +255,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
switch (action) {
case MEM_ONLINE:
- if (!pages_correctly_reserved(start_pfn, nr_pages))
+ if (!pages_correctly_reserved(start_pfn))
return -EBUSY;
ret = online_pages(start_pfn, nr_pages, online_type);
@@ -637,8 +626,28 @@ static int add_memory_section(int nid, struct mem_section *section,
return ret;
}
-int remove_memory_block(unsigned long node_id, struct mem_section *section,
- int phys_device)
+/*
+ * need an interface for the VM to add new memory regions,
+ * but without onlining it.
+ */
+int register_new_memory(int nid, struct mem_section *section)
+{
+ return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void
+unregister_memory(struct memory_block *memory)
+{
+ BUG_ON(memory->dev.bus != &memory_subsys);
+
+ /* drop the ref. we got in remove_memory_block() */
+ kobject_put(&memory->dev.kobj);
+ device_unregister(&memory->dev);
+}
+
+static int remove_memory_block(unsigned long node_id,
+ struct mem_section *section, int phys_device)
{
struct memory_block *mem;
@@ -661,15 +670,6 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section,
return 0;
}
-/*
- * need an interface for the VM to add new memory regions,
- * but without onlining it.
- */
-int register_new_memory(int nid, struct mem_section *section)
-{
- return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
-}
-
int unregister_memory_section(struct mem_section *section)
{
if (!present_section(section))
@@ -677,6 +677,7 @@ int unregister_memory_section(struct mem_section *section)
return remove_memory_block(0, section, 0);
}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
/*
* offline one memory block. If the memory block has been offlined, do nothing.
diff --git a/drivers/base/node.c b/drivers/base/node.c
index fac124a7e1c5..7616a77ca322 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/vmstat.h>
+#include <linux/notifier.h>
#include <linux/node.h>
#include <linux/hugetlb.h>
#include <linux/compaction.h>
@@ -683,8 +684,11 @@ static int __init register_node_type(void)
ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
if (!ret) {
- hotplug_memory_notifier(node_memory_callback,
- NODE_CALLBACK_PRI);
+ static struct notifier_block node_memory_callback_nb = {
+ .notifier_call = node_memory_callback,
+ .priority = NODE_CALLBACK_PRI,
+ };
+ register_hotmemory_notifier(&node_memory_callback_nb);
}
/*
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index 0b5b5f619c75..e2e04b007e15 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -114,12 +114,9 @@ static void __meminit release_firmware_map_entry(struct kobject *kobj)
* map_entries_bootmem here, and deleted from &map_entries in
* firmware_map_remove_entry().
*/
- if (firmware_map_find_entry(entry->start, entry->end,
- entry->type)) {
- spin_lock(&map_entries_bootmem_lock);
- list_add(&entry->list, &map_entries_bootmem);
- spin_unlock(&map_entries_bootmem_lock);
- }
+ spin_lock(&map_entries_bootmem_lock);
+ list_add(&entry->list, &map_entries_bootmem);
+ spin_unlock(&map_entries_bootmem_lock);
return;
}
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 981c1c08c727..76be61701c97 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -2440,6 +2440,15 @@ config FB_PUV3_UNIGFX
Choose this option if you want to use the Unigfx device as a
framebuffer device. Without the support of PCI & AGP.
+config FB_HYPERV
+ tristate "Microsoft Hyper-V Synthetic Video support"
+ depends on FB && HYPERV
+ select FB_CFB_FILLRECT
+ select FB_CFB_COPYAREA
+ select FB_CFB_IMAGEBLIT
+ help
+ This framebuffer driver supports Microsoft Hyper-V Synthetic Video.
+
source "drivers/video/omap/Kconfig"
source "drivers/video/omap2/Kconfig"
source "drivers/video/exynos/Kconfig"
diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index e414378d6a51..7234e4a959e8 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -149,6 +149,7 @@ obj-$(CONFIG_FB_MSM) += msm/
obj-$(CONFIG_FB_NUC900) += nuc900fb.o
obj-$(CONFIG_FB_JZ4740) += jz4740_fb.o
obj-$(CONFIG_FB_PUV3_UNIGFX) += fb-puv3.o
+obj-$(CONFIG_FB_HYPERV) += hyperv_fb.o
# Platform or fallback drivers go here
obj-$(CONFIG_FB_UVESA) += uvesafb.o
diff --git a/drivers/video/console/fbcon_cw.c b/drivers/video/console/fbcon_cw.c
index 6a737827beb1..a93670ef7f89 100644
--- a/drivers/video/console/fbcon_cw.c
+++ b/drivers/video/console/fbcon_cw.c
@@ -27,7 +27,7 @@ static void cw_update_attr(u8 *dst, u8 *src, int attribute,
{
int i, j, offset = (vc->vc_font.height < 10) ? 1 : 2;
int width = (vc->vc_font.height + 7) >> 3;
- u8 c, t = 0, msk = ~(0xff >> offset);
+ u8 c, msk = ~(0xff >> offset);
for (i = 0; i < vc->vc_font.width; i++) {
for (j = 0; j < width; j++) {
@@ -40,7 +40,6 @@ static void cw_update_attr(u8 *dst, u8 *src, int attribute,
c = ~c;
src++;
*dst++ = c;
- t = c;
}
}
}
diff --git a/drivers/video/ep93xx-fb.c b/drivers/video/ep93xx-fb.c
index e06cd5d90c97..ee1ee5401544 100644
--- a/drivers/video/ep93xx-fb.c
+++ b/drivers/video/ep93xx-fb.c
@@ -419,7 +419,7 @@ static struct fb_ops ep93xxfb_ops = {
.fb_mmap = ep93xxfb_mmap,
};
-static int __init ep93xxfb_calc_fbsize(struct ep93xxfb_mach_info *mach_info)
+static int ep93xxfb_calc_fbsize(struct ep93xxfb_mach_info *mach_info)
{
int i, fb_size = 0;
@@ -441,7 +441,7 @@ static int __init ep93xxfb_calc_fbsize(struct ep93xxfb_mach_info *mach_info)
return fb_size;
}
-static int __init ep93xxfb_alloc_videomem(struct fb_info *info)
+static int ep93xxfb_alloc_videomem(struct fb_info *info)
{
struct ep93xx_fbi *fbi = info->par;
char __iomem *virt_addr;
@@ -627,19 +627,7 @@ static struct platform_driver ep93xxfb_driver = {
.owner = THIS_MODULE,
},
};
-
-static int ep93xxfb_init(void)
-{
- return platform_driver_register(&ep93xxfb_driver);
-}
-
-static void __exit ep93xxfb_exit(void)
-{
- platform_driver_unregister(&ep93xxfb_driver);
-}
-
-module_init(ep93xxfb_init);
-module_exit(ep93xxfb_exit);
+module_platform_driver(ep93xxfb_driver);
MODULE_DESCRIPTION("EP93XX Framebuffer Driver");
MODULE_ALIAS("platform:ep93xx-fb");
diff --git a/drivers/video/exynos/exynos_mipi_dsi.c b/drivers/video/exynos/exynos_mipi_dsi.c
index 3dd43ca2b95f..32e540600f99 100644
--- a/drivers/video/exynos/exynos_mipi_dsi.c
+++ b/drivers/video/exynos/exynos_mipi_dsi.c
@@ -32,6 +32,7 @@
#include <linux/notifier.h>
#include <linux/regulator/consumer.h>
#include <linux/pm_runtime.h>
+#include <linux/err.h>
#include <video/exynos_mipi_dsim.h>
@@ -382,10 +383,9 @@ static int exynos_mipi_dsi_probe(struct platform_device *pdev)
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- dsim->reg_base = devm_request_and_ioremap(&pdev->dev, res);
- if (!dsim->reg_base) {
- dev_err(&pdev->dev, "failed to remap io region\n");
- ret = -ENOMEM;
+ dsim->reg_base = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(dsim->reg_base)) {
+ ret = PTR_ERR(dsim->reg_base);
goto error;
}
diff --git a/drivers/video/hyperv_fb.c b/drivers/video/hyperv_fb.c
new file mode 100644
index 000000000000..d4d2c5fe2488
--- /dev/null
+++ b/drivers/video/hyperv_fb.c
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2012, Microsoft Corporation.
+ *
+ * Author:
+ * Haiyang Zhang <haiyangz@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ */
+
+/*
+ * Hyper-V Synthetic Video Frame Buffer Driver
+ *
+ * This is the driver for the Hyper-V Synthetic Video, which supports
+ * screen resolution up to Full HD 1920x1080 with 32 bit color on Windows
+ * Server 2012, and 1600x1200 with 16 bit color on Windows Server 2008 R2
+ * or earlier.
+ *
+ * It also solves the double mouse cursor issue of the emulated video mode.
+ *
+ * The default screen resolution is 1152x864, which may be changed by a
+ * kernel parameter:
+ * video=hyperv_fb:<width>x<height>
+ * For example: video=hyperv_fb:1280x1024
+ *
+ * Portrait orientation is also supported:
+ * For example: video=hyperv_fb:864x1152
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/completion.h>
+#include <linux/fb.h>
+#include <linux/pci.h>
+
+#include <linux/hyperv.h>
+
+
+/* Hyper-V Synthetic Video Protocol definitions and structures */
+#define MAX_VMBUS_PKT_SIZE 0x4000
+
+#define SYNTHVID_VERSION(major, minor) ((minor) << 16 | (major))
+#define SYNTHVID_VERSION_WIN7 SYNTHVID_VERSION(3, 0)
+#define SYNTHVID_VERSION_WIN8 SYNTHVID_VERSION(3, 2)
+
+#define SYNTHVID_DEPTH_WIN7 16
+#define SYNTHVID_DEPTH_WIN8 32
+
+#define SYNTHVID_FB_SIZE_WIN7 (4 * 1024 * 1024)
+#define SYNTHVID_WIDTH_MAX_WIN7 1600
+#define SYNTHVID_HEIGHT_MAX_WIN7 1200
+
+#define SYNTHVID_FB_SIZE_WIN8 (8 * 1024 * 1024)
+
+#define PCI_VENDOR_ID_MICROSOFT 0x1414
+#define PCI_DEVICE_ID_HYPERV_VIDEO 0x5353
+
+
+enum pipe_msg_type {
+ PIPE_MSG_INVALID,
+ PIPE_MSG_DATA,
+ PIPE_MSG_MAX
+};
+
+struct pipe_msg_hdr {
+ u32 type;
+ u32 size; /* size of message after this field */
+} __packed;
+
+
+enum synthvid_msg_type {
+ SYNTHVID_ERROR = 0,
+ SYNTHVID_VERSION_REQUEST = 1,
+ SYNTHVID_VERSION_RESPONSE = 2,
+ SYNTHVID_VRAM_LOCATION = 3,
+ SYNTHVID_VRAM_LOCATION_ACK = 4,
+ SYNTHVID_SITUATION_UPDATE = 5,
+ SYNTHVID_SITUATION_UPDATE_ACK = 6,
+ SYNTHVID_POINTER_POSITION = 7,
+ SYNTHVID_POINTER_SHAPE = 8,
+ SYNTHVID_FEATURE_CHANGE = 9,
+ SYNTHVID_DIRT = 10,
+
+ SYNTHVID_MAX = 11
+};
+
+struct synthvid_msg_hdr {
+ u32 type;
+ u32 size; /* size of this header + payload after this field*/
+} __packed;
+
+
+struct synthvid_version_req {
+ u32 version;
+} __packed;
+
+struct synthvid_version_resp {
+ u32 version;
+ u8 is_accepted;
+ u8 max_video_outputs;
+} __packed;
+
+struct synthvid_vram_location {
+ u64 user_ctx;
+ u8 is_vram_gpa_specified;
+ u64 vram_gpa;
+} __packed;
+
+struct synthvid_vram_location_ack {
+ u64 user_ctx;
+} __packed;
+
+struct video_output_situation {
+ u8 active;
+ u32 vram_offset;
+ u8 depth_bits;
+ u32 width_pixels;
+ u32 height_pixels;
+ u32 pitch_bytes;
+} __packed;
+
+struct synthvid_situation_update {
+ u64 user_ctx;
+ u8 video_output_count;
+ struct video_output_situation video_output[1];
+} __packed;
+
+struct synthvid_situation_update_ack {
+ u64 user_ctx;
+} __packed;
+
+struct synthvid_pointer_position {
+ u8 is_visible;
+ u8 video_output;
+ s32 image_x;
+ s32 image_y;
+} __packed;
+
+
+#define CURSOR_MAX_X 96
+#define CURSOR_MAX_Y 96
+#define CURSOR_ARGB_PIXEL_SIZE 4
+#define CURSOR_MAX_SIZE (CURSOR_MAX_X * CURSOR_MAX_Y * CURSOR_ARGB_PIXEL_SIZE)
+#define CURSOR_COMPLETE (-1)
+
+struct synthvid_pointer_shape {
+ u8 part_idx;
+ u8 is_argb;
+ u32 width; /* CURSOR_MAX_X at most */
+ u32 height; /* CURSOR_MAX_Y at most */
+ u32 hot_x; /* hotspot relative to upper-left of pointer image */
+ u32 hot_y;
+ u8 data[4];
+} __packed;
+
+struct synthvid_feature_change {
+ u8 is_dirt_needed;
+ u8 is_ptr_pos_needed;
+ u8 is_ptr_shape_needed;
+ u8 is_situ_needed;
+} __packed;
+
+struct rect {
+ s32 x1, y1; /* top left corner */
+ s32 x2, y2; /* bottom right corner, exclusive */
+} __packed;
+
+struct synthvid_dirt {
+ u8 video_output;
+ u8 dirt_count;
+ struct rect rect[1];
+} __packed;
+
+struct synthvid_msg {
+ struct pipe_msg_hdr pipe_hdr;
+ struct synthvid_msg_hdr vid_hdr;
+ union {
+ struct synthvid_version_req ver_req;
+ struct synthvid_version_resp ver_resp;
+ struct synthvid_vram_location vram;
+ struct synthvid_vram_location_ack vram_ack;
+ struct synthvid_situation_update situ;
+ struct synthvid_situation_update_ack situ_ack;
+ struct synthvid_pointer_position ptr_pos;
+ struct synthvid_pointer_shape ptr_shape;
+ struct synthvid_feature_change feature_chg;
+ struct synthvid_dirt dirt;
+ };
+} __packed;
+
+
+
+/* FB driver definitions and structures */
+#define HVFB_WIDTH 1152 /* default screen width */
+#define HVFB_HEIGHT 864 /* default screen height */
+#define HVFB_WIDTH_MIN 640
+#define HVFB_HEIGHT_MIN 480
+
+#define RING_BUFSIZE (256 * 1024)
+#define VSP_TIMEOUT (10 * HZ)
+#define HVFB_UPDATE_DELAY (HZ / 20)
+
+struct hvfb_par {
+ struct fb_info *info;
+ bool fb_ready; /* fb device is ready */
+ struct completion wait;
+ u32 synthvid_version;
+
+ struct delayed_work dwork;
+ bool update;
+
+ u32 pseudo_palette[16];
+ u8 init_buf[MAX_VMBUS_PKT_SIZE];
+ u8 recv_buf[MAX_VMBUS_PKT_SIZE];
+};
+
+static uint screen_width = HVFB_WIDTH;
+static uint screen_height = HVFB_HEIGHT;
+static uint screen_depth;
+static uint screen_fb_size;
+
+/* Send message to Hyper-V host */
+static inline int synthvid_send(struct hv_device *hdev,
+ struct synthvid_msg *msg)
+{
+ static atomic64_t request_id = ATOMIC64_INIT(0);
+ int ret;
+
+ msg->pipe_hdr.type = PIPE_MSG_DATA;
+ msg->pipe_hdr.size = msg->vid_hdr.size;
+
+ ret = vmbus_sendpacket(hdev->channel, msg,
+ msg->vid_hdr.size + sizeof(struct pipe_msg_hdr),
+ atomic64_inc_return(&request_id),
+ VM_PKT_DATA_INBAND, 0);
+
+ if (ret)
+ pr_err("Unable to send packet via vmbus\n");
+
+ return ret;
+}
+
+
+/* Send screen resolution info to host */
+static int synthvid_send_situ(struct hv_device *hdev)
+{
+ struct fb_info *info = hv_get_drvdata(hdev);
+ struct synthvid_msg msg;
+
+ if (!info)
+ return -ENODEV;
+
+ memset(&msg, 0, sizeof(struct synthvid_msg));
+
+ msg.vid_hdr.type = SYNTHVID_SITUATION_UPDATE;
+ msg.vid_hdr.size = sizeof(struct synthvid_msg_hdr) +
+ sizeof(struct synthvid_situation_update);
+ msg.situ.user_ctx = 0;
+ msg.situ.video_output_count = 1;
+ msg.situ.video_output[0].active = 1;
+ msg.situ.video_output[0].vram_offset = 0;
+ msg.situ.video_output[0].depth_bits = info->var.bits_per_pixel;
+ msg.situ.video_output[0].width_pixels = info->var.xres;
+ msg.situ.video_output[0].height_pixels = info->var.yres;
+ msg.situ.video_output[0].pitch_bytes = info->fix.line_length;
+
+ synthvid_send(hdev, &msg);
+
+ return 0;
+}
+
+/* Send mouse pointer info to host */
+static int synthvid_send_ptr(struct hv_device *hdev)
+{
+ struct synthvid_msg msg;
+
+ memset(&msg, 0, sizeof(struct synthvid_msg));
+ msg.vid_hdr.type = SYNTHVID_POINTER_POSITION;
+ msg.vid_hdr.size = sizeof(struct synthvid_msg_hdr) +
+ sizeof(struct synthvid_pointer_position);
+ msg.ptr_pos.is_visible = 1;
+ msg.ptr_pos.video_output = 0;
+ msg.ptr_pos.image_x = 0;
+ msg.ptr_pos.image_y = 0;
+ synthvid_send(hdev, &msg);
+
+ memset(&msg, 0, sizeof(struct synthvid_msg));
+ msg.vid_hdr.type = SYNTHVID_POINTER_SHAPE;
+ msg.vid_hdr.size = sizeof(struct synthvid_msg_hdr) +
+ sizeof(struct synthvid_pointer_shape);
+ msg.ptr_shape.part_idx = CURSOR_COMPLETE;
+ msg.ptr_shape.is_argb = 1;
+ msg.ptr_shape.width = 1;
+ msg.ptr_shape.height = 1;
+ msg.ptr_shape.hot_x = 0;
+ msg.ptr_shape.hot_y = 0;
+ msg.ptr_shape.data[0] = 0;
+ msg.ptr_shape.data[1] = 1;
+ msg.ptr_shape.data[2] = 1;
+ msg.ptr_shape.data[3] = 1;
+ synthvid_send(hdev, &msg);
+
+ return 0;
+}
+
+/* Send updated screen area (dirty rectangle) location to host */
+static int synthvid_update(struct fb_info *info)
+{
+ struct hv_device *hdev = device_to_hv_device(info->device);
+ struct synthvid_msg msg;
+
+ memset(&msg, 0, sizeof(struct synthvid_msg));
+
+ msg.vid_hdr.type = SYNTHVID_DIRT;
+ msg.vid_hdr.size = sizeof(struct synthvid_msg_hdr) +
+ sizeof(struct synthvid_dirt);
+ msg.dirt.video_output = 0;
+ msg.dirt.dirt_count = 1;
+ msg.dirt.rect[0].x1 = 0;
+ msg.dirt.rect[0].y1 = 0;
+ msg.dirt.rect[0].x2 = info->var.xres;
+ msg.dirt.rect[0].y2 = info->var.yres;
+
+ synthvid_send(hdev, &msg);
+
+ return 0;
+}
+
+
+/*
+ * Actions on received messages from host:
+ * Complete the wait event.
+ * Or, reply with screen and cursor info.
+ */
+static void synthvid_recv_sub(struct hv_device *hdev)
+{
+ struct fb_info *info = hv_get_drvdata(hdev);
+ struct hvfb_par *par;
+ struct synthvid_msg *msg;
+
+ if (!info)
+ return;
+
+ par = info->par;
+ msg = (struct synthvid_msg *)par->recv_buf;
+
+ /* Complete the wait event */
+ if (msg->vid_hdr.type == SYNTHVID_VERSION_RESPONSE ||
+ msg->vid_hdr.type == SYNTHVID_VRAM_LOCATION_ACK) {
+ memcpy(par->init_buf, msg, MAX_VMBUS_PKT_SIZE);
+ complete(&par->wait);
+ return;
+ }
+
+ /* Reply with screen and cursor info */
+ if (msg->vid_hdr.type == SYNTHVID_FEATURE_CHANGE) {
+ if (par->fb_ready) {
+ synthvid_send_ptr(hdev);
+ synthvid_send_situ(hdev);
+ }
+
+ par->update = msg->feature_chg.is_dirt_needed;
+ if (par->update)
+ schedule_delayed_work(&par->dwork, HVFB_UPDATE_DELAY);
+ }
+}
+
+/* Receive callback for messages from the host */
+static void synthvid_receive(void *ctx)
+{
+ struct hv_device *hdev = ctx;
+ struct fb_info *info = hv_get_drvdata(hdev);
+ struct hvfb_par *par;
+ struct synthvid_msg *recv_buf;
+ u32 bytes_recvd;
+ u64 req_id;
+ int ret;
+
+ if (!info)
+ return;
+
+ par = info->par;
+ recv_buf = (struct synthvid_msg *)par->recv_buf;
+
+ do {
+ ret = vmbus_recvpacket(hdev->channel, recv_buf,
+ MAX_VMBUS_PKT_SIZE,
+ &bytes_recvd, &req_id);
+ if (bytes_recvd > 0 &&
+ recv_buf->pipe_hdr.type == PIPE_MSG_DATA)
+ synthvid_recv_sub(hdev);
+ } while (bytes_recvd > 0 && ret == 0);
+}
+
+/* Check synthetic video protocol version with the host */
+static int synthvid_negotiate_ver(struct hv_device *hdev, u32 ver)
+{
+ struct fb_info *info = hv_get_drvdata(hdev);
+ struct hvfb_par *par = info->par;
+ struct synthvid_msg *msg = (struct synthvid_msg *)par->init_buf;
+ int t, ret = 0;
+
+ memset(msg, 0, sizeof(struct synthvid_msg));
+ msg->vid_hdr.type = SYNTHVID_VERSION_REQUEST;
+ msg->vid_hdr.size = sizeof(struct synthvid_msg_hdr) +
+ sizeof(struct synthvid_version_req);
+ msg->ver_req.version = ver;
+ synthvid_send(hdev, msg);
+
+ t = wait_for_completion_timeout(&par->wait, VSP_TIMEOUT);
+ if (!t) {
+ pr_err("Time out on waiting version response\n");
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+ if (!msg->ver_resp.is_accepted) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ par->synthvid_version = ver;
+
+out:
+ return ret;
+}
+
+/* Connect to VSP (Virtual Service Provider) on host */
+static int synthvid_connect_vsp(struct hv_device *hdev)
+{
+ struct fb_info *info = hv_get_drvdata(hdev);
+ struct hvfb_par *par = info->par;
+ int ret;
+
+ ret = vmbus_open(hdev->channel, RING_BUFSIZE, RING_BUFSIZE,
+ NULL, 0, synthvid_receive, hdev);
+ if (ret) {
+ pr_err("Unable to open vmbus channel\n");
+ return ret;
+ }
+
+ /* Negotiate the protocol version with host */
+ if (vmbus_proto_version == VERSION_WS2008 ||
+ vmbus_proto_version == VERSION_WIN7)
+ ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN7);
+ else
+ ret = synthvid_negotiate_ver(hdev, SYNTHVID_VERSION_WIN8);
+
+ if (ret) {
+ pr_err("Synthetic video device version not accepted\n");
+ goto error;
+ }
+
+ if (par->synthvid_version == SYNTHVID_VERSION_WIN7) {
+ screen_depth = SYNTHVID_DEPTH_WIN7;
+ screen_fb_size = SYNTHVID_FB_SIZE_WIN7;
+ } else {
+ screen_depth = SYNTHVID_DEPTH_WIN8;
+ screen_fb_size = SYNTHVID_FB_SIZE_WIN8;
+ }
+
+ return 0;
+
+error:
+ vmbus_close(hdev->channel);
+ return ret;
+}
+
+/* Send VRAM and Situation messages to the host */
+static int synthvid_send_config(struct hv_device *hdev)
+{
+ struct fb_info *info = hv_get_drvdata(hdev);
+ struct hvfb_par *par = info->par;
+ struct synthvid_msg *msg = (struct synthvid_msg *)par->init_buf;
+ int t, ret = 0;
+
+ /* Send VRAM location */
+ memset(msg, 0, sizeof(struct synthvid_msg));
+ msg->vid_hdr.type = SYNTHVID_VRAM_LOCATION;
+ msg->vid_hdr.size = sizeof(struct synthvid_msg_hdr) +
+ sizeof(struct synthvid_vram_location);
+ msg->vram.user_ctx = msg->vram.vram_gpa = info->fix.smem_start;
+ msg->vram.is_vram_gpa_specified = 1;
+ synthvid_send(hdev, msg);
+
+ t = wait_for_completion_timeout(&par->wait, VSP_TIMEOUT);
+ if (!t) {
+ pr_err("Time out on waiting vram location ack\n");
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+ if (msg->vram_ack.user_ctx != info->fix.smem_start) {
+ pr_err("Unable to set VRAM location\n");
+ ret = -ENODEV;
+ goto out;
+ }
+
+ /* Send pointer and situation update */
+ synthvid_send_ptr(hdev);
+ synthvid_send_situ(hdev);
+
+out:
+ return ret;
+}
+
+
+/*
+ * Delayed work callback:
+ * It is called at HVFB_UPDATE_DELAY or longer time interval to process
+ * screen updates. It is re-scheduled if further update is necessary.
+ */
+static void hvfb_update_work(struct work_struct *w)
+{
+ struct hvfb_par *par = container_of(w, struct hvfb_par, dwork.work);
+ struct fb_info *info = par->info;
+
+ if (par->fb_ready)
+ synthvid_update(info);
+
+ if (par->update)
+ schedule_delayed_work(&par->dwork, HVFB_UPDATE_DELAY);
+}
+
+
+/* Framebuffer operation handlers */
+
+static int hvfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ if (var->xres < HVFB_WIDTH_MIN || var->yres < HVFB_HEIGHT_MIN ||
+ var->xres > screen_width || var->yres > screen_height ||
+ var->bits_per_pixel != screen_depth)
+ return -EINVAL;
+
+ var->xres_virtual = var->xres;
+ var->yres_virtual = var->yres;
+
+ return 0;
+}
+
+static int hvfb_set_par(struct fb_info *info)
+{
+ struct hv_device *hdev = device_to_hv_device(info->device);
+
+ return synthvid_send_situ(hdev);
+}
+
+
+static inline u32 chan_to_field(u32 chan, struct fb_bitfield *bf)
+{
+ return ((chan & 0xffff) >> (16 - bf->length)) << bf->offset;
+}
+
+static int hvfb_setcolreg(unsigned regno, unsigned red, unsigned green,
+ unsigned blue, unsigned transp, struct fb_info *info)
+{
+ u32 *pal = info->pseudo_palette;
+
+ if (regno > 15)
+ return -EINVAL;
+
+ pal[regno] = chan_to_field(red, &info->var.red)
+ | chan_to_field(green, &info->var.green)
+ | chan_to_field(blue, &info->var.blue)
+ | chan_to_field(transp, &info->var.transp);
+
+ return 0;
+}
+
+
+static struct fb_ops hvfb_ops = {
+ .owner = THIS_MODULE,
+ .fb_check_var = hvfb_check_var,
+ .fb_set_par = hvfb_set_par,
+ .fb_setcolreg = hvfb_setcolreg,
+ .fb_fillrect = cfb_fillrect,
+ .fb_copyarea = cfb_copyarea,
+ .fb_imageblit = cfb_imageblit,
+};
+
+
+/* Get options from kernel paramenter "video=" */
+static void hvfb_get_option(struct fb_info *info)
+{
+ struct hvfb_par *par = info->par;
+ char *opt = NULL, *p;
+ uint x = 0, y = 0;
+
+ if (fb_get_options(KBUILD_MODNAME, &opt) || !opt || !*opt)
+ return;
+
+ p = strsep(&opt, "x");
+ if (!*p || kstrtouint(p, 0, &x) ||
+ !opt || !*opt || kstrtouint(opt, 0, &y)) {
+ pr_err("Screen option is invalid: skipped\n");
+ return;
+ }
+
+ if (x < HVFB_WIDTH_MIN || y < HVFB_HEIGHT_MIN ||
+ (par->synthvid_version == SYNTHVID_VERSION_WIN8 &&
+ x * y * screen_depth / 8 > SYNTHVID_FB_SIZE_WIN8) ||
+ (par->synthvid_version == SYNTHVID_VERSION_WIN7 &&
+ (x > SYNTHVID_WIDTH_MAX_WIN7 || y > SYNTHVID_HEIGHT_MAX_WIN7))) {
+ pr_err("Screen resolution option is out of range: skipped\n");
+ return;
+ }
+
+ screen_width = x;
+ screen_height = y;
+ return;
+}
+
+
+/* Get framebuffer memory from Hyper-V video pci space */
+static int hvfb_getmem(struct fb_info *info)
+{
+ struct pci_dev *pdev;
+ ulong fb_phys;
+ void __iomem *fb_virt;
+
+ pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT,
+ PCI_DEVICE_ID_HYPERV_VIDEO, NULL);
+ if (!pdev) {
+ pr_err("Unable to find PCI Hyper-V video\n");
+ return -ENODEV;
+ }
+
+ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+ pci_resource_len(pdev, 0) < screen_fb_size)
+ goto err1;
+
+ fb_phys = pci_resource_end(pdev, 0) - screen_fb_size + 1;
+ if (!request_mem_region(fb_phys, screen_fb_size, KBUILD_MODNAME))
+ goto err1;
+
+ fb_virt = ioremap(fb_phys, screen_fb_size);
+ if (!fb_virt)
+ goto err2;
+
+ info->apertures = alloc_apertures(1);
+ if (!info->apertures)
+ goto err3;
+
+ info->apertures->ranges[0].base = pci_resource_start(pdev, 0);
+ info->apertures->ranges[0].size = pci_resource_len(pdev, 0);
+ info->fix.smem_start = fb_phys;
+ info->fix.smem_len = screen_fb_size;
+ info->screen_base = fb_virt;
+ info->screen_size = screen_fb_size;
+
+ pci_dev_put(pdev);
+ return 0;
+
+err3:
+ iounmap(fb_virt);
+err2:
+ release_mem_region(fb_phys, screen_fb_size);
+err1:
+ pci_dev_put(pdev);
+ return -ENOMEM;
+}
+
+/* Release the framebuffer */
+static void hvfb_putmem(struct fb_info *info)
+{
+ iounmap(info->screen_base);
+ release_mem_region(info->fix.smem_start, screen_fb_size);
+}
+
+
+static int hvfb_probe(struct hv_device *hdev,
+ const struct hv_vmbus_device_id *dev_id)
+{
+ struct fb_info *info;
+ struct hvfb_par *par;
+ int ret;
+
+ info = framebuffer_alloc(sizeof(struct hvfb_par), &hdev->device);
+ if (!info) {
+ pr_err("No memory for framebuffer info\n");
+ return -ENOMEM;
+ }
+
+ par = info->par;
+ par->info = info;
+ par->fb_ready = false;
+ init_completion(&par->wait);
+ INIT_DELAYED_WORK(&par->dwork, hvfb_update_work);
+
+ /* Connect to VSP */
+ hv_set_drvdata(hdev, info);
+ ret = synthvid_connect_vsp(hdev);
+ if (ret) {
+ pr_err("Unable to connect to VSP\n");
+ goto error1;
+ }
+
+ ret = hvfb_getmem(info);
+ if (ret) {
+ pr_err("No memory for framebuffer\n");
+ goto error2;
+ }
+
+ hvfb_get_option(info);
+ pr_info("Screen resolution: %dx%d, Color depth: %d\n",
+ screen_width, screen_height, screen_depth);
+
+
+ /* Set up fb_info */
+ info->flags = FBINFO_DEFAULT;
+
+ info->var.xres_virtual = info->var.xres = screen_width;
+ info->var.yres_virtual = info->var.yres = screen_height;
+ info->var.bits_per_pixel = screen_depth;
+
+ if (info->var.bits_per_pixel == 16) {
+ info->var.red = (struct fb_bitfield){11, 5, 0};
+ info->var.green = (struct fb_bitfield){5, 6, 0};
+ info->var.blue = (struct fb_bitfield){0, 5, 0};
+ info->var.transp = (struct fb_bitfield){0, 0, 0};
+ } else {
+ info->var.red = (struct fb_bitfield){16, 8, 0};
+ info->var.green = (struct fb_bitfield){8, 8, 0};
+ info->var.blue = (struct fb_bitfield){0, 8, 0};
+ info->var.transp = (struct fb_bitfield){24, 8, 0};
+ }
+
+ info->var.activate = FB_ACTIVATE_NOW;
+ info->var.height = -1;
+ info->var.width = -1;
+ info->var.vmode = FB_VMODE_NONINTERLACED;
+
+ strcpy(info->fix.id, KBUILD_MODNAME);
+ info->fix.type = FB_TYPE_PACKED_PIXELS;
+ info->fix.visual = FB_VISUAL_TRUECOLOR;
+ info->fix.line_length = screen_width * screen_depth / 8;
+ info->fix.accel = FB_ACCEL_NONE;
+
+ info->fbops = &hvfb_ops;
+ info->pseudo_palette = par->pseudo_palette;
+
+ /* Send config to host */
+ ret = synthvid_send_config(hdev);
+ if (ret)
+ goto error;
+
+ ret = register_framebuffer(info);
+ if (ret) {
+ pr_err("Unable to register framebuffer\n");
+ goto error;
+ }
+
+ par->fb_ready = true;
+
+ return 0;
+
+error:
+ hvfb_putmem(info);
+error2:
+ vmbus_close(hdev->channel);
+error1:
+ cancel_delayed_work_sync(&par->dwork);
+ hv_set_drvdata(hdev, NULL);
+ framebuffer_release(info);
+ return ret;
+}
+
+
+static int hvfb_remove(struct hv_device *hdev)
+{
+ struct fb_info *info = hv_get_drvdata(hdev);
+ struct hvfb_par *par = info->par;
+
+ par->update = false;
+ par->fb_ready = false;
+
+ unregister_framebuffer(info);
+ cancel_delayed_work_sync(&par->dwork);
+
+ vmbus_close(hdev->channel);
+ hv_set_drvdata(hdev, NULL);
+
+ hvfb_putmem(info);
+ framebuffer_release(info);
+
+ return 0;
+}
+
+
+static const struct hv_vmbus_device_id id_table[] = {
+ /* Synthetic Video Device GUID */
+ {HV_SYNTHVID_GUID},
+ {}
+};
+
+MODULE_DEVICE_TABLE(vmbus, id_table);
+
+static struct hv_driver hvfb_drv = {
+ .name = KBUILD_MODNAME,
+ .id_table = id_table,
+ .probe = hvfb_probe,
+ .remove = hvfb_remove,
+};
+
+
+static int __init hvfb_drv_init(void)
+{
+ return vmbus_driver_register(&hvfb_drv);
+}
+
+static void __exit hvfb_drv_exit(void)
+{
+ vmbus_driver_unregister(&hvfb_drv);
+}
+
+module_init(hvfb_drv_init);
+module_exit(hvfb_drv_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(HV_DRV_VERSION);
+MODULE_DESCRIPTION("Microsoft Hyper-V Synthetic Video Frame Buffer Driver");
diff --git a/drivers/video/matrox/matroxfb_maven.c b/drivers/video/matrox/matroxfb_maven.c
index 217678e0b983..fd2897455696 100644
--- a/drivers/video/matrox/matroxfb_maven.c
+++ b/drivers/video/matrox/matroxfb_maven.c
@@ -137,8 +137,20 @@ static int* get_ctrl_ptr(struct maven_data* md, int idx) {
static int maven_get_reg(struct i2c_client* c, char reg) {
char dst;
- struct i2c_msg msgs[] = {{ c->addr, I2C_M_REV_DIR_ADDR, sizeof(reg), &reg },
- { c->addr, I2C_M_RD | I2C_M_NOSTART, sizeof(dst), &dst }};
+ struct i2c_msg msgs[] = {
+ {
+ .addr = c->addr,
+ .flags = I2C_M_REV_DIR_ADDR,
+ .len = sizeof(reg),
+ .buf = &reg
+ },
+ {
+ .addr = c->addr,
+ .flags = I2C_M_RD | I2C_M_NOSTART,
+ .len = sizeof(dst),
+ .buf = &dst
+ }
+ };
s32 err;
err = i2c_transfer(c->adapter, msgs, 2);
diff --git a/drivers/video/mmp/hw/mmp_ctrl.h b/drivers/video/mmp/hw/mmp_ctrl.h
index 6408d8ef3abb..edd2002b0e99 100644
--- a/drivers/video/mmp/hw/mmp_ctrl.h
+++ b/drivers/video/mmp/hw/mmp_ctrl.h
@@ -961,56 +961,7 @@ struct lcd_regs {
LCD_TVG_CUTVLN : PN2_LCD_GRA_CUTVLN) : LCD_GRA_CUTVLN)
/*
- * defined Video Memory Color format for DMA control 0 register
- * DMA0 bit[23:20]
- */
-#define VMODE_RGB565 0x0
-#define VMODE_RGB1555 0x1
-#define VMODE_RGB888PACKED 0x2
-#define VMODE_RGB888UNPACKED 0x3
-#define VMODE_RGBA888 0x4
-#define VMODE_YUV422PACKED 0x5
-#define VMODE_YUV422PLANAR 0x6
-#define VMODE_YUV420PLANAR 0x7
-#define VMODE_SMPNCMD 0x8
-#define VMODE_PALETTE4BIT 0x9
-#define VMODE_PALETTE8BIT 0xa
-#define VMODE_RESERVED 0xb
-
-/*
- * defined Graphic Memory Color format for DMA control 0 register
- * DMA0 bit[19:16]
- */
-#define GMODE_RGB565 0x0
-#define GMODE_RGB1555 0x1
-#define GMODE_RGB888PACKED 0x2
-#define GMODE_RGB888UNPACKED 0x3
-#define GMODE_RGBA888 0x4
-#define GMODE_YUV422PACKED 0x5
-#define GMODE_YUV422PLANAR 0x6
-#define GMODE_YUV420PLANAR 0x7
-#define GMODE_SMPNCMD 0x8
-#define GMODE_PALETTE4BIT 0x9
-#define GMODE_PALETTE8BIT 0xa
-#define GMODE_RESERVED 0xb
-
-/*
- * define for DMA control 1 register
- */
-#define DMA1_FRAME_TRIG 31 /* bit location */
-#define DMA1_VSYNC_MODE 28
-#define DMA1_VSYNC_INV 27
-#define DMA1_CKEY 24
-#define DMA1_CARRY 23
-#define DMA1_LNBUF_ENA 22
-#define DMA1_GATED_ENA 21
-#define DMA1_PWRDN_ENA 20
-#define DMA1_DSCALE 18
-#define DMA1_ALPHA_MODE 16
-#define DMA1_ALPHA 08
-#define DMA1_PXLCMD 00
-
-/*
+ * defined for Configure Dumb Mode
* defined for Configure Dumb Mode
* DUMB LCD Panel bit[31:28]
*/
@@ -1050,18 +1001,6 @@ struct lcd_regs {
#define CFG_CYC_BURST_LEN16 (1<<4)
#define CFG_CYC_BURST_LEN8 (0<<4)
-/*
- * defined Dumb Panel Clock Divider register
- * SCLK_Source bit[31]
- */
- /* 0: PLL clock select*/
-#define AXI_BUS_SEL 0x80000000
-#define CCD_CLK_SEL 0x40000000
-#define DCON_CLK_SEL 0x20000000
-#define ENA_CLK_INT_DIV CONFIG_FB_DOVE_CLCD_SCLK_DIV
-#define IDLE_CLK_INT_DIV 0x1 /* idle Integer Divider */
-#define DIS_CLK_INT_DIV 0x0 /* Disable Integer Divider */
-
/* SRAM ID */
#define SRAMID_GAMMA_YR 0x0
#define SRAMID_GAMMA_UG 0x1
@@ -1471,422 +1410,6 @@ struct dsi_regs {
#define LVDS_FREQ_OFFSET_MODE_CK_DIV4_OUT (0x1 << 1)
#define LVDS_FREQ_OFFSET_MODE_EN (0x1 << 0)
-/* VDMA */
-struct vdma_ch_regs {
-#define VDMA_DC_SADDR_1 0x320
-#define VDMA_DC_SADDR_2 0x3A0
-#define VDMA_DC_SZ_1 0x324
-#define VDMA_DC_SZ_2 0x3A4
-#define VDMA_CTRL_1 0x328
-#define VDMA_CTRL_2 0x3A8
-#define VDMA_SRC_SZ_1 0x32C
-#define VDMA_SRC_SZ_2 0x3AC
-#define VDMA_SA_1 0x330
-#define VDMA_SA_2 0x3B0
-#define VDMA_DA_1 0x334
-#define VDMA_DA_2 0x3B4
-#define VDMA_SZ_1 0x338
-#define VDMA_SZ_2 0x3B8
- u32 dc_saddr;
- u32 dc_size;
- u32 ctrl;
- u32 src_size;
- u32 src_addr;
- u32 dst_addr;
- u32 dst_size;
-#define VDMA_PITCH_1 0x33C
-#define VDMA_PITCH_2 0x3BC
-#define VDMA_ROT_CTRL_1 0x340
-#define VDMA_ROT_CTRL_2 0x3C0
-#define VDMA_RAM_CTRL0_1 0x344
-#define VDMA_RAM_CTRL0_2 0x3C4
-#define VDMA_RAM_CTRL1_1 0x348
-#define VDMA_RAM_CTRL1_2 0x3C8
- u32 pitch;
- u32 rot_ctrl;
- u32 ram_ctrl0;
- u32 ram_ctrl1;
-
-};
-struct vdma_regs {
-#define VDMA_ARBR_CTRL 0x300
-#define VDMA_IRQR 0x304
-#define VDMA_IRQM 0x308
-#define VDMA_IRQS 0x30C
-#define VDMA_MDMA_ARBR_CTRL 0x310
- u32 arbr_ctr;
- u32 irq_raw;
- u32 irq_mask;
- u32 irq_status;
- u32 mdma_arbr_ctrl;
- u32 reserved[3];
-
- struct vdma_ch_regs ch1;
- u32 reserved2[21];
- struct vdma_ch_regs ch2;
-};
-
-/* CMU */
-#define CMU_PIP_DE_H_CFG 0x0008
-#define CMU_PRI1_H_CFG 0x000C
-#define CMU_PRI2_H_CFG 0x0010
-#define CMU_ACE_MAIN_DE1_H_CFG 0x0014
-#define CMU_ACE_MAIN_DE2_H_CFG 0x0018
-#define CMU_ACE_PIP_DE1_H_CFG 0x001C
-#define CMU_ACE_PIP_DE2_H_CFG 0x0020
-#define CMU_PIP_DE_V_CFG 0x0024
-#define CMU_PRI_V_CFG 0x0028
-#define CMU_ACE_MAIN_DE_V_CFG 0x002C
-#define CMU_ACE_PIP_DE_V_CFG 0x0030
-#define CMU_BAR_0_CFG 0x0034
-#define CMU_BAR_1_CFG 0x0038
-#define CMU_BAR_2_CFG 0x003C
-#define CMU_BAR_3_CFG 0x0040
-#define CMU_BAR_4_CFG 0x0044
-#define CMU_BAR_5_CFG 0x0048
-#define CMU_BAR_6_CFG 0x004C
-#define CMU_BAR_7_CFG 0x0050
-#define CMU_BAR_8_CFG 0x0054
-#define CMU_BAR_9_CFG 0x0058
-#define CMU_BAR_10_CFG 0x005C
-#define CMU_BAR_11_CFG 0x0060
-#define CMU_BAR_12_CFG 0x0064
-#define CMU_BAR_13_CFG 0x0068
-#define CMU_BAR_14_CFG 0x006C
-#define CMU_BAR_15_CFG 0x0070
-#define CMU_BAR_CTRL 0x0074
-#define PATTERN_TOTAL 0x0078
-#define PATTERN_ACTIVE 0x007C
-#define PATTERN_FRONT_PORCH 0x0080
-#define PATTERN_BACK_PORCH 0x0084
-#define CMU_CLK_CTRL 0x0088
-
-#define CMU_ICSC_M_C0_L 0x0900
-#define CMU_ICSC_M_C0_H 0x0901
-#define CMU_ICSC_M_C1_L 0x0902
-#define CMU_ICSC_M_C1_H 0x0903
-#define CMU_ICSC_M_C2_L 0x0904
-#define CMU_ICSC_M_C2_H 0x0905
-#define CMU_ICSC_M_C3_L 0x0906
-#define CMU_ICSC_M_C3_H 0x0907
-#define CMU_ICSC_M_C4_L 0x0908
-#define CMU_ICSC_M_C4_H 0x0909
-#define CMU_ICSC_M_C5_L 0x090A
-#define CMU_ICSC_M_C5_H 0x090B
-#define CMU_ICSC_M_C6_L 0x090C
-#define CMU_ICSC_M_C6_H 0x090D
-#define CMU_ICSC_M_C7_L 0x090E
-#define CMU_ICSC_M_C7_H 0x090F
-#define CMU_ICSC_M_C8_L 0x0910
-#define CMU_ICSC_M_C8_H 0x0911
-#define CMU_ICSC_M_O1_0 0x0914
-#define CMU_ICSC_M_O1_1 0x0915
-#define CMU_ICSC_M_O1_2 0x0916
-#define CMU_ICSC_M_O2_0 0x0918
-#define CMU_ICSC_M_O2_1 0x0919
-#define CMU_ICSC_M_O2_2 0x091A
-#define CMU_ICSC_M_O3_0 0x091C
-#define CMU_ICSC_M_O3_1 0x091D
-#define CMU_ICSC_M_O3_2 0x091E
-#define CMU_ICSC_P_C0_L 0x0920
-#define CMU_ICSC_P_C0_H 0x0921
-#define CMU_ICSC_P_C1_L 0x0922
-#define CMU_ICSC_P_C1_H 0x0923
-#define CMU_ICSC_P_C2_L 0x0924
-#define CMU_ICSC_P_C2_H 0x0925
-#define CMU_ICSC_P_C3_L 0x0926
-#define CMU_ICSC_P_C3_H 0x0927
-#define CMU_ICSC_P_C4_L 0x0928
-#define CMU_ICSC_P_C4_H 0x0929
-#define CMU_ICSC_P_C5_L 0x092A
-#define CMU_ICSC_P_C5_H 0x092B
-#define CMU_ICSC_P_C6_L 0x092C
-#define CMU_ICSC_P_C6_H 0x092D
-#define CMU_ICSC_P_C7_L 0x092E
-#define CMU_ICSC_P_C7_H 0x092F
-#define CMU_ICSC_P_C8_L 0x0930
-#define CMU_ICSC_P_C8_H 0x0931
-#define CMU_ICSC_P_O1_0 0x0934
-#define CMU_ICSC_P_O1_1 0x0935
-#define CMU_ICSC_P_O1_2 0x0936
-#define CMU_ICSC_P_O2_0 0x0938
-#define CMU_ICSC_P_O2_1 0x0939
-#define CMU_ICSC_P_O2_2 0x093A
-#define CMU_ICSC_P_O3_0 0x093C
-#define CMU_ICSC_P_O3_1 0x093D
-#define CMU_ICSC_P_O3_2 0x093E
-#define CMU_BR_M_EN 0x0940
-#define CMU_BR_M_TH1_L 0x0942
-#define CMU_BR_M_TH1_H 0x0943
-#define CMU_BR_M_TH2_L 0x0944
-#define CMU_BR_M_TH2_H 0x0945
-#define CMU_ACE_M_EN 0x0950
-#define CMU_ACE_M_WFG1 0x0951
-#define CMU_ACE_M_WFG2 0x0952
-#define CMU_ACE_M_WFG3 0x0953
-#define CMU_ACE_M_TH0 0x0954
-#define CMU_ACE_M_TH1 0x0955
-#define CMU_ACE_M_TH2 0x0956
-#define CMU_ACE_M_TH3 0x0957
-#define CMU_ACE_M_TH4 0x0958
-#define CMU_ACE_M_TH5 0x0959
-#define CMU_ACE_M_OP0_L 0x095A
-#define CMU_ACE_M_OP0_H 0x095B
-#define CMU_ACE_M_OP5_L 0x095C
-#define CMU_ACE_M_OP5_H 0x095D
-#define CMU_ACE_M_GB2 0x095E
-#define CMU_ACE_M_GB3 0x095F
-#define CMU_ACE_M_MS1 0x0960
-#define CMU_ACE_M_MS2 0x0961
-#define CMU_ACE_M_MS3 0x0962
-#define CMU_BR_P_EN 0x0970
-#define CMU_BR_P_TH1_L 0x0972
-#define CMU_BR_P_TH1_H 0x0973
-#define CMU_BR_P_TH2_L 0x0974
-#define CMU_BR_P_TH2_H 0x0975
-#define CMU_ACE_P_EN 0x0980
-#define CMU_ACE_P_WFG1 0x0981
-#define CMU_ACE_P_WFG2 0x0982
-#define CMU_ACE_P_WFG3 0x0983
-#define CMU_ACE_P_TH0 0x0984
-#define CMU_ACE_P_TH1 0x0985
-#define CMU_ACE_P_TH2 0x0986
-#define CMU_ACE_P_TH3 0x0987
-#define CMU_ACE_P_TH4 0x0988
-#define CMU_ACE_P_TH5 0x0989
-#define CMU_ACE_P_OP0_L 0x098A
-#define CMU_ACE_P_OP0_H 0x098B
-#define CMU_ACE_P_OP5_L 0x098C
-#define CMU_ACE_P_OP5_H 0x098D
-#define CMU_ACE_P_GB2 0x098E
-#define CMU_ACE_P_GB3 0x098F
-#define CMU_ACE_P_MS1 0x0990
-#define CMU_ACE_P_MS2 0x0991
-#define CMU_ACE_P_MS3 0x0992
-#define CMU_FTDC_M_EN 0x09A0
-#define CMU_FTDC_P_EN 0x09A1
-#define CMU_FTDC_INLOW_L 0x09A2
-#define CMU_FTDC_INLOW_H 0x09A3
-#define CMU_FTDC_INHIGH_L 0x09A4
-#define CMU_FTDC_INHIGH_H 0x09A5
-#define CMU_FTDC_OUTLOW_L 0x09A6
-#define CMU_FTDC_OUTLOW_H 0x09A7
-#define CMU_FTDC_OUTHIGH_L 0x09A8
-#define CMU_FTDC_OUTHIGH_H 0x09A9
-#define CMU_FTDC_YLOW 0x09AA
-#define CMU_FTDC_YHIGH 0x09AB
-#define CMU_FTDC_CH1 0x09AC
-#define CMU_FTDC_CH2_L 0x09AE
-#define CMU_FTDC_CH2_H 0x09AF
-#define CMU_FTDC_CH3_L 0x09B0
-#define CMU_FTDC_CH3_H 0x09B1
-#define CMU_FTDC_1_C00_6 0x09B2
-#define CMU_FTDC_1_C01_6 0x09B8
-#define CMU_FTDC_1_C11_6 0x09BE
-#define CMU_FTDC_1_C10_6 0x09C4
-#define CMU_FTDC_1_OFF00_6 0x09CA
-#define CMU_FTDC_1_OFF10_6 0x09D0
-#define CMU_HS_M_EN 0x0A00
-#define CMU_HS_M_AX1_L 0x0A02
-#define CMU_HS_M_AX1_H 0x0A03
-#define CMU_HS_M_AX2_L 0x0A04
-#define CMU_HS_M_AX2_H 0x0A05
-#define CMU_HS_M_AX3_L 0x0A06
-#define CMU_HS_M_AX3_H 0x0A07
-#define CMU_HS_M_AX4_L 0x0A08
-#define CMU_HS_M_AX4_H 0x0A09
-#define CMU_HS_M_AX5_L 0x0A0A
-#define CMU_HS_M_AX5_H 0x0A0B
-#define CMU_HS_M_AX6_L 0x0A0C
-#define CMU_HS_M_AX6_H 0x0A0D
-#define CMU_HS_M_AX7_L 0x0A0E
-#define CMU_HS_M_AX7_H 0x0A0F
-#define CMU_HS_M_AX8_L 0x0A10
-#define CMU_HS_M_AX8_H 0x0A11
-#define CMU_HS_M_AX9_L 0x0A12
-#define CMU_HS_M_AX9_H 0x0A13
-#define CMU_HS_M_AX10_L 0x0A14
-#define CMU_HS_M_AX10_H 0x0A15
-#define CMU_HS_M_AX11_L 0x0A16
-#define CMU_HS_M_AX11_H 0x0A17
-#define CMU_HS_M_AX12_L 0x0A18
-#define CMU_HS_M_AX12_H 0x0A19
-#define CMU_HS_M_AX13_L 0x0A1A
-#define CMU_HS_M_AX13_H 0x0A1B
-#define CMU_HS_M_AX14_L 0x0A1C
-#define CMU_HS_M_AX14_H 0x0A1D
-#define CMU_HS_M_H1_H14 0x0A1E
-#define CMU_HS_M_S1_S14 0x0A2C
-#define CMU_HS_M_GL 0x0A3A
-#define CMU_HS_M_MAXSAT_RGB_Y_L 0x0A3C
-#define CMU_HS_M_MAXSAT_RGB_Y_H 0x0A3D
-#define CMU_HS_M_MAXSAT_RCR_L 0x0A3E
-#define CMU_HS_M_MAXSAT_RCR_H 0x0A3F
-#define CMU_HS_M_MAXSAT_RCB_L 0x0A40
-#define CMU_HS_M_MAXSAT_RCB_H 0x0A41
-#define CMU_HS_M_MAXSAT_GCR_L 0x0A42
-#define CMU_HS_M_MAXSAT_GCR_H 0x0A43
-#define CMU_HS_M_MAXSAT_GCB_L 0x0A44
-#define CMU_HS_M_MAXSAT_GCB_H 0x0A45
-#define CMU_HS_M_MAXSAT_BCR_L 0x0A46
-#define CMU_HS_M_MAXSAT_BCR_H 0x0A47
-#define CMU_HS_M_MAXSAT_BCB_L 0x0A48
-#define CMU_HS_M_MAXSAT_BCB_H 0x0A49
-#define CMU_HS_M_ROFF_L 0x0A4A
-#define CMU_HS_M_ROFF_H 0x0A4B
-#define CMU_HS_M_GOFF_L 0x0A4C
-#define CMU_HS_M_GOFF_H 0x0A4D
-#define CMU_HS_M_BOFF_L 0x0A4E
-#define CMU_HS_M_BOFF_H 0x0A4F
-#define CMU_HS_P_EN 0x0A50
-#define CMU_HS_P_AX1_L 0x0A52
-#define CMU_HS_P_AX1_H 0x0A53
-#define CMU_HS_P_AX2_L 0x0A54
-#define CMU_HS_P_AX2_H 0x0A55
-#define CMU_HS_P_AX3_L 0x0A56
-#define CMU_HS_P_AX3_H 0x0A57
-#define CMU_HS_P_AX4_L 0x0A58
-#define CMU_HS_P_AX4_H 0x0A59
-#define CMU_HS_P_AX5_L 0x0A5A
-#define CMU_HS_P_AX5_H 0x0A5B
-#define CMU_HS_P_AX6_L 0x0A5C
-#define CMU_HS_P_AX6_H 0x0A5D
-#define CMU_HS_P_AX7_L 0x0A5E
-#define CMU_HS_P_AX7_H 0x0A5F
-#define CMU_HS_P_AX8_L 0x0A60
-#define CMU_HS_P_AX8_H 0x0A61
-#define CMU_HS_P_AX9_L 0x0A62
-#define CMU_HS_P_AX9_H 0x0A63
-#define CMU_HS_P_AX10_L 0x0A64
-#define CMU_HS_P_AX10_H 0x0A65
-#define CMU_HS_P_AX11_L 0x0A66
-#define CMU_HS_P_AX11_H 0x0A67
-#define CMU_HS_P_AX12_L 0x0A68
-#define CMU_HS_P_AX12_H 0x0A69
-#define CMU_HS_P_AX13_L 0x0A6A
-#define CMU_HS_P_AX13_H 0x0A6B
-#define CMU_HS_P_AX14_L 0x0A6C
-#define CMU_HS_P_AX14_H 0x0A6D
-#define CMU_HS_P_H1_H14 0x0A6E
-#define CMU_HS_P_S1_S14 0x0A7C
-#define CMU_HS_P_GL 0x0A8A
-#define CMU_HS_P_MAXSAT_RGB_Y_L 0x0A8C
-#define CMU_HS_P_MAXSAT_RGB_Y_H 0x0A8D
-#define CMU_HS_P_MAXSAT_RCR_L 0x0A8E
-#define CMU_HS_P_MAXSAT_RCR_H 0x0A8F
-#define CMU_HS_P_MAXSAT_RCB_L 0x0A90
-#define CMU_HS_P_MAXSAT_RCB_H 0x0A91
-#define CMU_HS_P_MAXSAT_GCR_L 0x0A92
-#define CMU_HS_P_MAXSAT_GCR_H 0x0A93
-#define CMU_HS_P_MAXSAT_GCB_L 0x0A94
-#define CMU_HS_P_MAXSAT_GCB_H 0x0A95
-#define CMU_HS_P_MAXSAT_BCR_L 0x0A96
-#define CMU_HS_P_MAXSAT_BCR_H 0x0A97
-#define CMU_HS_P_MAXSAT_BCB_L 0x0A98
-#define CMU_HS_P_MAXSAT_BCB_H 0x0A99
-#define CMU_HS_P_ROFF_L 0x0A9A
-#define CMU_HS_P_ROFF_H 0x0A9B
-#define CMU_HS_P_GOFF_L 0x0A9C
-#define CMU_HS_P_GOFF_H 0x0A9D
-#define CMU_HS_P_BOFF_L 0x0A9E
-#define CMU_HS_P_BOFF_H 0x0A9F
-#define CMU_GLCSC_M_C0_L 0x0AA0
-#define CMU_GLCSC_M_C0_H 0x0AA1
-#define CMU_GLCSC_M_C1_L 0x0AA2
-#define CMU_GLCSC_M_C1_H 0x0AA3
-#define CMU_GLCSC_M_C2_L 0x0AA4
-#define CMU_GLCSC_M_C2_H 0x0AA5
-#define CMU_GLCSC_M_C3_L 0x0AA6
-#define CMU_GLCSC_M_C3_H 0x0AA7
-#define CMU_GLCSC_M_C4_L 0x0AA8
-#define CMU_GLCSC_M_C4_H 0x0AA9
-#define CMU_GLCSC_M_C5_L 0x0AAA
-#define CMU_GLCSC_M_C5_H 0x0AAB
-#define CMU_GLCSC_M_C6_L 0x0AAC
-#define CMU_GLCSC_M_C6_H 0x0AAD
-#define CMU_GLCSC_M_C7_L 0x0AAE
-#define CMU_GLCSC_M_C7_H 0x0AAF
-#define CMU_GLCSC_M_C8_L 0x0AB0
-#define CMU_GLCSC_M_C8_H 0x0AB1
-#define CMU_GLCSC_M_O1_1 0x0AB4
-#define CMU_GLCSC_M_O1_2 0x0AB5
-#define CMU_GLCSC_M_O1_3 0x0AB6
-#define CMU_GLCSC_M_O2_1 0x0AB8
-#define CMU_GLCSC_M_O2_2 0x0AB9
-#define CMU_GLCSC_M_O2_3 0x0ABA
-#define CMU_GLCSC_M_O3_1 0x0ABC
-#define CMU_GLCSC_M_O3_2 0x0ABD
-#define CMU_GLCSC_M_O3_3 0x0ABE
-#define CMU_GLCSC_P_C0_L 0x0AC0
-#define CMU_GLCSC_P_C0_H 0x0AC1
-#define CMU_GLCSC_P_C1_L 0x0AC2
-#define CMU_GLCSC_P_C1_H 0x0AC3
-#define CMU_GLCSC_P_C2_L 0x0AC4
-#define CMU_GLCSC_P_C2_H 0x0AC5
-#define CMU_GLCSC_P_C3_L 0x0AC6
-#define CMU_GLCSC_P_C3_H 0x0AC7
-#define CMU_GLCSC_P_C4_L 0x0AC8
-#define CMU_GLCSC_P_C4_H 0x0AC9
-#define CMU_GLCSC_P_C5_L 0x0ACA
-#define CMU_GLCSC_P_C5_H 0x0ACB
-#define CMU_GLCSC_P_C6_L 0x0ACC
-#define CMU_GLCSC_P_C6_H 0x0ACD
-#define CMU_GLCSC_P_C7_L 0x0ACE
-#define CMU_GLCSC_P_C7_H 0x0ACF
-#define CMU_GLCSC_P_C8_L 0x0AD0
-#define CMU_GLCSC_P_C8_H 0x0AD1
-#define CMU_GLCSC_P_O1_1 0x0AD4
-#define CMU_GLCSC_P_O1_2 0x0AD5
-#define CMU_GLCSC_P_O1_3 0x0AD6
-#define CMU_GLCSC_P_O2_1 0x0AD8
-#define CMU_GLCSC_P_O2_2 0x0AD9
-#define CMU_GLCSC_P_O2_3 0x0ADA
-#define CMU_GLCSC_P_O3_1 0x0ADC
-#define CMU_GLCSC_P_O3_2 0x0ADD
-#define CMU_GLCSC_P_O3_3 0x0ADE
-#define CMU_PIXVAL_M_EN 0x0AE0
-#define CMU_PIXVAL_P_EN 0x0AE1
-
-#define CMU_CLK_CTRL_TCLK 0x0
-#define CMU_CLK_CTRL_SCLK 0x2
-#define CMU_CLK_CTRL_MSK 0x2
-#define CMU_CLK_CTRL_ENABLE 0x1
-
-#define LCD_TOP_CTRL_TV 0x2
-#define LCD_TOP_CTRL_PN 0x0
-#define LCD_TOP_CTRL_SEL_MSK 0x2
-#define LCD_IO_CMU_IN_SEL_MSK (0x3 << 20)
-#define LCD_IO_CMU_IN_SEL_TV 0
-#define LCD_IO_CMU_IN_SEL_PN 1
-#define LCD_IO_CMU_IN_SEL_PN2 2
-#define LCD_IO_TV_OUT_SEL_MSK (0x3 << 26)
-#define LCD_IO_PN_OUT_SEL_MSK (0x3 << 24)
-#define LCD_IO_PN2_OUT_SEL_MSK (0x3 << 28)
-#define LCD_IO_TV_OUT_SEL_NON 3
-#define LCD_IO_PN_OUT_SEL_NON 3
-#define LCD_IO_PN2_OUT_SEL_NON 3
-#define LCD_TOP_CTRL_CMU_ENABLE 0x1
-#define LCD_IO_OVERL_MSK 0xC00000
-#define LCD_IO_OVERL_TV 0x0
-#define LCD_IO_OVERL_LCD1 0x400000
-#define LCD_IO_OVERL_LCD2 0xC00000
-#define HINVERT_MSK 0x4
-#define VINVERT_MSK 0x8
-#define HINVERT_LEN 0x2
-#define VINVERT_LEN 0x3
-
-#define CMU_CTRL 0x88
-#define CMU_CTRL_A0_MSK 0x6
-#define CMU_CTRL_A0_TV 0x0
-#define CMU_CTRL_A0_LCD1 0x1
-#define CMU_CTRL_A0_LCD2 0x2
-#define CMU_CTRL_A0_HDMI 0x3
-
-#define ICR_DRV_ROUTE_OFF 0x0
-#define ICR_DRV_ROUTE_TV 0x1
-#define ICR_DRV_ROUTE_LCD1 0x2
-#define ICR_DRV_ROUTE_LCD2 0x3
-
enum {
PATH_PN = 0,
PATH_TV,
diff --git a/fs/Makefile b/fs/Makefile
index 9d53192236fc..3b2c76759ec9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o super.o \
ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
- pnode.o drop_caches.o splice.o sync.o utimes.o \
+ pnode.o splice.o sync.o utimes.o \
stack.o fs_struct.o statfs.o
ifeq ($(CONFIG_BLOCK),y)
@@ -49,6 +49,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
obj-$(CONFIG_COREDUMP) += coredump.o
+obj-$(CONFIG_SYSCTL) += drop_caches.o
obj-$(CONFIG_FHANDLE) += fhandle.o
diff --git a/fs/buffer.c b/fs/buffer.c
index b4dcb34c9635..10ef81e10b20 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -865,8 +865,6 @@ try_again:
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
-
- init_buffer(bh, NULL, NULL);
}
return head;
/*
@@ -2949,7 +2947,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
}
}
-int submit_bh(int rw, struct buffer_head * bh)
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
{
struct bio *bio;
int ret = 0;
@@ -2984,6 +2982,7 @@ int submit_bh(int rw, struct buffer_head * bh)
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
+ bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
guard_bh_eod(rw, bio, bh);
@@ -2997,6 +2996,12 @@ int submit_bh(int rw, struct buffer_head * bh)
bio_put(bio);
return ret;
}
+EXPORT_SYMBOL_GPL(_submit_bh);
+
+int submit_bh(int rw, struct buffer_head *bh)
+{
+ return _submit_bh(rw, bh, 0);
+}
EXPORT_SYMBOL(submit_bh);
/**
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f853263cf74f..cfb816dc6d9f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -672,12 +672,6 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
if (sdio->final_block_in_bio != sdio->cur_page_block ||
cur_offset != bio_next_offset)
dio_bio_submit(dio, sdio);
- /*
- * Submit now if the underlying fs is about to perform a
- * metadata read
- */
- else if (sdio->boundary)
- dio_bio_submit(dio, sdio);
}
if (sdio->bio == NULL) {
@@ -737,16 +731,6 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
sdio->cur_page_block +
(sdio->cur_page_len >> sdio->blkbits) == blocknr) {
sdio->cur_page_len += len;
-
- /*
- * If sdio->boundary then we want to schedule the IO now to
- * avoid metadata seeks.
- */
- if (sdio->boundary) {
- ret = dio_send_cur_page(dio, sdio, map_bh);
- page_cache_release(sdio->cur_page);
- sdio->cur_page = NULL;
- }
goto out;
}
@@ -758,7 +742,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
page_cache_release(sdio->cur_page);
sdio->cur_page = NULL;
if (ret)
- goto out;
+ return ret;
}
page_cache_get(page); /* It is in dio */
@@ -768,6 +752,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
sdio->cur_page_block = blocknr;
sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
out:
+ /*
+ * If sdio->boundary then we want to schedule the IO now to
+ * avoid metadata seeks.
+ */
+ if (sdio->boundary) {
+ ret = dio_send_cur_page(dio, sdio, map_bh);
+ dio_bio_submit(dio, sdio);
+ page_cache_release(sdio->cur_page);
+ sdio->cur_page = NULL;
+ }
return ret;
}
@@ -969,7 +963,8 @@ do_holes:
this_chunk_bytes = this_chunk_blocks << blkbits;
BUG_ON(this_chunk_bytes == 0);
- sdio->boundary = buffer_boundary(map_bh);
+ if (this_chunk_blocks == sdio->blocks_available)
+ sdio->boundary = buffer_boundary(map_bh);
ret = submit_page_section(dio, sdio, page,
offset_in_page,
this_chunk_bytes,
diff --git a/fs/exec.c b/fs/exec.c
index a96a4885bbbf..87e731f020fb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -613,7 +613,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : 0);
+ vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@@ -622,7 +622,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : 0);
+ vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
}
tlb_finish_mmu(&tlb, new_end, old_end);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index fb5120a5505c..3dc48cc8b6eb 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2067,7 +2067,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
- sb->s_flags |= MS_SNAP_STABLE;
return 0;
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 8179e8bc4a3d..40d13c70ef51 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -287,5 +287,5 @@ const struct file_operations fscache_stats_fops = {
.open = fscache_stats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = single_release,
};
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 86b39b167c23..11bb11f48b3a 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -162,8 +162,17 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
for (i = 0; i < bufs; i++) {
wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(write_op, wbuf[i]);
+ /*
+ * Here we write back pagecache data that may be mmaped. Since
+ * we cannot afford to clean the page and set PageWriteback
+ * here due to lock ordering (page lock ranks above transaction
+ * start), the data can change while IO is in flight. Tell the
+ * block layer it should bounce the bio pages if stable data
+ * during write is required.
+ *
+ * We use up our safety reference in submit_bh().
+ */
+ _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
}
}
@@ -667,7 +676,17 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(write_op, bh);
+ /*
+ * In data=journal mode, here we can end up
+ * writing pagecache data that might be
+ * mmapped. Since we can't afford to clean the
+ * page and set PageWriteback (see the comment
+ * near the other use of _submit_bh()), the
+ * data can change while the write is in
+ * flight. Tell the block layer to bounce the
+ * bio pages if stable pages are required.
+ */
+ _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
}
cond_resched();
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 81cc7eaff863..865c4308acb6 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -310,8 +310,6 @@ int journal_write_metadata_buffer(transaction_t *transaction,
new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
/* keep subsequent assertions sane */
- new_bh->b_state = 0;
- init_buffer(new_bh, NULL, NULL);
atomic_set(&new_bh->b_count, 1);
new_jh = journal_add_journal_head(new_bh); /* This sleeps */
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ed10991ab006..8b220f1ab54f 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -367,8 +367,6 @@ retry_alloc:
}
/* keep subsequent assertions sane */
- new_bh->b_state = 0;
- init_buffer(new_bh, NULL, NULL);
atomic_set(&new_bh->b_count, 1);
new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index eeac97bb3bfa..b3fdd1a323d6 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1498,10 +1498,8 @@ leave:
dlm_put(dlm);
if (ret < 0) {
- if (buf)
- kfree(buf);
- if (item)
- kfree(item);
+ kfree(buf);
+ kfree(item);
mlog_errno(ret);
}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 752f0b26221d..0c60ef2d8056 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -101,13 +101,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if (!S_ISDIR(inode->i_mode))
flags &= ~OCFS2_DIRSYNC_FL;
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto bail_unlock;
- }
-
oldflags = ocfs2_inode->ip_attr;
flags = flags & mask;
flags |= oldflags & ~mask;
@@ -120,7 +113,14 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE))
- goto bail_commit;
+ goto bail_unlock;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
}
ocfs2_inode->ip_attr = flags;
@@ -130,8 +130,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if (status < 0)
mlog_errno(status);
-bail_commit:
ocfs2_commit_trans(osb, handle);
+
bail_unlock:
ocfs2_inode_unlock(inode, 1);
bail:
@@ -706,8 +706,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
o2info_set_request_filled(&oiff->iff_req);
- if (o2info_to_user(*oiff, req))
+ if (o2info_to_user(*oiff, req)) {
+ status = -EFAULT;
goto bail;
+ }
status = 0;
bail:
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 9f8dcadd9a50..f1fc172175b6 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -471,7 +471,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
int ret, goal_bit = 0;
struct buffer_head *gd_bh = NULL;
- struct ocfs2_group_desc *bg = NULL;
+ struct ocfs2_group_desc *bg;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int c_to_b = 1 << (osb->s_clustersize_bits -
inode->i_sb->s_blocksize_bits);
@@ -482,13 +482,6 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
range->me_goal);
/*
- * moving goal is not allowd to start with a group desc blok(#0 blk)
- * let's compromise to the latter cluster.
- */
- if (range->me_goal == le64_to_cpu(bg->bg_blkno))
- range->me_goal += c_to_b;
-
- /*
* validate goal sits within global_bitmap, and return the victim
* group desc
*/
@@ -502,6 +495,13 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
bg = (struct ocfs2_group_desc *)gd_bh->b_data;
/*
+ * moving goal is not allowd to start with a group desc blok(#0 blk)
+ * let's compromise to the latter cluster.
+ */
+ if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+ range->me_goal += c_to_b;
+
+ /*
* movement is not gonna cross two groups.
*/
if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
@@ -1057,42 +1057,40 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
struct inode *inode = file_inode(filp);
struct ocfs2_move_extents range;
- struct ocfs2_move_extents_context *context = NULL;
+ struct ocfs2_move_extents_context *context;
+
+ if (!argp)
+ return -EINVAL;
status = mnt_want_write_file(filp);
if (status)
return status;
if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
- goto out;
+ goto out_drop;
if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
status = -EPERM;
- goto out;
+ goto out_drop;
}
context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
if (!context) {
status = -ENOMEM;
mlog_errno(status);
- goto out;
+ goto out_drop;
}
context->inode = inode;
context->file = filp;
- if (argp) {
- if (copy_from_user(&range, argp, sizeof(range))) {
- status = -EFAULT;
- goto out;
- }
- } else {
- status = -EINVAL;
- goto out;
+ if (copy_from_user(&range, argp, sizeof(range))) {
+ status = -EFAULT;
+ goto out_free;
}
if (range.me_start > i_size_read(inode))
- goto out;
+ goto out_free;
if (range.me_start + range.me_len > i_size_read(inode))
range.me_len = i_size_read(inode) - range.me_start;
@@ -1124,25 +1122,24 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
status = ocfs2_validate_and_adjust_move_goal(inode, &range);
if (status)
- goto out;
+ goto out_copy;
}
status = ocfs2_move_extents(context);
if (status)
mlog_errno(status);
-out:
+out_copy:
/*
* movement/defragmentation may end up being partially completed,
* that's the reason why we need to return userspace the finished
* length and new_offset even if failure happens somewhere.
*/
- if (argp) {
- if (copy_to_user(argp, &range, sizeof(range)))
- status = -EFAULT;
- }
+ if (copy_to_user(argp, &range, sizeof(range)))
+ status = -EFAULT;
+out_free:
kfree(context);
-
+out_drop:
mnt_drop_write_file(filp);
return status;
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 712f24db9600..ab30716584f5 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,7 +5,7 @@
obj-y += proc.o
proc-y := nommu.o task_nommu.o
-proc-$(CONFIG_MMU) := mmu.o task_mmu.o
+proc-$(CONFIG_MMU) := task_mmu.o
proc-y += inode.o root.o base.o generic.o array.o \
fd.o
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 85ff3a4598b3..75710357a517 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -30,24 +30,6 @@ extern int proc_net_init(void);
static inline int proc_net_init(void) { return 0; }
#endif
-struct vmalloc_info {
- unsigned long used;
- unsigned long largest_chunk;
-};
-
-#ifdef CONFIG_MMU
-#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
-extern void get_vmalloc_info(struct vmalloc_info *vmi);
-#else
-
-#define VMALLOC_TOTAL 0UL
-#define get_vmalloc_info(vmi) \
-do { \
- (vmi)->used = 0; \
- (vmi)->largest_chunk = 0; \
-} while(0)
-#endif
-
extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index eda6f017f272..f6a13f489e30 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -15,6 +15,7 @@
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
@@ -564,7 +565,6 @@ static const struct file_operations proc_kcore_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_MEMORY_HOTPLUG
/* just remember that we have to update kcore */
static int __meminit kcore_callback(struct notifier_block *self,
unsigned long action, void *arg)
@@ -578,8 +578,11 @@ static int __meminit kcore_callback(struct notifier_block *self,
}
return NOTIFY_OK;
}
-#endif
+static struct notifier_block kcore_callback_nb __meminitdata = {
+ .notifier_call = kcore_callback,
+ .priority = 0,
+};
static struct kcore_list kcore_vmalloc;
@@ -631,7 +634,7 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- hotplug_memory_notifier(kcore_callback, 0);
+ register_hotmemory_notifier(&kcore_callback_nb);
return 0;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 1efaaa19c4f3..5aa847a603c0 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -11,6 +11,7 @@
#include <linux/swap.h>
#include <linux/vmstat.h>
#include <linux/atomic.h>
+#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "internal.h"
diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c
deleted file mode 100644
index 8ae221dfd010..000000000000
--- a/fs/proc/mmu.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* mmu.c: mmu memory info files
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/spinlock.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <asm/pgtable.h>
-#include "internal.h"
-
-void get_vmalloc_info(struct vmalloc_info *vmi)
-{
- struct vm_struct *vma;
- unsigned long free_area_size;
- unsigned long prev_end;
-
- vmi->used = 0;
-
- if (!vmlist) {
- vmi->largest_chunk = VMALLOC_TOTAL;
- }
- else {
- vmi->largest_chunk = 0;
-
- prev_end = VMALLOC_START;
-
- read_lock(&vmlist_lock);
-
- for (vma = vmlist; vma; vma = vma->next) {
- unsigned long addr = (unsigned long) vma->addr;
-
- /*
- * Some archs keep another range for modules in vmlist
- */
- if (addr < VMALLOC_START)
- continue;
- if (addr >= VMALLOC_END)
- break;
-
- vmi->used += vma->size;
-
- free_area_size = addr - prev_end;
- if (vmi->largest_chunk < free_area_size)
- vmi->largest_chunk = free_area_size;
-
- prev_end = vma->size + addr;
- }
-
- if (VMALLOC_END - prev_end > vmi->largest_chunk)
- vmi->largest_chunk = VMALLOC_END - prev_end;
-
- read_unlock(&vmlist_lock);
- }
-}
diff --git a/fs/read_write.c b/fs/read_write.c
index e6ddc8dceb96..7a648911246b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -128,7 +128,7 @@ EXPORT_SYMBOL(generic_file_llseek_size);
*
* This is a generic implemenation of ->llseek useable for all normal local
* filesystems. It just updates the file offset to the value specified by
- * @offset and @whence under i_mutex.
+ * @offset and @whence.
*/
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
new file mode 100644
index 000000000000..d06079c774a0
--- /dev/null
+++ b/include/asm-generic/hugetlb.h
@@ -0,0 +1,40 @@
+#ifndef _ASM_GENERIC_HUGETLB_H
+#define _ASM_GENERIC_HUGETLB_H
+
+static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
+{
+ return mk_pte(page, pgprot);
+}
+
+static inline int huge_pte_write(pte_t pte)
+{
+ return pte_write(pte);
+}
+
+static inline int huge_pte_dirty(pte_t pte)
+{
+ return pte_dirty(pte);
+}
+
+static inline pte_t huge_pte_mkwrite(pte_t pte)
+{
+ return pte_mkwrite(pte);
+}
+
+static inline pte_t huge_pte_mkdirty(pte_t pte)
+{
+ return pte_mkdirty(pte);
+}
+
+static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
+{
+ return pte_modify(pte, newprot);
+}
+
+static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pte_clear(mm, addr, ptep);
+}
+
+#endif /* _ASM_GENERIC_HUGETLB_H */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index bfd87685fc1f..a59ff51b0166 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -7,6 +7,16 @@
#include <linux/mm_types.h>
#include <linux/bug.h>
+/*
+ * On almost all architectures and configurations, 0 can be used as the
+ * upper ceiling to free_pgtables(): on many architectures it has the same
+ * effect as using TASK_SIZE. However, there is one configuration which
+ * must impose a more careful limit, to avoid freeing kernel pgtables.
+ */
+#ifndef USER_PGTABLES_CEILING
+#define USER_PGTABLES_CEILING 0UL
+#endif
+
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cdf11191e645..22990cf4439d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -111,12 +111,13 @@ struct bio {
#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */
#define BIO_QUIET 10 /* Make BIO Quiet */
#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
+#define BIO_SNAP_STABLE 12 /* bio data must be snapshotted during write */
/*
* Flags starting here get preserved by bio_reset() - this includes
* BIO_POOL_IDX()
*/
-#define BIO_RESET_BITS 12
+#define BIO_RESET_BITS 13
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab01832..5f0b0e1f7c08 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -44,7 +44,6 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat,
unsigned long endpfn);
extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
-extern unsigned long free_low_memory_core_early(int nodeid);
extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
extern unsigned long free_all_bootmem(void);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 5afc4f94d110..4c16c4a88d47 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -181,6 +181,7 @@ void ll_rw_block(int, int, struct buffer_head * bh[]);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, int rw);
void write_dirty_buffer(struct buffer_head *bh, int rw);
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags);
int submit_bh(int, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 900af5964f55..470073bf93d0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -687,13 +687,6 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
-/*
- * Get a cgroup whose id is greater than or equal to id under tree of root.
- * Returning a cgroup_subsys_state or NULL.
- */
-struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
- struct cgroup_subsys_state *root, int *foundid);
-
/* Returns true if root is ancestor of cg */
bool css_is_ancestor(struct cgroup_subsys_state *cg,
const struct cgroup_subsys_state *root);
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 3bd46f766751..21ca773f77bf 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -27,7 +27,7 @@ extern int debug_locks_off(void);
\
if (!oops_in_progress && unlikely(c)) { \
if (debug_locks_off() && !debug_locks_silent) \
- WARN_ON(1); \
+ WARN(1, "DEBUG_LOCKS_WARN_ON(%s)", #c); \
__ret = 1; \
} \
__ret; \
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ee1c244a62a1..528454c2caa9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -99,7 +99,11 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
extern int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags);
-extern int split_huge_page(struct page *page);
+extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+static inline int split_huge_page(struct page *page)
+{
+ return split_huge_page_to_list(page, NULL);
+}
extern void __split_huge_page_pmd(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd);
#define split_huge_page_pmd(__vma, __address, __pmd) \
@@ -186,6 +190,11 @@ extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vm
#define transparent_hugepage_enabled(__vma) 0
#define transparent_hugepage_flags 0UL
+static inline int
+split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+ return 0;
+}
static inline int split_huge_page(struct page *page)
{
return 0;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 16e4e9a643fb..3a62df310f2e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -58,6 +58,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(int, char *);
+void hugetlb_show_meminfo(void);
unsigned long hugetlb_total_pages(void);
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
@@ -114,6 +115,9 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
{
}
#define hugetlb_report_node_meminfo(n, buf) 0
+static inline void hugetlb_show_meminfo(void)
+{
+}
#define follow_huge_pmd(mm, addr, pmd, write) NULL
#define follow_huge_pud(mm, addr, pud, write) NULL
#define prepare_hugepage_range(file, addr, len) (-EINVAL)
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 95d0850584da..c2559847d7ee 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1318,6 +1318,17 @@ void vmbus_driver_unregister(struct hv_driver *hv_driver);
0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4, 0x40 \
}
/*
+ * Synthetic Video GUID
+ * {DA0A7802-E377-4aac-8E77-0558EB1073F8}
+ */
+#define HV_SYNTHVID_GUID \
+ .guid = { \
+ 0x02, 0x78, 0x0a, 0xda, 0x77, 0xe3, 0xac, 0x4a, \
+ 0x8e, 0x77, 0x05, 0x58, 0xeb, 0x10, 0x73, 0xf8 \
+ }
+
+
+/*
* Common header for Hyper-V ICs
*/
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 85ac9b9b72a2..89b7c24a36e9 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -192,6 +192,10 @@ extern struct resource * __request_region(struct resource *,
extern int __check_region(struct resource *, resource_size_t, resource_size_t);
extern void __release_region(struct resource *, resource_size_t,
resource_size_t);
+#ifdef CONFIG_MEMORY_HOTREMOVE
+extern int release_mem_region_adjustable(struct resource *, resource_size_t,
+ resource_size_t);
+#endif
static inline int __deprecated check_region(resource_size_t s,
resource_size_t n)
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 45e93b468878..73817af8b480 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -18,6 +18,7 @@
#include <linux/node.h>
#include <linux/compiler.h>
#include <linux/mutex.h>
+#include <linux/notifier.h>
#define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS)
@@ -114,9 +115,10 @@ extern void unregister_memory_notifier(struct notifier_block *nb);
extern int register_memory_isolate_notifier(struct notifier_block *nb);
extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
extern int register_new_memory(int, struct mem_section *);
+#ifdef CONFIG_MEMORY_HOTREMOVE
extern int unregister_memory_section(struct mem_section *);
+#endif
extern int memory_dev_init(void);
-extern int remove_memory_block(unsigned long, struct mem_section *, int);
extern int memory_notify(unsigned long val, void *v);
extern int memory_isolate_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block_hinted(struct mem_section *,
@@ -127,13 +129,18 @@ enum mem_add_context { BOOT, HOTPLUG };
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
#ifdef CONFIG_MEMORY_HOTPLUG
-#define hotplug_memory_notifier(fn, pri) { \
+#define hotplug_memory_notifier(fn, pri) ({ \
static __meminitdata struct notifier_block fn##_mem_nb =\
- { .notifier_call = fn, .priority = pri }; \
+ { .notifier_call = fn, .priority = pri };\
register_memory_notifier(&fn##_mem_nb); \
-}
+})
+#define register_hotmemory_notifier(nb) register_memory_notifier(nb)
+#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb)
#else
-#define hotplug_memory_notifier(fn, pri) do { } while (0)
+#define hotplug_memory_notifier(fn, pri) (0)
+/* These aren't inline functions due to a GCC bug. */
+#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; })
+#define unregister_hotmemory_notifier(nb) ({ (void)(nb); })
#endif
/*
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index b6a3be7d47bf..3e622c610925 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,13 +97,13 @@ extern void __online_page_free(struct page *page);
#ifdef CONFIG_MEMORY_HOTREMOVE
extern bool is_pageblock_removable_nolock(struct page *page);
extern int arch_remove_memory(u64 start, u64 size);
+extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */
/* reasonably generic interface to expand the physical pages in a zone */
extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages);
#ifdef CONFIG_NUMA
extern int memory_add_physaddr_to_nid(u64 start);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e2091b88d24c..c05d7cfbb6b9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -44,6 +44,9 @@ extern int sysctl_legacy_va_layout;
#include <asm/pgtable.h>
#include <asm/processor.h>
+extern unsigned long sysctl_user_reserve_kbytes;
+extern unsigned long sysctl_admin_reserve_kbytes;
+
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
/* to align the pointer to the (next) page boundary */
@@ -899,7 +902,8 @@ extern void pagefault_out_of_memory(void);
* Flags passed to show_mem() and show_free_areas() to suppress output in
* various contexts.
*/
-#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */
+#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
+#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */
extern void show_free_areas(unsigned int flags);
extern bool skip_free_areas_node(unsigned int flags, int nid);
@@ -1294,6 +1298,61 @@ extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
extern void free_initmem(void);
+/*
+ * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
+ * into the buddy system. The freed pages will be poisoned with pattern
+ * "poison" if it's non-zero.
+ * Return pages freed into the buddy system.
+ */
+extern unsigned long free_reserved_area(unsigned long start, unsigned long end,
+ int poison, char *s);
+#ifdef CONFIG_HIGHMEM
+/*
+ * Free a highmem page into the buddy system, adjusting totalhigh_pages
+ * and totalram_pages.
+ */
+extern void free_highmem_page(struct page *page);
+#endif
+
+static inline void adjust_managed_page_count(struct page *page, long count)
+{
+ totalram_pages += count;
+}
+
+/* Free the reserved page into the buddy system, so it gets managed. */
+static inline void __free_reserved_page(struct page *page)
+{
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+}
+
+static inline void free_reserved_page(struct page *page)
+{
+ __free_reserved_page(page);
+ adjust_managed_page_count(page, 1);
+}
+
+static inline void mark_page_reserved(struct page *page)
+{
+ SetPageReserved(page);
+ adjust_managed_page_count(page, -1);
+}
+
+/*
+ * Default method to free all the __init memory into the buddy system.
+ * The freed pages will be poisoned with pattern "poison" if it is
+ * non-zero. Return pages freed into the buddy system.
+ */
+static inline unsigned long free_initmem_default(int poison)
+{
+ extern char __init_begin[], __init_end[];
+
+ return free_reserved_area(PAGE_ALIGN((unsigned long)&__init_begin) ,
+ ((unsigned long)&__init_end) & PAGE_MASK,
+ poison, "unused kernel");
+}
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
* With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
@@ -1675,8 +1734,12 @@ int in_gate_area_no_mm(unsigned long addr);
#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);})
#endif /* __HAVE_ARCH_GATE_AREA */
+#ifdef CONFIG_SYSCTL
+extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+#endif
+
unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long nr_pages_scanned,
unsigned long lru_pages);
@@ -1704,12 +1767,12 @@ pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
void *vmemmap_alloc_block(unsigned long size, int node);
void *vmemmap_alloc_block_buf(unsigned long size, int node);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
-int vmemmap_populate_basepages(struct page *start_page,
- unsigned long pages, int node);
-int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
+int vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node);
+int vmemmap_populate(unsigned long start, unsigned long end, int node);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
-void vmemmap_free(struct page *memmap, unsigned long nr_pages);
+void vmemmap_free(unsigned long start, unsigned long end);
#endif
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
unsigned long size);
@@ -1756,5 +1819,11 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool page_is_guard(struct page *page) { return false; }
#endif /* CONFIG_DEBUG_PAGEALLOC */
+#if MAX_NUMNODES > 1
+void __init setup_nr_node_ids(void);
+#else
+static inline void setup_nr_node_ids(void) {}
+#endif
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index d65746efc954..d14a4c362465 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -47,8 +47,11 @@
* runtime initialization.
*/
+typedef int (*notifier_fn_t)(struct notifier_block *nb,
+ unsigned long action, void *data);
+
struct notifier_block {
- int (*notifier_call)(struct notifier_block *, unsigned long, void *);
+ notifier_fn_t notifier_call;
struct notifier_block __rcu *next;
int priority;
};
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0e38e13eb249..e3dea75a078b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -149,7 +149,7 @@ static inline int page_cache_get_speculative(struct page *page)
{
VM_BUG_ON(in_interrupt());
-#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
+#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic());
# endif
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 5bf5500db83d..69e37c2d1ea5 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -6,7 +6,13 @@ struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir,
extern struct dentry *ramfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data);
-#ifndef CONFIG_MMU
+#ifdef CONFIG_MMU
+static inline int
+ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+{
+ return 0;
+}
+#else
extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize);
extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long addr,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2818a123f3ea..1701ce4be746 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -236,7 +236,7 @@ extern unsigned long nr_free_pagecache_pages(void);
extern void __lru_cache_add(struct page *, enum lru_list lru);
extern void lru_cache_add_lru(struct page *, enum lru_list lru);
extern void lru_add_page_tail(struct page *page, struct page *page_tail,
- struct lruvec *lruvec);
+ struct lruvec *lruvec, struct list_head *head);
extern void activate_page(struct page *);
extern void mark_page_accessed(struct page *);
extern void lru_add_drain(void);
@@ -330,6 +330,9 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
/* linux/mm/page_io.c */
extern int swap_readpage(struct page *);
extern int swap_writepage(struct page *page, struct writeback_control *wbc);
+extern void end_swap_bio_write(struct bio *bio, int err);
+extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
+ void (*end_write_func)(struct bio *, int));
extern int swap_set_page_dirty(struct page *page);
extern void end_swap_bio_read(struct bio *bio, int err);
@@ -343,8 +346,9 @@ extern struct address_space swapper_spaces[];
#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
extern unsigned long total_swapcache_pages(void);
extern void show_swap_cache_info(void);
-extern int add_to_swap(struct page *);
+extern int add_to_swap(struct page *, struct list_head *list);
extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
+extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
extern void __delete_from_swap_cache(struct page *);
extern void delete_from_swap_cache(struct page *);
extern void free_page_and_swap_cache(struct page *);
@@ -461,7 +465,7 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp)
return NULL;
}
-static inline int add_to_swap(struct page *page)
+static inline int add_to_swap(struct page *page, struct list_head *list)
{
return 0;
}
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6071e911c7f4..7d5773a99f20 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -3,7 +3,9 @@
#include <linux/spinlock.h>
#include <linux/init.h>
+#include <linux/list.h>
#include <asm/page.h> /* pgprot_t */
+#include <linux/rbtree.h>
struct vm_area_struct; /* vma defining user mapping in mm_types.h */
@@ -35,6 +37,17 @@ struct vm_struct {
const void *caller;
};
+struct vmap_area {
+ unsigned long va_start;
+ unsigned long va_end;
+ unsigned long flags;
+ struct rb_node rb_node; /* address sorted rbtree */
+ struct list_head list; /* address sorted list */
+ struct list_head purge_list; /* "lazy purge" list */
+ struct vm_struct *vm;
+ struct rcu_head rcu_head;
+};
+
/*
* Highlevel APIs for driver use
*/
@@ -130,8 +143,7 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
/*
* Internals. Dont't use..
*/
-extern rwlock_t vmlist_lock;
-extern struct vm_struct *vmlist;
+extern struct list_head vmap_area_list;
extern __init void vm_area_add_early(struct vm_struct *vm);
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
@@ -158,4 +170,22 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
# endif
#endif
+struct vmalloc_info {
+ unsigned long used;
+ unsigned long largest_chunk;
+};
+
+#ifdef CONFIG_MMU
+#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
+extern void get_vmalloc_info(struct vmalloc_info *vmi);
+#else
+
+#define VMALLOC_TOTAL 0UL
+#define get_vmalloc_info(vmi) \
+do { \
+ (vmi)->used = 0; \
+ (vmi)->largest_chunk = 0; \
+} while (0)
+#endif
+
#endif /* _LINUX_VMALLOC_H */
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
new file mode 100644
index 000000000000..76be077340ea
--- /dev/null
+++ b/include/linux/vmpressure.h
@@ -0,0 +1,47 @@
+#ifndef __LINUX_VMPRESSURE_H
+#define __LINUX_VMPRESSURE_H
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/cgroup.h>
+
+struct vmpressure {
+ unsigned long scanned;
+ unsigned long reclaimed;
+ /* The lock is used to keep the scanned/reclaimed above in sync. */
+ struct mutex sr_lock;
+
+ /* The list of vmpressure_event structs. */
+ struct list_head events;
+ /* Have to grab the lock on events traversal or modifications. */
+ struct mutex events_lock;
+
+ struct work_struct work;
+};
+
+struct mem_cgroup;
+
+#ifdef CONFIG_MEMCG
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+ unsigned long scanned, unsigned long reclaimed);
+extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
+
+extern void vmpressure_init(struct vmpressure *vmpr);
+extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
+extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
+extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+ struct eventfd_ctx *eventfd,
+ const char *args);
+extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+ struct eventfd_ctx *eventfd);
+#else
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+ unsigned long scanned, unsigned long reclaimed) {}
+static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
+ int prio) {}
+#endif /* CONFIG_MEMCG */
+#endif /* __LINUX_VMPRESSURE_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 5fd71a7d0dfd..c586679b6fef 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -48,13 +48,8 @@ static inline void count_vm_events(enum vm_event_item item, long delta)
}
extern void all_vm_events(unsigned long *);
-#ifdef CONFIG_HOTPLUG
+
extern void vm_events_fold_cpu(int cpu);
-#else
-static inline void vm_events_fold_cpu(int cpu)
-{
-}
-#endif
#else
diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h
new file mode 100644
index 000000000000..0421f49a20f7
--- /dev/null
+++ b/include/trace/events/filemap.h
@@ -0,0 +1,58 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM filemap
+
+#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FILEMAP_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/device.h>
+#include <linux/kdev_t.h>
+
+DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,
+
+ TP_PROTO(struct page *page),
+
+ TP_ARGS(page),
+
+ TP_STRUCT__entry(
+ __field(struct page *, page)
+ __field(unsigned long, i_ino)
+ __field(unsigned long, index)
+ __field(dev_t, s_dev)
+ ),
+
+ TP_fast_assign(
+ __entry->page = page;
+ __entry->i_ino = page->mapping->host->i_ino;
+ __entry->index = page->index;
+ if (page->mapping->host->i_sb)
+ __entry->s_dev = page->mapping->host->i_sb->s_dev;
+ else
+ __entry->s_dev = page->mapping->host->i_rdev;
+ ),
+
+ TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu",
+ MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
+ __entry->i_ino,
+ __entry->page,
+ page_to_pfn(__entry->page),
+ __entry->index << PAGE_SHIFT)
+);
+
+DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache,
+ TP_PROTO(struct page *page),
+ TP_ARGS(page)
+ );
+
+DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
+ TP_PROTO(struct page *page),
+ TP_ARGS(page)
+ );
+
+#endif /* _TRACE_FILEMAP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index c7fc1e6517c3..a4ed56cf0eac 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -88,7 +88,6 @@ struct inodes_stat_t {
#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
/* These sb flags are internal to the kernel */
-#define MS_SNAP_STABLE (1<<27) /* Snapshot pages during writeback, if needed */
#define MS_NOSEC (1<<28)
#define MS_BORN (1<<29)
#define MS_ACTIVE (1<<30)
diff --git a/ipc/util.c b/ipc/util.c
index 464a8abd779f..03eadd8fb0fd 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -23,6 +23,7 @@
#include <linux/msg.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/notifier.h>
#include <linux/capability.h>
#include <linux/highuid.h>
#include <linux/security.h>
@@ -47,19 +48,16 @@ struct ipc_proc_iface {
int (*show)(struct seq_file *, void *);
};
-#ifdef CONFIG_MEMORY_HOTPLUG
-
static void ipc_memory_notifier(struct work_struct *work)
{
ipcns_notify(IPCNS_MEMCHANGED);
}
-static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier);
-
-
static int ipc_memory_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
+ static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier);
+
switch (action) {
case MEM_ONLINE: /* memory successfully brought online */
case MEM_OFFLINE: /* or offline: it's time to recompute msgmni */
@@ -85,7 +83,10 @@ static int ipc_memory_callback(struct notifier_block *self,
return NOTIFY_OK;
}
-#endif /* CONFIG_MEMORY_HOTPLUG */
+static struct notifier_block ipc_memory_nb = {
+ .notifier_call = ipc_memory_callback,
+ .priority = IPC_CALLBACK_PRI,
+};
/**
* ipc_init - initialise IPC subsystem
@@ -102,7 +103,7 @@ static int __init ipc_init(void)
sem_init();
msg_init();
shm_init();
- hotplug_memory_notifier(ipc_memory_callback, IPC_CALLBACK_PRI);
+ register_hotmemory_notifier(&ipc_memory_nb);
register_ipcns_notifier(&init_ipc_ns);
return 0;
}
diff --git a/kernel/audit.c b/kernel/audit.c
index d596e5355f15..9816a1b96cfc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* As soon as there's any sign of userspace auditd,
* start kauditd to talk to it */
- if (!kauditd_task)
+ if (!kauditd_task) {
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
}
-
loginuid = audit_get_loginuid(current);
sessionid = audit_get_sessionid(current);
security_task_getsecid(current, &sid);
diff --git a/kernel/audit.h b/kernel/audit.h
index d51cba868e1b..11468d99dad0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -59,10 +59,7 @@ struct audit_entry {
struct audit_krule rule;
};
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
extern int audit_ever_enabled;
-#endif
extern int audit_pid;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d6..a291aa23fb3f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -617,9 +617,9 @@ void audit_trim_trees(void)
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f9fc54bbe06f..267436826c3b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -594,6 +594,10 @@ exit_nofree:
return entry;
exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f857a0a9..c68229411a7c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context)
}
}
-static inline void audit_zero_context(struct audit_context *context,
- enum audit_state state)
-{
- memset(context, 0, sizeof(*context));
- context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
struct audit_context *context;
- if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
return NULL;
- audit_zero_context(context, state);
+ context->state = state;
+ context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
return context;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a32f9432666c..dfaf50d4705e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5416,55 +5416,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
}
EXPORT_SYMBOL_GPL(css_lookup);
-/**
- * css_get_next - lookup next cgroup under specified hierarchy.
- * @ss: pointer to subsystem
- * @id: current position of iteration.
- * @root: pointer to css. search tree under this.
- * @foundid: position of found object.
- *
- * Search next css under the specified hierarchy of rootid. Calling under
- * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
- */
-struct cgroup_subsys_state *
-css_get_next(struct cgroup_subsys *ss, int id,
- struct cgroup_subsys_state *root, int *foundid)
-{
- struct cgroup_subsys_state *ret = NULL;
- struct css_id *tmp;
- int tmpid;
- int rootid = css_id(root);
- int depth = css_depth(root);
-
- if (!rootid)
- return NULL;
-
- BUG_ON(!ss->use_id);
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- /* fill start point for scan */
- tmpid = id;
- while (1) {
- /*
- * scan next entry from bitmap(tree), tmpid is updated after
- * idr_get_next().
- */
- tmp = idr_get_next(&ss->idr, &tmpid);
- if (!tmp)
- break;
- if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
- ret = rcu_dereference(tmp->css);
- if (ret) {
- *foundid = tmpid;
- break;
- }
- }
- /* continue to scan from next id */
- tmpid = tmpid + 1;
- }
- return ret;
-}
-
/*
* get corresponding css from file open on cgroupfs directory
*/
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4f9dfe43ecbd..334d983a36b2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2251,7 +2251,6 @@ void cpuset_update_active_cpus(bool cpu_online)
schedule_work(&cpuset_hotplug_work);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2263,20 +2262,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
-#endif
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??! */
+};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
- **/
-
+ */
void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_MEMORY];
- hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
cpuset_propagate_hotplug_wq =
alloc_ordered_workqueue("cpuset_hotplug", 0);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ffd4e111fd67..b574920cbd4b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
{
unsigned long addr;
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
- init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
- free_page((unsigned long)__va(addr));
- totalram_pages++;
- }
+ for (addr = begin; addr < end; addr += PAGE_SIZE)
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}
int crash_shrink_memory(unsigned long new_size)
@@ -1581,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(swapper_pg_dir);
#endif
VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmlist);
+ VMCOREINFO_SYMBOL(vmap_area_list);
#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(mem_map);
@@ -1619,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(free_area, free_list);
VMCOREINFO_OFFSET(list_head, next);
VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vm_struct, addr);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9eb7fed0bbaa..9b12d65186f7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -52,8 +52,21 @@ enum KTHREAD_BITS {
KTHREAD_IS_PARKED,
};
-#define to_kthread(tsk) \
- container_of((tsk)->vfork_done, struct kthread, exited)
+#define __to_kthread(vfork) \
+ container_of(vfork, struct kthread, exited)
+
+static inline struct kthread *to_kthread(struct task_struct *k)
+{
+ return __to_kthread(k->vfork_done);
+}
+
+static struct kthread *to_live_kthread(struct task_struct *k)
+{
+ struct completion *vfork = ACCESS_ONCE(k->vfork_done);
+ if (likely(vfork))
+ return __to_kthread(vfork);
+ return NULL;
+}
/**
* kthread_should_stop - should this kthread return now?
@@ -311,19 +324,6 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
return p;
}
-static struct kthread *task_get_live_kthread(struct task_struct *k)
-{
- struct kthread *kthread;
-
- get_task_struct(k);
- kthread = to_kthread(k);
- /* It might have exited */
- barrier();
- if (k->vfork_done != NULL)
- return kthread;
- return NULL;
-}
-
static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
{
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
@@ -350,11 +350,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
*/
void kthread_unpark(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread = to_live_kthread(k);
if (kthread)
__kthread_unpark(k, kthread);
- put_task_struct(k);
}
/**
@@ -371,7 +370,7 @@ void kthread_unpark(struct task_struct *k)
*/
int kthread_park(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread = to_live_kthread(k);
int ret = -ENOSYS;
if (kthread) {
@@ -384,7 +383,6 @@ int kthread_park(struct task_struct *k)
}
ret = 0;
}
- put_task_struct(k);
return ret;
}
@@ -405,10 +403,13 @@ int kthread_park(struct task_struct *k)
*/
int kthread_stop(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread;
int ret;
trace_sched_kthread_stop(k);
+
+ get_task_struct(k);
+ kthread = to_live_kthread(k);
if (kthread) {
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
__kthread_unpark(k, kthread);
@@ -416,10 +417,9 @@ int kthread_stop(struct task_struct *k)
wait_for_completion(&kthread->exited);
}
ret = k->exit_code;
-
put_task_struct(k);
- trace_sched_kthread_stop_ret(ret);
+ trace_sched_kthread_stop_ret(ret);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b9..d7386986e10e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -21,6 +21,7 @@
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
+#include <linux/mm.h>
#include <asm/io.h>
@@ -50,6 +51,14 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
+/*
+ * For memory hotplug, there is no way to free resource entries allocated
+ * by boot mem after the system is up. So for reusing the resource entry
+ * we need to remember the resource.
+ */
+static struct resource *bootmem_resource_free;
+static DEFINE_SPINLOCK(bootmem_resource_lock);
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -151,6 +160,40 @@ __initcall(ioresources_init);
#endif /* CONFIG_PROC_FS */
+static void free_resource(struct resource *res)
+{
+ if (!res)
+ return;
+
+ if (!PageSlab(virt_to_head_page(res))) {
+ spin_lock(&bootmem_resource_lock);
+ res->sibling = bootmem_resource_free;
+ bootmem_resource_free = res;
+ spin_unlock(&bootmem_resource_lock);
+ } else {
+ kfree(res);
+ }
+}
+
+static struct resource *alloc_resource(gfp_t flags)
+{
+ struct resource *res = NULL;
+
+ spin_lock(&bootmem_resource_lock);
+ if (bootmem_resource_free) {
+ res = bootmem_resource_free;
+ bootmem_resource_free = res->sibling;
+ }
+ spin_unlock(&bootmem_resource_lock);
+
+ if (res)
+ memset(res, 0, sizeof(struct resource));
+ else
+ res = kzalloc(sizeof(struct resource), flags);
+
+ return res;
+}
+
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
write_unlock(&resource_lock);
}
-/**
- * adjust_resource - modify a resource's start and size
- * @res: resource to modify
- * @start: new start value
- * @size: new size
- *
- * Given an existing resource, change its start and size to match the
- * arguments. Returns 0 on success, -EBUSY if it can't fit.
- * Existing children of the resource are assumed to be immutable.
- */
-int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
+static int __adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
{
struct resource *tmp, *parent = res->parent;
resource_size_t end = start + size - 1;
int result = -EBUSY;
- write_lock(&resource_lock);
-
if (!parent)
goto skip;
@@ -751,6 +783,26 @@ skip:
result = 0;
out:
+ return result;
+}
+
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
+ * Given an existing resource, change its start and size to match the
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
+ */
+int adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
+{
+ int result;
+
+ write_lock(&resource_lock);
+ result = __adjust_resource(res, start, size);
write_unlock(&resource_lock);
return result;
}
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,
{
struct resource *parent = root;
struct resource *conflict;
- struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+ struct resource *res = alloc_resource(GFP_ATOMIC);
struct resource *next_res = NULL;
if (!res)
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,
/* conflict covered whole area */
if (conflict->start <= res->start &&
conflict->end >= res->end) {
- kfree(res);
+ free_resource(res);
WARN_ON(next_res);
break;
}
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,
end = res->end;
res->end = conflict->start - 1;
if (conflict->end < end) {
- next_res = kzalloc(sizeof(*next_res),
- GFP_ATOMIC);
+ next_res = alloc_resource(GFP_ATOMIC);
if (!next_res) {
- kfree(res);
+ free_resource(res);
break;
}
next_res->name = name;
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
- struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+ struct resource *res = alloc_resource(GFP_KERNEL);
if (!res)
return NULL;
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,
continue;
}
/* Uhhuh, that didn't work out.. */
- kfree(res);
+ free_resource(res);
res = NULL;
break;
}
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,
return -EBUSY;
release_resource(res);
- kfree(res);
+ free_resource(res);
return 0;
}
EXPORT_SYMBOL(__check_region);
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
if (res->flags & IORESOURCE_MUXED)
wake_up(&muxed_resource_wait);
- kfree(res);
+ free_resource(res);
return;
}
p = &res->sibling;
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,
}
EXPORT_SYMBOL(__release_region);
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/**
+ * release_mem_region_adjustable - release a previously reserved memory region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @size: resource region size
+ *
+ * This interface is intended for memory hot-delete. The requested region
+ * is released from a currently busy memory resource. The requested region
+ * must either match exactly or fit into a single busy resource entry. In
+ * the latter case, the remaining resource is adjusted accordingly.
+ * Existing children of the busy memory resource must be immutable in the
+ * request.
+ *
+ * Note:
+ * - Additional release conditions, such as overlapping region, can be
+ * supported after they are confirmed as valid cases.
+ * - When a busy memory resource gets split into two entries, the code
+ * assumes that all children remain in the lower address entry for
+ * simplicity. Enhance this logic when necessary.
+ */
+int release_mem_region_adjustable(struct resource *parent,
+ resource_size_t start, resource_size_t size)
+{
+ struct resource **p;
+ struct resource *res;
+ struct resource *new_res;
+ resource_size_t end;
+ int ret = -EINVAL;
+
+ end = start + size - 1;
+ if ((start < parent->start) || (end > parent->end))
+ return ret;
+
+ /* The alloc_resource() result gets checked later */
+ new_res = alloc_resource(GFP_KERNEL);
+
+ p = &parent->child;
+ write_lock(&resource_lock);
+
+ while ((res = *p)) {
+ if (res->start >= end)
+ break;
+
+ /* look for the next resource if it does not fit into */
+ if (res->start > start || res->end < end) {
+ p = &res->sibling;
+ continue;
+ }
+
+ if (!(res->flags & IORESOURCE_MEM))
+ break;
+
+ if (!(res->flags & IORESOURCE_BUSY)) {
+ p = &res->child;
+ continue;
+ }
+
+ /* found the target resource; let's adjust accordingly */
+ if (res->start == start && res->end == end) {
+ /* free the whole entry */
+ *p = res->sibling;
+ free_resource(res);
+ ret = 0;
+ } else if (res->start == start && res->end != end) {
+ /* adjust the start */
+ ret = __adjust_resource(res, end + 1,
+ res->end - end);
+ } else if (res->start != start && res->end == end) {
+ /* adjust the end */
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ } else {
+ /* split into two entries */
+ if (!new_res) {
+ ret = -ENOMEM;
+ break;
+ }
+ new_res->name = res->name;
+ new_res->start = end + 1;
+ new_res->end = res->end;
+ new_res->flags = res->flags;
+ new_res->parent = res->parent;
+ new_res->sibling = res->sibling;
+ new_res->child = NULL;
+
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ if (ret)
+ break;
+ res->sibling = new_res;
+ new_res = NULL;
+ }
+
+ break;
+ }
+
+ write_unlock(&resource_lock);
+ free_resource(new_res);
+ return ret;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
/*
* Managed region resource
*/
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afc1dc60f3f8..9edcf456e0fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
extern int pid_max_min, pid_max_max;
-extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
extern int latencytop_enabled;
@@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "user_reserve_kbytes",
+ .data = &sysctl_user_reserve_kbytes,
+ .maxlen = sizeof(sysctl_user_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "admin_reserve_kbytes",
+ .data = &sysctl_admin_reserve_kbytes,
+ .maxlen = sizeof(sysctl_admin_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{ }
};
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 4407f8c9b1f7..b7c72311ad0c 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -18,6 +18,9 @@ void show_mem(unsigned int filter)
printk("Mem-Info:\n");
show_free_areas(filter);
+ if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
+ return;
+
for_each_online_pgdat(pgdat) {
unsigned long i, flags;
diff --git a/mm/Kconfig b/mm/Kconfig
index 3bea74f1ccfe..e742d06285b7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -263,8 +263,14 @@ config ZONE_DMA_FLAG
default "1"
config BOUNCE
- def_bool y
+ bool "Enable bounce buffers"
+ default y
depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+ help
+ Enable bounce buffers for devices that cannot access
+ the full range of memory available to the CPU. Enabled
+ by default when ZONE_DMA or HIGHMEM is selected, but you
+ may say n to override this.
# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
# have more than 4GB of memory, but we don't currently use the IOTLB to present
diff --git a/mm/Makefile b/mm/Makefile
index 3a4628751f89..72c5acb9345f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 5f8901768602..a5c2ec3589cb 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -181,32 +181,13 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
#ifdef CONFIG_NEED_BOUNCE_POOL
static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
{
- struct page *page;
- struct backing_dev_info *bdi;
- struct address_space *mapping;
- struct bio_vec *from;
- int i;
-
if (bio_data_dir(bio) != WRITE)
return 0;
if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
return 0;
- /*
- * Based on the first page that has a valid mapping, decide whether or
- * not we have to employ bounce buffering to guarantee stable pages.
- */
- bio_for_each_segment(from, bio, i) {
- page = from->bv_page;
- mapping = page_mapping(page);
- if (!mapping)
- continue;
- bdi = mapping->backing_dev_info;
- return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
- }
-
- return 0;
+ return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
}
#else
static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
diff --git a/mm/filemap.c b/mm/filemap.c
index e1979fdca805..e989fb1eaa72 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,9 @@
#include <linux/cleancache.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/filemap.h>
+
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
@@ -113,6 +116,7 @@ void __delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ trace_mm_filemap_delete_from_page_cache(page);
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
@@ -184,6 +188,17 @@ static int sleep_on_page_killable(void *word)
return fatal_signal_pending(current) ? -EINTR : 0;
}
+static int filemap_check_errors(struct address_space *mapping)
+{
+ int ret = 0;
+ /* Check for outstanding write errors */
+ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ ret = -ENOSPC;
+ if (test_and_clear_bit(AS_EIO, &mapping->flags))
+ ret = -EIO;
+ return ret;
+}
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -265,10 +280,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
struct pagevec pvec;
int nr_pages;
- int ret = 0;
+ int ret2, ret = 0;
if (end_byte < start_byte)
- return 0;
+ goto out;
pagevec_init(&pvec, 0);
while ((index <= end) &&
@@ -291,12 +306,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
pagevec_release(&pvec);
cond_resched();
}
-
- /* Check for outstanding write errors */
- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
- ret = -ENOSPC;
- if (test_and_clear_bit(AS_EIO, &mapping->flags))
- ret = -EIO;
+out:
+ ret2 = filemap_check_errors(mapping);
+ if (!ret)
+ ret = ret2;
return ret;
}
@@ -337,6 +350,8 @@ int filemap_write_and_wait(struct address_space *mapping)
if (!err)
err = err2;
}
+ } else {
+ err = filemap_check_errors(mapping);
}
return err;
}
@@ -368,6 +383,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
if (!err)
err = err2;
}
+ } else {
+ err = filemap_check_errors(mapping);
}
return err;
}
@@ -464,6 +481,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
+ trace_mm_filemap_add_to_page_cache(page);
} else {
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2f7f5aaaafb..03a89a2f464b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -163,35 +163,34 @@ static int start_khugepaged(void)
}
static atomic_t huge_zero_refcount;
-static unsigned long huge_zero_pfn __read_mostly;
+static struct page *huge_zero_page __read_mostly;
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static inline bool is_huge_zero_page(struct page *page)
{
- unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
- return zero_pfn && pfn == zero_pfn;
+ return ACCESS_ONCE(huge_zero_page) == page;
}
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
- return is_huge_zero_pfn(pmd_pfn(pmd));
+ return is_huge_zero_page(pmd_page(pmd));
}
-static unsigned long get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
{
struct page *zero_page;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return ACCESS_ONCE(huge_zero_pfn);
+ return ACCESS_ONCE(huge_zero_page);
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!zero_page) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
- return 0;
+ return NULL;
}
count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
- if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
preempt_enable();
__free_page(zero_page);
goto retry;
@@ -200,7 +199,7 @@ retry:
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
- return ACCESS_ONCE(huge_zero_pfn);
+ return ACCESS_ONCE(huge_zero_page);
}
static void put_huge_zero_page(void)
@@ -220,9 +219,9 @@ static int shrink_huge_zero_page(struct shrinker *shrink,
return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
- unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
- BUG_ON(zero_pfn == 0);
- __free_page(__pfn_to_page(zero_pfn));
+ struct page *zero_page = xchg(&huge_zero_page, NULL);
+ BUG_ON(zero_page == NULL);
+ __free_page(zero_page);
}
return 0;
@@ -713,6 +712,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
return VM_FAULT_OOM;
clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * clear_huge_page writes become visible before the set_pmd_at()
+ * write.
+ */
__SetPageUptodate(page);
spin_lock(&mm->page_table_lock);
@@ -724,12 +728,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
} else {
pmd_t entry;
entry = mk_huge_pmd(page, vma);
- /*
- * The spinlocking to take the lru_lock inside
- * page_add_new_anon_rmap() acts as a full memory
- * barrier to be sure clear_huge_page writes become
- * visible after the set_pmd_at() write.
- */
page_add_new_anon_rmap(page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
pgtable_trans_huge_deposit(mm, pgtable);
@@ -765,12 +763,12 @@ static inline struct page *alloc_hugepage(int defrag)
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
- unsigned long zero_pfn)
+ struct page *zero_page)
{
pmd_t entry;
if (!pmd_none(*pmd))
return false;
- entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
set_pmd_at(mm, haddr, pmd, entry);
@@ -795,20 +793,20 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!(flags & FAULT_FLAG_WRITE) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
- unsigned long zero_pfn;
+ struct page *zero_page;
bool set;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
- zero_pfn = get_huge_zero_page();
- if (unlikely(!zero_pfn)) {
+ zero_page = get_huge_zero_page();
+ if (unlikely(!zero_page)) {
pte_free(mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
spin_lock(&mm->page_table_lock);
set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
- zero_pfn);
+ zero_page);
spin_unlock(&mm->page_table_lock);
if (!set) {
pte_free(mm, pgtable);
@@ -887,16 +885,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- unsigned long zero_pfn;
+ struct page *zero_page;
bool set;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_pfn = get_huge_zero_page();
+ zero_page = get_huge_zero_page();
set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
- zero_pfn);
+ zero_page);
BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
ret = 0;
goto out_unlock;
@@ -1560,7 +1558,8 @@ static int __split_huge_page_splitting(struct page *page,
return ret;
}
-static void __split_huge_page_refcount(struct page *page)
+static void __split_huge_page_refcount(struct page *page,
+ struct list_head *list)
{
int i;
struct zone *zone = page_zone(page);
@@ -1646,7 +1645,7 @@ static void __split_huge_page_refcount(struct page *page)
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
- lru_add_page_tail(page, page_tail, lruvec);
+ lru_add_page_tail(page, page_tail, lruvec, list);
}
atomic_sub(tail_count, &page->_count);
BUG_ON(atomic_read(&page->_count) <= 0);
@@ -1753,7 +1752,8 @@ static int __split_huge_page_map(struct page *page,
/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
- struct anon_vma *anon_vma)
+ struct anon_vma *anon_vma,
+ struct list_head *list)
{
int mapcount, mapcount2;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1784,7 +1784,7 @@ static void __split_huge_page(struct page *page,
mapcount, page_mapcount(page));
BUG_ON(mapcount != page_mapcount(page));
- __split_huge_page_refcount(page);
+ __split_huge_page_refcount(page, list);
mapcount2 = 0;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
@@ -1799,12 +1799,19 @@ static void __split_huge_page(struct page *page,
BUG_ON(mapcount != mapcount2);
}
-int split_huge_page(struct page *page)
+/*
+ * Split a hugepage into normal pages. This doesn't change the position of head
+ * page. If @list is null, tail pages will be added to LRU list, otherwise, to
+ * @list. Both head page and tail pages will inherit mapping, flags, and so on
+ * from the hugepage.
+ * Return 0 if the hugepage is split successfully otherwise return 1.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct anon_vma *anon_vma;
int ret = 1;
- BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
+ BUG_ON(is_huge_zero_page(page));
BUG_ON(!PageAnon(page));
/*
@@ -1824,7 +1831,7 @@ int split_huge_page(struct page *page)
goto out_unlock;
BUG_ON(!PageSwapBacked(page));
- __split_huge_page(page, anon_vma);
+ __split_huge_page(page, anon_vma, list);
count_vm_event(THP_SPLIT);
BUG_ON(PageCompound(page));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1a12f5b9a0ab..9b9aeef8e590 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2121,6 +2121,21 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
nid, h->surplus_huge_pages_node[nid]);
}
+void hugetlb_show_meminfo(void)
+{
+ struct hstate *h;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ for_each_hstate(h)
+ pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+ nid,
+ h->nr_huge_pages_node[nid],
+ h->free_huge_pages_node[nid],
+ h->surplus_huge_pages_node[nid],
+ 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+}
+
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
@@ -2247,10 +2262,11 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
pte_t entry;
if (writable) {
- entry =
- pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
+ vma->vm_page_prot)));
} else {
- entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ entry = huge_pte_wrprotect(mk_huge_pte(page,
+ vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
@@ -2264,7 +2280,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
{
pte_t entry;
- entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
}
@@ -2379,7 +2395,7 @@ again:
* HWPoisoned hugepage is already unmapped and dropped reference
*/
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
- pte_clear(mm, address, ptep);
+ huge_pte_clear(mm, address, ptep);
continue;
}
@@ -2403,7 +2419,7 @@ again:
pte = huge_ptep_get_and_clear(mm, address, ptep);
tlb_remove_tlb_entry(tlb, ptep, address);
- if (pte_dirty(pte))
+ if (huge_pte_dirty(pte))
set_page_dirty(page);
page_remove_rmap(page);
@@ -2856,7 +2872,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* page now as it is used to determine if a reservation has been
* consumed.
*/
- if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
+ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
if (vma_needs_reservation(h, vma, address) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
@@ -2886,12 +2902,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
- if (!pte_write(entry)) {
+ if (!huge_pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep, entry,
pagecache_page);
goto out_page_table_lock;
}
- entry = pte_mkdirty(entry);
+ entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, address, ptep, entry,
@@ -2972,7 +2988,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* directly from any kind of swap entries.
*/
if (absent || is_swap_pte(huge_ptep_get(pte)) ||
- ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
+ ((flags & FOLL_WRITE) &&
+ !huge_pte_write(huge_ptep_get(pte)))) {
int ret;
spin_unlock(&mm->page_table_lock);
@@ -3042,7 +3059,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(pte_modify(pte, newprot));
+ pte = pte_mkhuge(huge_pte_modify(pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
set_huge_pte_at(mm, address, ptep, pte);
pages++;
diff --git a/mm/madvise.c b/mm/madvise.c
index c58c94b56c3d..7055883e6e25 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -473,27 +473,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
if (!madvise_behavior_valid(behavior))
return error;
- write = madvise_need_mmap_write(behavior);
- if (write)
- down_write(&current->mm->mmap_sem);
- else
- down_read(&current->mm->mmap_sem);
-
if (start & ~PAGE_MASK)
- goto out;
+ return error;
len = (len_in + ~PAGE_MASK) & PAGE_MASK;
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- goto out;
+ return error;
end = start + len;
if (end < start)
- goto out;
+ return error;
error = 0;
if (end == start)
- goto out;
+ return error;
+
+ write = madvise_need_mmap_write(behavior);
+ if (write)
+ down_write(&current->mm->mmap_sem);
+ else
+ down_read(&current->mm->mmap_sem);
/*
* If the interval [start,end) covers some unmapped address
@@ -509,14 +509,14 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
/* Still start < end. */
error = -ENOMEM;
if (!vma)
- goto out_plug;
+ goto out;
/* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
if (start >= end)
- goto out_plug;
+ goto out;
}
/* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -527,21 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
- goto out_plug;
+ goto out;
start = tmp;
if (prev && start < prev->vm_end)
start = prev->vm_end;
error = unmapped_error;
if (start >= end)
- goto out_plug;
+ goto out;
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_sem */
vma = find_vma(current->mm, start);
}
-out_plug:
- blk_finish_plug(&plug);
out:
+ blk_finish_plug(&plug);
if (write)
up_write(&current->mm->mmap_sem);
else
diff --git a/mm/memblock.c b/mm/memblock.c
index b8d9147e5c08..c5fad932fa51 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -322,10 +322,11 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
/**
* memblock_insert_region - insert new memblock region
- * @type: memblock type to insert into
- * @idx: index for the insertion point
- * @base: base address of the new region
- * @size: size of the new region
+ * @type: memblock type to insert into
+ * @idx: index for the insertion point
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: node id of the new region
*
* Insert new memblock region [@base,@base+@size) into @type at @idx.
* @type must already have extra room to accomodate the new region.
@@ -771,6 +772,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
{
phys_addr_t found;
+ if (WARN_ON(!align))
+ align = __alignof__(long long);
+
/* align @size to avoid excessive fragmentation on reserved array */
size = round_up(size, align);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2b552224f5cf..b8dc8e4cbf6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
+#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
@@ -152,8 +153,13 @@ struct mem_cgroup_stat_cpu {
};
struct mem_cgroup_reclaim_iter {
- /* css_id of the last scanned hierarchy member */
- int position;
+ /*
+ * last scanned hierarchy member. Valid only if last_dead_count
+ * matches memcg->dead_count of the hierarchy root group.
+ */
+ struct mem_cgroup *last_visited;
+ unsigned long last_dead_count;
+
/* scan generation, increased every round-trip */
unsigned int generation;
};
@@ -256,6 +262,9 @@ struct mem_cgroup {
*/
struct res_counter res;
+ /* vmpressure notifications */
+ struct vmpressure vmpressure;
+
union {
/*
* the counter to account for mem+swap usage.
@@ -335,6 +344,7 @@ struct mem_cgroup {
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
+ atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
#endif
@@ -353,6 +363,7 @@ struct mem_cgroup {
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
+
/*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
@@ -504,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
return container_of(s, struct mem_cgroup, css);
}
+/* Some nice accessors for the vmpressure. */
+struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
+{
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ return &memcg->vmpressure;
+}
+
+struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+{
+ return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+}
+
+struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
+{
+ return &mem_cgroup_from_css(css)->vmpressure;
+}
+
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
@@ -1067,6 +1096,51 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return memcg;
}
+/*
+ * Returns a next (in a pre-order walk) alive memcg (with elevated css
+ * ref. count) or NULL if the whole root's subtree has been visited.
+ *
+ * helper function to be used by mem_cgroup_iter
+ */
+static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
+ struct mem_cgroup *last_visited)
+{
+ struct cgroup *prev_cgroup, *next_cgroup;
+
+ /*
+ * Root is not visited by cgroup iterators so it needs an
+ * explicit visit.
+ */
+ if (!last_visited)
+ return root;
+
+ prev_cgroup = (last_visited == root) ? NULL
+ : last_visited->css.cgroup;
+skip_node:
+ next_cgroup = cgroup_next_descendant_pre(
+ prev_cgroup, root->css.cgroup);
+
+ /*
+ * Even if we found a group we have to make sure it is
+ * alive. css && !memcg means that the groups should be
+ * skipped and we should continue the tree walk.
+ * last_visited css is safe to use because it is
+ * protected by css_get and the tree walk is rcu safe.
+ */
+ if (next_cgroup) {
+ struct mem_cgroup *mem = mem_cgroup_from_cont(
+ next_cgroup);
+ if (css_tryget(&mem->css))
+ return mem;
+ else {
+ prev_cgroup = next_cgroup;
+ goto skip_node;
+ }
+ }
+
+ return NULL;
+}
+
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -1089,7 +1163,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup *memcg = NULL;
- int id = 0;
+ struct mem_cgroup *last_visited = NULL;
+ unsigned long uninitialized_var(dead_count);
if (mem_cgroup_disabled())
return NULL;
@@ -1098,20 +1173,17 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
root = root_mem_cgroup;
if (prev && !reclaim)
- id = css_id(&prev->css);
-
- if (prev && prev != root)
- css_put(&prev->css);
+ last_visited = prev;
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
- return NULL;
+ goto out_css_put;
return root;
}
+ rcu_read_lock();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
- struct cgroup_subsys_state *css;
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
@@ -1120,31 +1192,60 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
- if (prev && reclaim->generation != iter->generation)
- return NULL;
- id = iter->position;
+ last_visited = iter->last_visited;
+ if (prev && reclaim->generation != iter->generation) {
+ iter->last_visited = NULL;
+ goto out_unlock;
+ }
+
+ /*
+ * If the dead_count mismatches, a destruction
+ * has happened or is happening concurrently.
+ * If the dead_count matches, a destruction
+ * might still happen concurrently, but since
+ * we checked under RCU, that destruction
+ * won't free the object until we release the
+ * RCU reader lock. Thus, the dead_count
+ * check verifies the pointer is still valid,
+ * css_tryget() verifies the cgroup pointed to
+ * is alive.
+ */
+ dead_count = atomic_read(&root->dead_count);
+ smp_rmb();
+ last_visited = iter->last_visited;
+ if (last_visited) {
+ if ((dead_count != iter->last_dead_count) ||
+ !css_tryget(&last_visited->css)) {
+ last_visited = NULL;
+ }
+ }
}
- rcu_read_lock();
- css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
- if (css) {
- if (css == &root->css || css_tryget(css))
- memcg = mem_cgroup_from_css(css);
- } else
- id = 0;
- rcu_read_unlock();
+ memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- iter->position = id;
- if (!css)
+ if (last_visited)
+ css_put(&last_visited->css);
+
+ iter->last_visited = memcg;
+ smp_wmb();
+ iter->last_dead_count = dead_count;
+
+ if (!memcg)
iter->generation++;
else if (!prev && memcg)
reclaim->generation = iter->generation;
}
- if (prev && !css)
- return NULL;
+ if (prev && !memcg)
+ goto out_unlock;
}
+out_unlock:
+ rcu_read_unlock();
+out_css_put:
+ if (prev && prev != root)
+ css_put(&prev->css);
+
return memcg;
}
@@ -1686,11 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct task_struct *chosen = NULL;
/*
- * If current has a pending SIGKILL, then automatically select it. The
- * goal is to allow it to allocate so that it may quickly exit and free
- * its memory.
+ * If current has a pending SIGKILL or is exiting, then automatically
+ * select it. The goal is to allow it to allocate so that it may
+ * quickly exit and free its memory.
*/
- if (fatal_signal_pending(current)) {
+ if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
set_thread_flag(TIF_MEMDIE);
return;
}
@@ -3114,12 +3215,12 @@ void memcg_release_cache(struct kmem_cache *s)
root = s->memcg_params->root_cache;
root->memcg_params->memcg_caches[id] = NULL;
- mem_cgroup_put(memcg);
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
+ mem_cgroup_put(memcg);
out:
kfree(s->memcg_params);
}
@@ -3382,7 +3483,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
/*
* Enqueue the creation of a per-memcg kmem_cache.
- * Called with rcu_read_lock.
*/
static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
@@ -3390,12 +3490,8 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
struct create_work *cw;
cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
- if (cw == NULL)
- return;
-
- /* The corresponding put will be done in the workqueue. */
- if (!css_tryget(&memcg->css)) {
- kfree(cw);
+ if (cw == NULL) {
+ css_put(&memcg->css);
return;
}
@@ -3451,10 +3547,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
- rcu_read_unlock();
if (!memcg_can_account_kmem(memcg))
- return cachep;
+ goto out;
idx = memcg_cache_id(memcg);
@@ -3463,29 +3558,38 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
- if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
- /*
- * If we are in a safe context (can wait, and not in interrupt
- * context), we could be be predictable and return right away.
- * This would guarantee that the allocation being performed
- * already belongs in the new cache.
- *
- * However, there are some clashes that can arrive from locking.
- * For instance, because we acquire the slab_mutex while doing
- * kmem_cache_dup, this means no further allocation could happen
- * with the slab_mutex held.
- *
- * Also, because cache creation issue get_online_cpus(), this
- * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
- * that ends up reversed during cpu hotplug. (cpuset allocates
- * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
- * better to defer everything.
- */
- memcg_create_cache_enqueue(memcg, cachep);
- return cachep;
+ if (likely(cachep->memcg_params->memcg_caches[idx])) {
+ cachep = cachep->memcg_params->memcg_caches[idx];
+ goto out;
}
- return cachep->memcg_params->memcg_caches[idx];
+ /* The corresponding put will be done in the workqueue. */
+ if (!css_tryget(&memcg->css))
+ goto out;
+ rcu_read_unlock();
+
+ /*
+ * If we are in a safe context (can wait, and not in interrupt
+ * context), we could be be predictable and return right away.
+ * This would guarantee that the allocation being performed
+ * already belongs in the new cache.
+ *
+ * However, there are some clashes that can arrive from locking.
+ * For instance, because we acquire the slab_mutex while doing
+ * kmem_cache_dup, this means no further allocation could happen
+ * with the slab_mutex held.
+ *
+ * Also, because cache creation issue get_online_cpus(), this
+ * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+ * that ends up reversed during cpu hotplug. (cpuset allocates
+ * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
+ * better to defer everything.
+ */
+ memcg_create_cache_enqueue(memcg, cachep);
+ return cachep;
+out:
+ rcu_read_unlock();
+ return cachep;
}
EXPORT_SYMBOL(__memcg_kmem_get_cache);
@@ -4947,9 +5051,6 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
- if (!do_swap_account && type == _MEMSWAP)
- return -EOPNOTSUPP;
-
switch (type) {
case _MEM:
if (name == RES_USAGE)
@@ -5084,9 +5185,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
- if (!do_swap_account && type == _MEMSWAP)
- return -EOPNOTSUPP;
-
switch (name) {
case RES_LIMIT:
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -5163,9 +5261,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
- if (!do_swap_account && type == _MEMSWAP)
- return -EOPNOTSUPP;
-
switch (name) {
case RES_MAX_USAGE:
if (type == _MEM)
@@ -5744,7 +5839,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return ret;
return mem_cgroup_sockets_init(memcg, ss);
-};
+}
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
@@ -5838,6 +5933,11 @@ static struct cftype mem_cgroup_files[] = {
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
+ {
+ .name = "pressure_level",
+ .register_event = vmpressure_register_event,
+ .unregister_event = vmpressure_unregister_event,
+ },
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
@@ -6119,6 +6219,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
+ vmpressure_init(&memcg->vmpressure);
return &memcg->css;
@@ -6184,10 +6285,29 @@ mem_cgroup_css_online(struct cgroup *cont)
return error;
}
+/*
+ * Announce all parents that a group from their hierarchy is gone.
+ */
+static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *parent = memcg;
+
+ while ((parent = parent_mem_cgroup(parent)))
+ atomic_inc(&parent->dead_count);
+
+ /*
+ * if the root memcg is not hierarchical we have to check it
+ * explicitely.
+ */
+ if (!root_mem_cgroup->use_hierarchy)
+ atomic_inc(&root_mem_cgroup->dead_count);
+}
+
static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ mem_cgroup_invalidate_reclaim_iterators(memcg);
mem_cgroup_reparent_charges(memcg);
mem_cgroup_destroy_all_caches(memcg);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index df0694c6adef..ceb0c7f1932f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -785,10 +785,10 @@ static struct page_state {
{ sc|dirty, sc, "clean swapcache", me_swapcache_clean },
{ mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
- { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
+ { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
{ unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
- { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
+ { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
{ lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
diff --git a/mm/memory.c b/mm/memory.c
index ba94dec5b259..f7a1fba85d14 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3244,6 +3244,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceeding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
__SetPageUptodate(page);
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ee3765760818..a221fac1f47d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -436,6 +436,40 @@ static int __meminit __add_section(int nid, struct zone *zone,
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
+/*
+ * Reasonably generic function for adding memory. It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+ int err = 0;
+ int start_sec, end_sec;
+ /* during initialize mem_map, align hot-added range to section */
+ start_sec = pfn_to_section_nr(phys_start_pfn);
+ end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
+
+ for (i = start_sec; i <= end_sec; i++) {
+ err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
+
+ /*
+ * EEXIST is finally dealt with by ioresource collision
+ * check. see add_memory() => register_memory_resource()
+ * Warning will be printed if there is collision.
+ */
+ if (err && (err != -EEXIST))
+ break;
+ err = 0;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(__add_pages);
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
@@ -658,39 +692,6 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
return 0;
}
-/*
- * Reasonably generic function for adding memory. It is
- * expected that archs that support memory hotplug will
- * call this function after deciding the zone to which to
- * add the new pages.
- */
-int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages)
-{
- unsigned long i;
- int err = 0;
- int start_sec, end_sec;
- /* during initialize mem_map, align hot-added range to section */
- start_sec = pfn_to_section_nr(phys_start_pfn);
- end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
-
- for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
-
- /*
- * EEXIST is finally dealt with by ioresource collision
- * check. see add_memory() => register_memory_resource()
- * Warning will be printed if there is collision.
- */
- if (err && (err != -EEXIST))
- break;
- err = 0;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(__add_pages);
-
/**
* __remove_pages() - remove sections of pages from a zone
* @zone: zone from which pages need to be removed
@@ -705,8 +706,10 @@ EXPORT_SYMBOL_GPL(__add_pages);
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
unsigned long nr_pages)
{
- unsigned long i, ret = 0;
+ unsigned long i;
int sections_to_remove;
+ resource_size_t start, size;
+ int ret = 0;
/*
* We can only remove entire sections
@@ -714,7 +717,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
- release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
+ start = phys_start_pfn << PAGE_SHIFT;
+ size = nr_pages * PAGE_SIZE;
+ ret = release_mem_region_adjustable(&iomem_resource, start, size);
+ if (ret)
+ pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n",
+ start, start + size - 1, ret);
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
@@ -726,6 +734,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
int set_online_page_callback(online_page_callback_t callback)
{
@@ -1613,7 +1622,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
/**
* walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
* @start_pfn: start pfn of the memory range
- * @end_pfn: end pft of the memory range
+ * @end_pfn: end pfn of the memory range
* @arg: argument passed to func
* @func: callback for each memory section walked
*
@@ -1681,11 +1690,15 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ phys_addr_t beginpa, endpa;
+
+ beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
+ endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
pr_warn("removing memory fails, because memory "
- "[%#010llx-%#010llx] is onlined\n",
- PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
- PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
+ "[%pa-%pa] is onlined\n",
+ &beginpa, &endpa);
+ }
return ret;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 3bbaf5d230b0..27ed22579fd9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -736,7 +736,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (PageWriteback(page)) {
/*
- * Only in the case of a full syncronous migration is it
+ * Only in the case of a full synchronous migration is it
* necessary to wait for PageWriteback. In the async case,
* the retry loop is too short and in the sync-light case,
* the overhead of stalling is too much
@@ -973,19 +973,23 @@ out:
}
/*
- * migrate_pages
+ * migrate_pages - migrate the pages specified in a list, to the free pages
+ * supplied as the target for the page migration
*
- * The function takes one list of pages to migrate and a function
- * that determines from the page to be migrated and the private data
- * the target of the move and allocates the page.
+ * @from: The list of pages to be migrated.
+ * @get_new_page: The function used to allocate free pages to be used
+ * as the target of the page migration.
+ * @private: Private data to be passed on to get_new_page()
+ * @mode: The migration mode that specifies the constraints for
+ * page migration, if any.
+ * @reason: The reason for page migration.
*
- * The function returns after 10 attempts or if no pages
- * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
- * Caller should call putback_lru_pages to return pages to the LRU
+ * The function returns after 10 attempts or if no pages are movable any more
+ * because the list has become empty or no retryable pages exist any more.
+ * The caller should call putback_lru_pages() to return pages to the LRU
* or free list only if ret != 0.
*
- * Return: Number of pages not migrated or error code.
+ * Returns the number of pages that were not migrated, or an error code.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
unsigned long private, enum migrate_mode mode, int reason)
diff --git a/mm/mmap.c b/mm/mmap.c
index 0db0de1c2fbe..da3e9c04bf37 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,6 +6,7 @@
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
+#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
@@ -33,6 +34,8 @@
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
#include <linux/sched/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -84,6 +87,8 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
/*
* Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently.
@@ -122,7 +127,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ unsigned long free, allowed, reserve;
vm_acct_memory(pages);
@@ -163,10 +168,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free -= totalreserve_pages;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- free -= free / 32;
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
@@ -177,16 +182,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
allowed = (totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- allowed -= allowed / 32;
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages;
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- if (mm)
- allowed -= mm->total_vm / 32;
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min(mm->total_vm / 32, reserve);
+ }
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
@@ -543,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
return 0;
}
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long nr_pages = 0;
+ struct vm_area_struct *vma;
+
+ /* Find first overlaping mapping */
+ vma = find_vma_intersection(mm, addr, end);
+ if (!vma)
+ return 0;
+
+ nr_pages = (min(end, vma->vm_end) -
+ max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+ /* Iterate over the rest of the overlaps */
+ for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+ unsigned long overlap_len;
+
+ if (vma->vm_start > end)
+ break;
+
+ overlap_len = min(end, vma->vm_end) - vma->vm_start;
+ nr_pages += overlap_len >> PAGE_SHIFT;
+ }
+
+ return nr_pages;
+}
+
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
@@ -829,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- mpol_put(vma_policy(next));
+ vma_set_policy(vma, vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
@@ -1435,6 +1471,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long charged = 0;
struct inode *inode = file ? file_inode(file) : NULL;
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ if (!(vm_flags & MAP_FIXED))
+ return -ENOMEM;
+
+ nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+ if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
/* Clear old maps */
error = -ENOMEM;
munmap_back:
@@ -1444,10 +1497,6 @@ munmap_back:
goto munmap_back;
}
- /* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
-
/*
* Private writable mapping: check memory availability
*/
@@ -1935,9 +1984,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
- if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
- return NULL;
-
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
vma = ACCESS_ONCE(mm->mmap_cache);
@@ -2305,7 +2351,7 @@ static void unmap_region(struct mm_struct *mm,
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : 0);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2685,7 +2731,7 @@ void exit_mmap(struct mm_struct *mm)
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
/*
@@ -3097,3 +3143,115 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0);
VM_BUG_ON(ret);
}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+module_init(init_user_reserve)
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+module_init(init_admin_reserve)
+
+/*
+ * Reinititalise user and admin reserves if memory is added or removed.
+ *
+ * The default user reserve max is 128MB, and the default max for the
+ * admin reserve is 8MB. These are usually, but not always, enough to
+ * enable recovery from a memory hogging process using login/sshd, a shell,
+ * and tools like top. It may make sense to increase or even disable the
+ * reserve depending on the existence of swap or variations in the recovery
+ * tools. So, the admin may have changed them.
+ *
+ * If memory is added and the reserves have been eliminated or increased above
+ * the default max, then we'll trust the admin.
+ *
+ * If memory is removed and there isn't enough free memory, then we
+ * need to reset the reserves.
+ *
+ * Otherwise keep the reserve set by the admin.
+ */
+static int reserve_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long tmp, free_kbytes;
+
+ switch (action) {
+ case MEM_ONLINE:
+ /* Default max is 128MB. Leave alone if modified by operator. */
+ tmp = sysctl_user_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 17))
+ init_user_reserve();
+
+ /* Default max is 8MB. Leave alone if modified by operator. */
+ tmp = sysctl_admin_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 13))
+ init_admin_reserve();
+
+ break;
+ case MEM_OFFLINE:
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ if (sysctl_user_reserve_kbytes > free_kbytes) {
+ init_user_reserve();
+ pr_info("vm.user_reserve_kbytes reset to %lu\n",
+ sysctl_user_reserve_kbytes);
+ }
+
+ if (sysctl_admin_reserve_kbytes > free_kbytes) {
+ init_admin_reserve();
+ pr_info("vm.admin_reserve_kbytes reset to %lu\n",
+ sysctl_admin_reserve_kbytes);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block reserve_mem_nb = {
+ .notifier_call = reserve_mem_notifier,
+};
+
+static int __meminit init_reserve_notifier(void)
+{
+ if (register_hotmemory_notifier(&reserve_mem_nb))
+ printk("Failed registering memory add/remove notifier for admin reserve");
+
+ return 0;
+}
+module_init(init_reserve_notifier)
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36e381e..bdd3fa2fc73b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -45,9 +45,9 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (!addr)
return NULL;
+ memblock_reserve(addr, size);
ptr = phys_to_virt(addr);
memset(ptr, 0, size);
- memblock_reserve(addr, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
* are never reported as leaks.
@@ -120,7 +120,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
return end_pfn - start_pfn;
}
-unsigned long __init free_low_memory_core_early(int nodeid)
+static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
phys_addr_t start, end, size;
@@ -170,7 +170,7 @@ unsigned long __init free_all_bootmem(void)
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
- return free_low_memory_core_early(MAX_NUMNODES);
+ return free_low_memory_core_early();
}
/**
diff --git a/mm/nommu.c b/mm/nommu.c
index e001768b14e8..fbe3e2f317eb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -63,6 +63,8 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
@@ -228,8 +230,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL(follow_pfn);
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
+LIST_HEAD(vmap_area_list);
void vfree(const void *addr)
{
@@ -1898,7 +1899,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ unsigned long free, allowed, reserve;
vm_acct_memory(pages);
@@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free -= totalreserve_pages;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- free -= free / 32;
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
@@ -1952,16 +1953,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
allowed = totalram_pages * sysctl_overcommit_ratio / 100;
/*
- * Leave the last 3% for root
+ * Reserve some 3% for root
*/
if (!cap_sys_admin)
- allowed -= allowed / 32;
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages;
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- if (mm)
- allowed -= mm->total_vm / 32;
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min(mm->total_vm / 32, reserve);
+ }
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
@@ -2123,3 +2127,45 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
up_write(&nommu_region_sem);
return 0;
}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int __meminit init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+module_init(init_user_reserve)
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int __meminit init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+module_init(init_admin_reserve)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index efe68148f621..4514ad7415c3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2311,10 +2311,6 @@ void wait_for_stable_page(struct page *page)
if (!bdi_cap_stable_pages_required(bdi))
return;
-#ifdef CONFIG_NEED_BOUNCE_POOL
- if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
- return;
-#endif /* CONFIG_NEED_BOUNCE_POOL */
wait_on_page_writeback(page);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7ff1536f01b8..98cbdf6e5532 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,6 +58,7 @@
#include <linux/prefetch.h>
#include <linux/migrate.h>
#include <linux/page-debug-flags.h>
+#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <asm/tlbflush.h>
@@ -1941,9 +1942,24 @@ zonelist_scan:
continue;
default:
/* did we reclaim enough */
- if (!zone_watermark_ok(zone, order, mark,
+ if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
+ goto try_this_zone;
+
+ /*
+ * Failed to reclaim enough to meet watermark.
+ * Only mark the zone full if checking the min
+ * watermark or if we failed to reclaim just
+ * 1<<order pages or else the page allocator
+ * fastpath will prematurely mark zones full
+ * when the watermark is between the low and
+ * min watermarks.
+ */
+ if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
+ ret == ZONE_RECLAIM_SOME)
goto this_zone_full;
+
+ continue;
}
}
@@ -2003,6 +2019,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
return;
/*
+ * Walking all memory to count page types is very expensive and should
+ * be inhibited in non-blockable contexts.
+ */
+ if (!(gfp_mask & __GFP_WAIT))
+ filter |= SHOW_MEM_FILTER_PAGE_COUNT;
+
+ /*
* This documents exceptions given to allocations in certain
* contexts that are allowed to allocate outside current's set
* of allowed nodes.
@@ -3106,6 +3129,8 @@ void show_free_areas(unsigned int filter)
printk("= %lukB\n", K(total));
}
+ hugetlb_show_meminfo();
+
printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
show_swap_cache_info();
@@ -4162,10 +4187,23 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
{
unsigned long start_pfn, end_pfn;
int i, nid;
+ /*
+ * NOTE: The following SMP-unsafe globals are only used early in boot
+ * when the kernel is running single-threaded.
+ */
+ static unsigned long __meminitdata last_start_pfn, last_end_pfn;
+ static int __meminitdata last_nid;
+
+ if (last_start_pfn <= pfn && pfn < last_end_pfn)
+ return last_nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
- if (start_pfn <= pfn && pfn < end_pfn)
+ if (start_pfn <= pfn && pfn < end_pfn) {
+ last_start_pfn = start_pfn;
+ last_end_pfn = end_pfn;
+ last_nid = nid;
return nid;
+ }
/* This is a memory hole */
return -1;
}
@@ -4711,7 +4749,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
/*
* Figure out the number of possible node ids.
*/
-static void __init setup_nr_node_ids(void)
+void __init setup_nr_node_ids(void)
{
unsigned int node;
unsigned int highest = 0;
@@ -4720,10 +4758,6 @@ static void __init setup_nr_node_ids(void)
highest = node;
nr_node_ids = highest + 1;
}
-#else
-static inline void setup_nr_node_ids(void)
-{
-}
#endif
/**
@@ -5114,6 +5148,35 @@ early_param("movablecore", cmdline_parse_movablecore);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+unsigned long free_reserved_area(unsigned long start, unsigned long end,
+ int poison, char *s)
+{
+ unsigned long pages, pos;
+
+ pos = start = PAGE_ALIGN(start);
+ end &= PAGE_MASK;
+ for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
+ if (poison)
+ memset((void *)pos, poison, PAGE_SIZE);
+ free_reserved_page(virt_to_page(pos));
+ }
+
+ if (pages && s)
+ pr_info("Freeing %s memory: %ldK (%lx - %lx)\n",
+ s, pages << (PAGE_SHIFT - 10), start, end);
+
+ return pages;
+}
+
+#ifdef CONFIG_HIGHMEM
+void free_highmem_page(struct page *page)
+{
+ __free_reserved_page(page);
+ totalram_pages++;
+ totalhigh_pages++;
+}
+#endif
+
/**
* set_dma_reserve - set the specified number of pages reserved in the first zone
* @new_dma_reserve: The number of pages to mark reserved
diff --git a/mm/page_io.c b/mm/page_io.c
index 78eee32ee486..bb5d75274686 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -42,7 +42,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
return bio;
}
-static void end_swap_bio_write(struct bio *bio, int err)
+void end_swap_bio_write(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
@@ -185,9 +185,7 @@ bad_bmap:
*/
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
- struct bio *bio;
- int ret = 0, rw = WRITE;
- struct swap_info_struct *sis = page_swap_info(page);
+ int ret = 0;
if (try_to_free_swap(page)) {
unlock_page(page);
@@ -199,6 +197,17 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
goto out;
}
+ ret = __swap_writepage(page, wbc, end_swap_bio_write);
+out:
+ return ret;
+}
+
+int __swap_writepage(struct page *page, struct writeback_control *wbc,
+ void (*end_write_func)(struct bio *, int))
+{
+ struct bio *bio;
+ int ret = 0, rw = WRITE;
+ struct swap_info_struct *sis = page_swap_info(page);
if (sis->flags & SWP_FILE) {
struct kiocb kiocb;
@@ -214,6 +223,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
kiocb.ki_left = PAGE_SIZE;
kiocb.ki_nbytes = PAGE_SIZE;
+ set_page_writeback(page);
unlock_page(page);
ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
&kiocb, &iov,
@@ -222,11 +232,27 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
+ } else {
+ /*
+ * In the case of swap-over-nfs, this can be a
+ * temporary failure if the system has limited
+ * memory for allocating transmit buffers.
+ * Mark the page dirty and avoid
+ * rotate_reclaimable_page but rate-limit the
+ * messages but do not flag PageError like
+ * the normal direct-to-bio case as it could
+ * be temporary.
+ */
+ set_page_dirty(page);
+ ClearPageReclaim(page);
+ pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
+ page_file_offset(page));
}
+ end_page_writeback(page);
return ret;
}
- bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
+ bio = get_swap_bio(GFP_NOIO, page, end_write_func);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 807c96bf0dc6..6280da86b5d6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1513,6 +1513,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
unsigned long max_nl_size = 0;
unsigned int mapcount;
+ if (PageHuge(page))
+ pgoff = page->index << compound_order(page);
+
mutex_lock(&mapping->i_mmap_mutex);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1c44af71fcf5..39b2a0b86fe8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
+#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
@@ -2830,8 +2831,6 @@ out4:
* effectively equivalent, but much lighter weight.
*/
-#include <linux/ramfs.h>
-
static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.mount = ramfs_mount,
@@ -2931,11 +2930,9 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
d_instantiate(path.dentry, inode);
inode->i_size = size;
clear_nlink(inode); /* It is unlinked */
-#ifndef CONFIG_MMU
res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
if (IS_ERR(res))
goto put_dentry;
-#endif
res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
&shmem_file_operations);
diff --git a/mm/slub.c b/mm/slub.c
index 4aec53705e4f..a0206df88aba 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
+#include <linux/notifier.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
@@ -3483,7 +3484,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-#if defined(CONFIG_MEMORY_HOTPLUG)
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
@@ -3598,7 +3598,10 @@ static int slab_memory_callback(struct notifier_block *self,
return ret;
}
-#endif /* CONFIG_MEMORY_HOTPLUG */
+static struct notifier_block slab_memory_callback_nb = {
+ .notifier_call = slab_memory_callback,
+ .priority = SLAB_CALLBACK_PRI,
+};
/********************************************************************
* Basic setup of slabs
@@ -3651,7 +3654,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
- hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+ register_hotmemory_notifier(&slab_memory_callback_nb);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1b7e22ab9b09..27eeab3be757 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -53,10 +53,12 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
struct page *page;
if (node_state(node, N_HIGH_MEMORY))
- page = alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO, get_order(size));
+ page = alloc_pages_node(
+ node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
+ get_order(size));
else
- page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
+ page = alloc_pages(
+ GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
get_order(size));
if (page)
return page_address(page);
@@ -145,11 +147,10 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
return pgd;
}
-int __meminit vmemmap_populate_basepages(struct page *start_page,
- unsigned long size, int node)
+int __meminit vmemmap_populate_basepages(unsigned long start,
+ unsigned long end, int node)
{
- unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long addr = start;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
@@ -176,9 +177,15 @@ int __meminit vmemmap_populate_basepages(struct page *start_page,
struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
{
- struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
- int error = vmemmap_populate(map, PAGES_PER_SECTION, nid);
- if (error)
+ unsigned long start;
+ unsigned long end;
+ struct page *map;
+
+ map = pfn_to_page(pnum * PAGES_PER_SECTION);
+ start = (unsigned long)map;
+ end = (unsigned long)(map + PAGES_PER_SECTION);
+
+ if (vmemmap_populate(start, end, nid))
return NULL;
return map;
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc847947..1c91f0d3f6ab 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,12 +615,20 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
}
static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
- vmemmap_free(memmap, nr_pages);
+ unsigned long start = (unsigned long)memmap;
+ unsigned long end = (unsigned long)(memmap + nr_pages);
+
+ vmemmap_free(start, end);
}
+#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
- vmemmap_free(memmap, nr_pages);
+ unsigned long start = (unsigned long)memmap;
+ unsigned long end = (unsigned long)(memmap + nr_pages);
+
+ vmemmap_free(start, end);
}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
{
@@ -658,6 +666,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
get_order(sizeof(struct page) * nr_pages));
}
+#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
@@ -684,40 +693,9 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
put_page_bootmem(page);
}
}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-static void free_section_usemap(struct page *memmap, unsigned long *usemap)
-{
- struct page *usemap_page;
- unsigned long nr_pages;
-
- if (!usemap)
- return;
-
- usemap_page = virt_to_page(usemap);
- /*
- * Check to see if allocation came from hot-plug-add
- */
- if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
- kfree(usemap);
- if (memmap)
- __kfree_section_memmap(memmap, PAGES_PER_SECTION);
- return;
- }
-
- /*
- * The usemap came from bootmem. This is packed with other usemaps
- * on the section which has pgdat at boot time. Just keep it as is now.
- */
-
- if (memmap) {
- nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
- >> PAGE_SHIFT;
-
- free_map_bootmem(memmap, nr_pages);
- }
-}
-
/*
* returns the number of sections whose mem_maps were properly
* set. If this is <=0, then that means that the passed-in
@@ -794,6 +772,39 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
}
#endif
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+{
+ struct page *usemap_page;
+ unsigned long nr_pages;
+
+ if (!usemap)
+ return;
+
+ usemap_page = virt_to_page(usemap);
+ /*
+ * Check to see if allocation came from hot-plug-add
+ */
+ if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
+ kfree(usemap);
+ if (memmap)
+ __kfree_section_memmap(memmap, PAGES_PER_SECTION);
+ return;
+ }
+
+ /*
+ * The usemap came from bootmem. This is packed with other usemaps
+ * on the section which has pgdat at boot time. Just keep it as is now.
+ */
+
+ if (memmap) {
+ nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+ >> PAGE_SHIFT;
+
+ free_map_bootmem(memmap, nr_pages);
+ }
+}
+
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
{
struct page *memmap = NULL;
@@ -813,4 +824,5 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
free_section_usemap(memmap, usemap);
}
-#endif
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 8a529a01e8fc..acd40bfffa82 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -737,7 +737,7 @@ EXPORT_SYMBOL(__pagevec_release);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
- struct lruvec *lruvec)
+ struct lruvec *lruvec, struct list_head *list)
{
int uninitialized_var(active);
enum lru_list lru;
@@ -749,7 +749,8 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
VM_BUG_ON(NR_CPUS != 1 &&
!spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
- SetPageLRU(page_tail);
+ if (!list)
+ SetPageLRU(page_tail);
if (page_evictable(page_tail)) {
if (PageActive(page)) {
@@ -767,7 +768,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
if (likely(PageLRU(page)))
list_add_tail(&page_tail->lru, &page->lru);
- else {
+ else if (list) {
+ /* page reclaim is reclaiming a huge page */
+ get_page(page_tail);
+ list_add_tail(&page_tail->lru, list);
+ } else {
struct list_head *list_head;
/*
* Head page has not yet been counted, as an hpage,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7efcf1525921..b3d40dcf3624 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -78,7 +78,7 @@ void show_swap_cache_info(void)
* __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;
struct address_space *address_space;
@@ -160,7 +160,7 @@ void __delete_from_swap_cache(struct page *page)
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
*/
-int add_to_swap(struct page *page)
+int add_to_swap(struct page *page, struct list_head *list)
{
swp_entry_t entry;
int err;
@@ -173,7 +173,7 @@ int add_to_swap(struct page *page)
return 0;
if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page))) {
+ if (unlikely(split_huge_page_to_list(page, list))) {
swapcache_free(entry, NULL);
return 0;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f751f2068c3..72043d6c88c0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -249,19 +249,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
-struct vmap_area {
- unsigned long va_start;
- unsigned long va_end;
- unsigned long flags;
- struct rb_node rb_node; /* address sorted rbtree */
- struct list_head list; /* address sorted list */
- struct list_head purge_list; /* "lazy purge" list */
- struct vm_struct *vm;
- struct rcu_head rcu_head;
-};
-
static DEFINE_SPINLOCK(vmap_area_lock);
-static LIST_HEAD(vmap_area_list);
+/* Export for kexec only */
+LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
/* The vmap cache globals are protected by vmap_area_lock */
@@ -313,7 +303,7 @@ static void __insert_vmap_area(struct vmap_area *va)
rb_link_node(&va->rb_node, parent, p);
rb_insert_color(&va->rb_node, &vmap_area_root);
- /* address-sort this list so it is usable like the vmlist */
+ /* address-sort this list */
tmp = rb_prev(&va->rb_node);
if (tmp) {
struct vmap_area *prev;
@@ -1125,6 +1115,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
}
EXPORT_SYMBOL(vm_map_ram);
+static struct vm_struct *vmlist __initdata;
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1283,41 +1274,35 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
}
EXPORT_SYMBOL_GPL(map_vm_area);
-/*** Old vmalloc interfaces ***/
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
-
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
+ spin_lock(&vmap_area_lock);
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
va->flags |= VM_VM_AREA;
+ spin_unlock(&vmap_area_lock);
}
-static void insert_vmalloc_vmlist(struct vm_struct *vm)
+static void clear_vm_unlist(struct vm_struct *vm)
{
- struct vm_struct *tmp, **p;
-
+ /*
+ * Before removing VM_UNLIST,
+ * we should make sure that vm has proper values.
+ * Pair with smp_rmb() in show_numa_info().
+ */
+ smp_wmb();
vm->flags &= ~VM_UNLIST;
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
- if (tmp->addr >= vm->addr)
- break;
- }
- vm->next = *p;
- *p = vm;
- write_unlock(&vmlist_lock);
}
static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
setup_vmalloc_vm(vm, va, flags, caller);
- insert_vmalloc_vmlist(vm);
+ clear_vm_unlist(vm);
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1360,10 +1345,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
/*
* When this function is called from __vmalloc_node_range,
- * we do not add vm_struct to vmlist here to avoid
- * accessing uninitialized members of vm_struct such as
- * pages and nr_pages fields. They will be set later.
- * To distinguish it from others, we use a VM_UNLIST flag.
+ * we add VM_UNLIST flag to avoid accessing uninitialized
+ * members of vm_struct such as pages and nr_pages fields.
+ * They will be set later.
*/
if (flags & VM_UNLIST)
setup_vmalloc_vm(area, va, flags, caller);
@@ -1447,19 +1431,10 @@ struct vm_struct *remove_vm_area(const void *addr)
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
- if (!(vm->flags & VM_UNLIST)) {
- struct vm_struct *tmp, **p;
- /*
- * remove from list and disallow access to
- * this vm_struct before unmap. (address range
- * confliction is maintained by vmap.)
- */
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
- ;
- *p = tmp->next;
- write_unlock(&vmlist_lock);
- }
+ spin_lock(&vmap_area_lock);
+ va->vm = NULL;
+ va->flags &= ~VM_VM_AREA;
+ spin_unlock(&vmap_area_lock);
vmap_debug_free_range(va->va_start, va->va_end);
free_unmap_vmap_area(va);
@@ -1680,10 +1655,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return NULL;
/*
- * In this function, newly allocated vm_struct is not added
- * to vmlist at __get_vm_area_node(). so, it is added here.
+ * In this function, newly allocated vm_struct has VM_UNLIST flag.
+ * It means that vm_struct is not fully initialized.
+ * Now, it is fully initialized, so remove this flag here.
*/
- insert_vmalloc_vmlist(area);
+ clear_vm_unlist(area);
/*
* A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2005,7 +1981,8 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
long vread(char *buf, char *addr, unsigned long count)
{
- struct vm_struct *tmp;
+ struct vmap_area *va;
+ struct vm_struct *vm;
char *vaddr, *buf_start = buf;
unsigned long buflen = count;
unsigned long n;
@@ -2014,10 +1991,17 @@ long vread(char *buf, char *addr, unsigned long count)
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
- read_lock(&vmlist_lock);
- for (tmp = vmlist; count && tmp; tmp = tmp->next) {
- vaddr = (char *) tmp->addr;
- if (addr >= vaddr + tmp->size - PAGE_SIZE)
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry(va, &vmap_area_list, list) {
+ if (!count)
+ break;
+
+ if (!(va->flags & VM_VM_AREA))
+ continue;
+
+ vm = va->vm;
+ vaddr = (char *) vm->addr;
+ if (addr >= vaddr + vm->size - PAGE_SIZE)
continue;
while (addr < vaddr) {
if (count == 0)
@@ -2027,10 +2011,10 @@ long vread(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + tmp->size - PAGE_SIZE - addr;
+ n = vaddr + vm->size - PAGE_SIZE - addr;
if (n > count)
n = count;
- if (!(tmp->flags & VM_IOREMAP))
+ if (!(vm->flags & VM_IOREMAP))
aligned_vread(buf, addr, n);
else /* IOREMAP area is treated as memory hole */
memset(buf, 0, n);
@@ -2039,7 +2023,7 @@ long vread(char *buf, char *addr, unsigned long count)
count -= n;
}
finished:
- read_unlock(&vmlist_lock);
+ spin_unlock(&vmap_area_lock);
if (buf == buf_start)
return 0;
@@ -2078,7 +2062,8 @@ finished:
long vwrite(char *buf, char *addr, unsigned long count)
{
- struct vm_struct *tmp;
+ struct vmap_area *va;
+ struct vm_struct *vm;
char *vaddr;
unsigned long n, buflen;
int copied = 0;
@@ -2088,10 +2073,17 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
buflen = count;
- read_lock(&vmlist_lock);
- for (tmp = vmlist; count && tmp; tmp = tmp->next) {
- vaddr = (char *) tmp->addr;
- if (addr >= vaddr + tmp->size - PAGE_SIZE)
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry(va, &vmap_area_list, list) {
+ if (!count)
+ break;
+
+ if (!(va->flags & VM_VM_AREA))
+ continue;
+
+ vm = va->vm;
+ vaddr = (char *) vm->addr;
+ if (addr >= vaddr + vm->size - PAGE_SIZE)
continue;
while (addr < vaddr) {
if (count == 0)
@@ -2100,10 +2092,10 @@ long vwrite(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + tmp->size - PAGE_SIZE - addr;
+ n = vaddr + vm->size - PAGE_SIZE - addr;
if (n > count)
n = count;
- if (!(tmp->flags & VM_IOREMAP)) {
+ if (!(vm->flags & VM_IOREMAP)) {
aligned_vwrite(buf, addr, n);
copied++;
}
@@ -2112,7 +2104,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count -= n;
}
finished:
- read_unlock(&vmlist_lock);
+ spin_unlock(&vmap_area_lock);
if (!copied)
return 0;
return buflen;
@@ -2519,19 +2511,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
- __acquires(&vmlist_lock)
+ __acquires(&vmap_area_lock)
{
loff_t n = *pos;
- struct vm_struct *v;
+ struct vmap_area *va;
- read_lock(&vmlist_lock);
- v = vmlist;
- while (n > 0 && v) {
+ spin_lock(&vmap_area_lock);
+ va = list_entry((&vmap_area_list)->next, typeof(*va), list);
+ while (n > 0 && &va->list != &vmap_area_list) {
n--;
- v = v->next;
+ va = list_entry(va->list.next, typeof(*va), list);
}
- if (!n)
- return v;
+ if (!n && &va->list != &vmap_area_list)
+ return va;
return NULL;
@@ -2539,16 +2531,20 @@ static void *s_start(struct seq_file *m, loff_t *pos)
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct vm_struct *v = p;
+ struct vmap_area *va = p, *next;
++*pos;
- return v->next;
+ next = list_entry(va->list.next, typeof(*va), list);
+ if (&next->list != &vmap_area_list)
+ return next;
+
+ return NULL;
}
static void s_stop(struct seq_file *m, void *p)
- __releases(&vmlist_lock)
+ __releases(&vmap_area_lock)
{
- read_unlock(&vmlist_lock);
+ spin_unlock(&vmap_area_lock);
}
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
@@ -2559,6 +2555,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;
+ /* Pair with smp_wmb() in clear_vm_unlist() */
+ smp_rmb();
+ if (v->flags & VM_UNLIST)
+ return;
+
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr++)
@@ -2572,7 +2573,20 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
static int s_show(struct seq_file *m, void *p)
{
- struct vm_struct *v = p;
+ struct vmap_area *va = p;
+ struct vm_struct *v;
+
+ if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
+ return 0;
+
+ if (!(va->flags & VM_VM_AREA)) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ return 0;
+ }
+
+ v = va->vm;
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -2645,5 +2659,53 @@ static int __init proc_vmalloc_init(void)
return 0;
}
module_init(proc_vmalloc_init);
+
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ struct vmap_area *va;
+ unsigned long free_area_size;
+ unsigned long prev_end;
+
+ vmi->used = 0;
+ vmi->largest_chunk = 0;
+
+ prev_end = VMALLOC_START;
+
+ spin_lock(&vmap_area_lock);
+
+ if (list_empty(&vmap_area_list)) {
+ vmi->largest_chunk = VMALLOC_TOTAL;
+ goto out;
+ }
+
+ list_for_each_entry(va, &vmap_area_list, list) {
+ unsigned long addr = va->va_start;
+
+ /*
+ * Some archs keep another range for modules in vmalloc space
+ */
+ if (addr < VMALLOC_START)
+ continue;
+ if (addr >= VMALLOC_END)
+ break;
+
+ if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
+ continue;
+
+ vmi->used += (va->va_end - va->va_start);
+
+ free_area_size = addr - prev_end;
+ if (vmi->largest_chunk < free_area_size)
+ vmi->largest_chunk = free_area_size;
+
+ prev_end = va->va_end;
+ }
+
+ if (VMALLOC_END - prev_end > vmi->largest_chunk)
+ vmi->largest_chunk = VMALLOC_END - prev_end;
+
+out:
+ spin_unlock(&vmap_area_lock);
+}
#endif
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000000..736a6011c2c8
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ * Anton Vorontsov <anton.vorontsov@linaro.org>
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/eventfd.h>
+#include <linux/swap.h>
+#include <linux/printk.h>
+#include <linux/vmpressure.h>
+
+/*
+ * The window size (vmpressure_win) is the number of scanned pages before
+ * we try to analyze scanned/reclaimed ratio. So the window is used as a
+ * rate-limit tunable for the "low" level notification, and also for
+ * averaging the ratio for medium/critical levels. Using small window
+ * sizes can cause lot of false positives, but too big window size will
+ * delay the notifications.
+ *
+ * As the vmscan reclaimer logic works with chunks which are multiple of
+ * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
+ *
+ * TODO: Make the window size depend on machine size, as we do for vmstat
+ * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
+ */
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+
+/*
+ * These thresholds are used when we account memory pressure through
+ * scanned/reclaimed ratio. The current values were chosen empirically. In
+ * essence, they are percents: the higher the value, the more number
+ * unsuccessful reclaims there were.
+ */
+static const unsigned int vmpressure_level_med = 60;
+static const unsigned int vmpressure_level_critical = 95;
+
+/*
+ * When there are too little pages left to scan, vmpressure() may miss the
+ * critical pressure as number of pages will be less than "window size".
+ * However, in that case the vmscan priority will raise fast as the
+ * reclaimer will try to scan LRUs more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ * prio == DEF_PRIORITY (12): reclaimer starts with that value
+ * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ * prio == 0 : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). Current value for the vmpressure_level_critical_prio is chosen
+ * empirically, but the number, in essence, means that we consider
+ * critical level when scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eights).
+ */
+static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
+
+static struct vmpressure *work_to_vmpressure(struct work_struct *work)
+{
+ return container_of(work, struct vmpressure, work);
+}
+
+static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
+{
+ return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
+}
+
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+ struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
+
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ return NULL;
+ return memcg_to_vmpressure(memcg);
+}
+
+enum vmpressure_levels {
+ VMPRESSURE_LOW = 0,
+ VMPRESSURE_MEDIUM,
+ VMPRESSURE_CRITICAL,
+ VMPRESSURE_NUM_LEVELS,
+};
+
+static const char * const vmpressure_str_levels[] = {
+ [VMPRESSURE_LOW] = "low",
+ [VMPRESSURE_MEDIUM] = "medium",
+ [VMPRESSURE_CRITICAL] = "critical",
+};
+
+static enum vmpressure_levels vmpressure_level(unsigned long pressure)
+{
+ if (pressure >= vmpressure_level_critical)
+ return VMPRESSURE_CRITICAL;
+ else if (pressure >= vmpressure_level_med)
+ return VMPRESSURE_MEDIUM;
+ return VMPRESSURE_LOW;
+}
+
+static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+ unsigned long reclaimed)
+{
+ unsigned long scale = scanned + reclaimed;
+ unsigned long pressure;
+
+ /*
+ * We calculate the ratio (in percents) of how many pages were
+ * scanned vs. reclaimed in a given time frame (window). Note that
+ * time is in VM reclaimer's "ticks", i.e. number of pages
+ * scanned. This makes it possible to set desired reaction time
+ * and serves as a ratelimit.
+ */
+ pressure = scale - (reclaimed * scale / scanned);
+ pressure = pressure * 100 / scale;
+
+ pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
+ scanned, reclaimed);
+
+ return vmpressure_level(pressure);
+}
+
+struct vmpressure_event {
+ struct eventfd_ctx *efd;
+ enum vmpressure_levels level;
+ struct list_head node;
+};
+
+static bool vmpressure_event(struct vmpressure *vmpr,
+ unsigned long scanned, unsigned long reclaimed)
+{
+ struct vmpressure_event *ev;
+ enum vmpressure_levels level;
+ bool signalled = false;
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ mutex_lock(&vmpr->events_lock);
+
+ list_for_each_entry(ev, &vmpr->events, node) {
+ if (level >= ev->level) {
+ eventfd_signal(ev->efd, 1);
+ signalled = true;
+ }
+ }
+
+ mutex_unlock(&vmpr->events_lock);
+
+ return signalled;
+}
+
+static void vmpressure_work_fn(struct work_struct *work)
+{
+ struct vmpressure *vmpr = work_to_vmpressure(work);
+ unsigned long scanned;
+ unsigned long reclaimed;
+
+ /*
+ * Several contexts might be calling vmpressure(), so it is
+ * possible that the work was rescheduled again before the old
+ * work context cleared the counters. In that case we will run
+ * just after the old work returns, but then scanned might be zero
+ * here. No need for any locks here since we don't care if
+ * vmpr->reclaimed is in sync.
+ */
+ if (!vmpr->scanned)
+ return;
+
+ mutex_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned;
+ reclaimed = vmpr->reclaimed;
+ vmpr->scanned = 0;
+ vmpr->reclaimed = 0;
+ mutex_unlock(&vmpr->sr_lock);
+
+ do {
+ if (vmpressure_event(vmpr, scanned, reclaimed))
+ break;
+ /*
+ * If not handled, propagate the event upward into the
+ * hierarchy.
+ */
+ } while ((vmpr = vmpressure_parent(vmpr)));
+}
+
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @scanned: number of pages scanned
+ * @reclaimed: number of pages reclaimed
+ *
+ * This function should be called from the vmscan reclaim path to account
+ * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
+ * pressure index is then further refined and averaged over time.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+ unsigned long scanned, unsigned long reclaimed)
+{
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+
+ /*
+ * Here we only want to account pressure that userland is able to
+ * help us with. For example, suppose that DMA zone is under
+ * pressure; if we notify userland about that kind of pressure,
+ * then it will be mostly a waste as it will trigger unnecessary
+ * freeing of memory by userland (since userland is more likely to
+ * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
+ * is why we include only movable, highmem and FS/IO pages.
+ * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
+ * we account it too.
+ */
+ if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
+ return;
+
+ /*
+ * If we got here with no pages scanned, then that is an indicator
+ * that reclaimer was unable to find any shrinkable LRUs at the
+ * current scanning depth. But it does not mean that we should
+ * report the critical pressure, yet. If the scanning priority
+ * (scanning depth) goes too high (deep), we will be notified
+ * through vmpressure_prio(). But so far, keep calm.
+ */
+ if (!scanned)
+ return;
+
+ mutex_lock(&vmpr->sr_lock);
+ vmpr->scanned += scanned;
+ vmpr->reclaimed += reclaimed;
+ scanned = vmpr->scanned;
+ mutex_unlock(&vmpr->sr_lock);
+
+ if (scanned < vmpressure_win || work_pending(&vmpr->work))
+ return;
+ schedule_work(&vmpr->work);
+}
+
+/**
+ * vmpressure_prio() - Account memory pressure through reclaimer priority level
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @prio: reclaimer's priority
+ *
+ * This function should be called from the reclaim path every time when
+ * the vmscan's reclaiming priority (scanning depth) changes.
+ *
+ * This function does not return any value.
+ */
+void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
+{
+ /*
+ * We only use prio for accounting critical level. For more info
+ * see comment for vmpressure_level_critical_prio variable above.
+ */
+ if (prio > vmpressure_level_critical_prio)
+ return;
+
+ /*
+ * OK, the prio is below the threshold, updating vmpressure
+ * information before shrinker dives into long shrinking of long
+ * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
+ * to the vmpressure() basically means that we signal 'critical'
+ * level.
+ */
+ vmpressure(gfp, memcg, vmpressure_win, 0);
+}
+
+/**
+ * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
+ * @cg: cgroup that is interested in vmpressure notifications
+ * @cft: cgroup control files handle
+ * @eventfd: eventfd context to link notifications with
+ * @args: event arguments (used to set up a pressure level threshold)
+ *
+ * This function associates eventfd context with the vmpressure
+ * infrastructure, so that the notifications will be delivered to the
+ * @eventfd. The @args parameter is a string that denotes pressure level
+ * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
+ * "critical").
+ *
+ * This function should not be used directly, just pass it to (struct
+ * cftype).register_event, and then cgroup core will handle everything by
+ * itself.
+ */
+int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ struct vmpressure *vmpr = cg_to_vmpressure(cg);
+ struct vmpressure_event *ev;
+ int level;
+
+ for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
+ if (!strcmp(vmpressure_str_levels[level], args))
+ break;
+ }
+
+ if (level >= VMPRESSURE_NUM_LEVELS)
+ return -EINVAL;
+
+ ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ if (!ev)
+ return -ENOMEM;
+
+ ev->efd = eventfd;
+ ev->level = level;
+
+ mutex_lock(&vmpr->events_lock);
+ list_add(&ev->node, &vmpr->events);
+ mutex_unlock(&vmpr->events_lock);
+
+ return 0;
+}
+
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @cg: cgroup handle
+ * @cft: cgroup control files handle
+ * @eventfd: eventfd context that was used to link vmpressure with the @cg
+ *
+ * This function does internal manipulations to detach the @eventfd from
+ * the vmpressure notifications, and then frees internal resources
+ * associated with the @eventfd (but the @eventfd itself is not freed).
+ *
+ * This function should not be used directly, just pass it to (struct
+ * cftype).unregister_event, and then cgroup core will handle everything
+ * by itself.
+ */
+void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+ struct eventfd_ctx *eventfd)
+{
+ struct vmpressure *vmpr = cg_to_vmpressure(cg);
+ struct vmpressure_event *ev;
+
+ mutex_lock(&vmpr->events_lock);
+ list_for_each_entry(ev, &vmpr->events, node) {
+ if (ev->efd != eventfd)
+ continue;
+ list_del(&ev->node);
+ kfree(ev);
+ break;
+ }
+ mutex_unlock(&vmpr->events_lock);
+}
+
+/**
+ * vmpressure_init() - Initialize vmpressure control structure
+ * @vmpr: Structure to be initialized
+ *
+ * This function should be called on every allocated vmpressure structure
+ * before any usage.
+ */
+void vmpressure_init(struct vmpressure *vmpr)
+{
+ mutex_init(&vmpr->sr_lock);
+ mutex_init(&vmpr->events_lock);
+ INIT_LIST_HEAD(&vmpr->events);
+ INIT_WORK(&vmpr->work, vmpressure_work_fn);
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 669fba39be1a..fa6a85378ee4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
@@ -780,7 +781,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageAnon(page) && !PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
- if (!add_to_swap(page))
+ if (!add_to_swap(page, page_list))
goto activate_locked;
may_enter_fs = 1;
}
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
}
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
+
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+ sc->nr_scanned - nr_scanned,
+ sc->nr_reclaimed - nr_reclaimed);
+
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
}
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
count_vm_event(ALLOCSTALL);
do {
+ vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+ sc->priority);
sc->nr_scanned = 0;
aborted_reclaim = shrink_zones(zonelist, sc);
@@ -2619,7 +2627,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
bool pgdat_is_balanced = false;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
- unsigned long total_scanned;
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
@@ -2639,7 +2646,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
.gfp_mask = sc.gfp_mask,
};
loop_again:
- total_scanned = 0;
sc.priority = DEF_PRIORITY;
sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
@@ -2730,7 +2736,6 @@ loop_again:
order, sc.gfp_mask,
&nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
- total_scanned += nr_soft_scanned;
/*
* We put equal pressure on every zone, unless
@@ -2765,7 +2770,6 @@ loop_again:
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_scanned += sc.nr_scanned;
if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e1d8ed172c42..f42745e65780 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -52,7 +52,6 @@ void all_vm_events(unsigned long *ret)
}
EXPORT_SYMBOL_GPL(all_vm_events);
-#ifdef CONFIG_HOTPLUG
/*
* Fold the foreign cpu events into our own.
*
@@ -69,7 +68,6 @@ void vm_events_fold_cpu(int cpu)
fold_state->event[i] = 0;
}
}
-#endif /* CONFIG_HOTPLUG */
#endif /* CONFIG_VM_EVENT_COUNTERS */
@@ -495,6 +493,10 @@ void refresh_cpu_vm_stats(int cpu)
atomic_long_add(global_diff[i], &vm_stat[i]);
}
+/*
+ * this is only called if !populated_zone(zone), which implies no other users of
+ * pset->vm_stat_diff[] exsist.
+ */
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
{
int i;
diff --git a/scripts/decodecode b/scripts/decodecode
index 4f8248d5a11f..d8824f37acce 100755
--- a/scripts/decodecode
+++ b/scripts/decodecode
@@ -89,10 +89,16 @@ echo $code >> $T.s
disas $T
cat $T.dis >> $T.aa
+# (lines of whole $T.oo) - (lines of $T.aa, i.e. "Code starting") + 3,
+# i.e. the title + the "===..=" line (sed is counting from 1, 0 address is
+# special)
+faultlinenum=$(( $(wc -l $T.oo | cut -d" " -f1) - \
+ $(wc -l $T.aa | cut -d" " -f1) + 3))
+
faultline=`cat $T.dis | head -1 | cut -d":" -f2-`
faultline=`echo "$faultline" | sed -e 's/\[/\\\[/g; s/\]/\\\]/g'`
-cat $T.oo | sed -e "s/\($faultline\)/\*\1 <-- trapping instruction/g"
+cat $T.oo | sed -e "${faultlinenum}s/^\(.*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/"
echo
cat $T.aa
cleanup