summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/memory.txt74
-rw-r--r--Documentation/device-mapper/dm-crypt.txt7
-rw-r--r--Documentation/device-mapper/dm-raid.txt70
-rw-r--r--Documentation/filesystems/proc.txt7
-rw-r--r--Documentation/gpio.txt2
-rw-r--r--Documentation/vm/transhuge.txt298
-rw-r--r--MAINTAINERS3
-rw-r--r--arch/alpha/include/asm/mman.h3
-rw-r--r--arch/arm/kernel/module.c14
-rw-r--r--arch/arm/mm/pgd.c2
-rw-r--r--arch/avr32/boards/atngw100/setup.c2
-rw-r--r--arch/avr32/boards/atstk1000/atstk1002.c2
-rw-r--r--arch/avr32/boards/favr-32/setup.c2
-rw-r--r--arch/avr32/boards/hammerhead/setup.c2
-rw-r--r--arch/avr32/boards/merisc/setup.c2
-rw-r--r--arch/avr32/boards/mimc200/setup.c2
-rw-r--r--arch/avr32/configs/atngw100_defconfig23
-rw-r--r--arch/avr32/configs/atngw100_evklcd100_defconfig17
-rw-r--r--arch/avr32/configs/atngw100_evklcd101_defconfig17
-rw-r--r--arch/avr32/configs/atngw100mkii_defconfig22
-rw-r--r--arch/avr32/configs/atngw100mkii_evklcd100_defconfig17
-rw-r--r--arch/avr32/configs/atngw100mkii_evklcd101_defconfig17
-rw-r--r--arch/avr32/configs/atstk1002_defconfig25
-rw-r--r--arch/avr32/configs/atstk1003_defconfig41
-rw-r--r--arch/avr32/configs/atstk1004_defconfig109
-rw-r--r--arch/avr32/configs/atstk1006_defconfig23
-rw-r--r--arch/avr32/configs/favr-32_defconfig2
-rw-r--r--arch/avr32/configs/hammerhead_defconfig2
-rw-r--r--arch/avr32/include/asm/syscalls.h14
-rw-r--r--arch/avr32/kernel/process.c9
-rw-r--r--arch/avr32/kernel/time.c5
-rw-r--r--arch/ia64/kernel/perfmon.c2
-rw-r--r--arch/ia64/mm/hugetlbpage.c2
-rw-r--r--arch/mips/include/asm/mman.h3
-rw-r--r--arch/mips/kernel/module.c14
-rw-r--r--arch/parisc/include/asm/mman.h3
-rw-r--r--arch/powerpc/mm/gup.c12
-rw-r--r--arch/sh/mm/hugetlbpage.c2
-rw-r--r--arch/sparc/kernel/module.c14
-rw-r--r--arch/sparc/mm/generic_32.c2
-rw-r--r--arch/sparc/mm/generic_64.c2
-rw-r--r--arch/sparc/mm/hugetlbpage.c2
-rw-r--r--arch/um/kernel/skas/mmu.c2
-rw-r--r--arch/x86/include/asm/kvm_host.h1
-rw-r--r--arch/x86/include/asm/paravirt.h25
-rw-r--r--arch/x86/include/asm/paravirt_types.h6
-rw-r--r--arch/x86/include/asm/pgtable-2level.h9
-rw-r--r--arch/x86/include/asm/pgtable-3level.h23
-rw-r--r--arch/x86/include/asm/pgtable.h143
-rw-r--r--arch/x86/include/asm/pgtable_64.h28
-rw-r--r--arch/x86/include/asm/pgtable_types.h3
-rw-r--r--arch/x86/include/asm/xen/page.h16
-rw-r--r--arch/x86/kernel/module.c17
-rw-r--r--arch/x86/kernel/paravirt.c3
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/vm86_32.c1
-rw-r--r--arch/x86/kvm/mmu.c125
-rw-r--r--arch/x86/kvm/paging_tmpl.h9
-rw-r--r--arch/x86/mm/gup.c28
-rw-r--r--arch/x86/mm/pgtable.c66
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/mmu.c365
-rw-r--r--arch/x86/xen/p2m.c510
-rw-r--r--arch/xtensa/include/asm/mman.h3
-rw-r--r--drivers/base/node.c21
-rw-r--r--drivers/md/Kconfig24
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/bitmap.c12
-rw-r--r--drivers/md/dm-crypt.c618
-rw-r--r--drivers/md/dm-delay.c2
-rw-r--r--drivers/md/dm-ioctl.c111
-rw-r--r--drivers/md/dm-kcopyd.c57
-rw-r--r--drivers/md/dm-log-userspace-base.c139
-rw-r--r--drivers/md/dm-log-userspace-transfer.c1
-rw-r--r--drivers/md/dm-log.c2
-rw-r--r--drivers/md/dm-mpath.c67
-rw-r--r--drivers/md/dm-raid.c697
-rw-r--r--drivers/md/dm-raid1.c19
-rw-r--r--drivers/md/dm-snap-persistent.c4
-rw-r--r--drivers/md/dm-snap.c62
-rw-r--r--drivers/md/dm-stripe.c27
-rw-r--r--drivers/md/dm-table.c19
-rw-r--r--drivers/md/dm.c23
-rw-r--r--drivers/md/md.c197
-rw-r--r--drivers/md/md.h13
-rw-r--r--drivers/md/raid1.c33
-rw-r--r--drivers/md/raid10.c17
-rw-r--r--drivers/md/raid5.c16
-rw-r--r--drivers/serial/atmel_serial.c5
-rw-r--r--drivers/xen/Kconfig11
-rw-r--r--drivers/xen/Makefile5
-rw-r--r--drivers/xen/gntdev.c665
-rw-r--r--drivers/xen/grant-table.c46
-rw-r--r--drivers/xen/platform-pci.c21
-rw-r--r--fs/ecryptfs/main.c5
-rw-r--r--fs/fs-writeback.c105
-rw-r--r--fs/mpage.c49
-rw-r--r--fs/nfs/dir.c6
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/meminfo.c14
-rw-r--r--fs/proc/page.c14
-rw-r--r--fs/proc/task_mmu.c7
-rw-r--r--include/asm-generic/gpio.h10
-rw-r--r--include/asm-generic/mman-common.h3
-rw-r--r--include/asm-generic/pgtable.h225
-rw-r--r--include/linux/compaction.h25
-rw-r--r--include/linux/device-mapper.h12
-rw-r--r--include/linux/dm-ioctl.h14
-rw-r--r--include/linux/dm-log-userspace.h13
-rw-r--r--include/linux/gfp.h15
-rw-r--r--include/linux/gpio.h6
-rw-r--r--include/linux/huge_mm.h179
-rw-r--r--include/linux/irqdesc.h2
-rw-r--r--include/linux/kernel.h7
-rw-r--r--include/linux/kernel_stat.h19
-rw-r--r--include/linux/khugepaged.h67
-rw-r--r--include/linux/memcontrol.h36
-rw-r--r--include/linux/memory_hotplug.h14
-rw-r--r--include/linux/migrate.h12
-rw-r--r--include/linux/mm.h140
-rw-r--r--include/linux/mm_inline.h19
-rw-r--r--include/linux/mm_types.h3
-rw-r--r--include/linux/mmc/sh_mmcif.h4
-rw-r--r--include/linux/mmu_notifier.h66
-rw-r--r--include/linux/mmzone.h14
-rw-r--r--include/linux/page-flags.h71
-rw-r--r--include/linux/page_cgroup.h54
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--include/linux/radix-tree.h16
-rw-r--r--include/linux/rmap.h2
-rw-r--r--include/linux/sched.h6
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/vmalloc.h10
-rw-r--r--include/linux/vmstat.h7
-rw-r--r--include/trace/events/compaction.h74
-rw-r--r--include/trace/events/vmscan.h6
-rw-r--r--include/trace/events/writeback.h1
-rw-r--r--include/xen/gntdev.h119
-rw-r--r--include/xen/grant_table.h44
-rw-r--r--kernel/fork.c41
-rw-r--r--kernel/futex.c55
-rw-r--r--kernel/irq/irqdesc.c40
-rw-r--r--mm/Kconfig38
-rw-r--r--mm/Makefile3
-rw-r--r--mm/compaction.c174
-rw-r--r--mm/dmapool.c16
-rw-r--r--mm/filemap.c17
-rw-r--r--mm/huge_memory.c2346
-rw-r--r--mm/hugetlb.c111
-rw-r--r--mm/internal.h16
-rw-r--r--mm/ksm.c81
-rw-r--r--mm/madvise.c10
-rw-r--r--mm/memcontrol.c258
-rw-r--r--mm/memory-failure.c22
-rw-r--r--mm/memory.c336
-rw-r--r--mm/memory_hotplug.c17
-rw-r--r--mm/mempolicy.c23
-rw-r--r--mm/migrate.c123
-rw-r--r--mm/mincore.c7
-rw-r--r--mm/mlock.c163
-rw-r--r--mm/mmap.c17
-rw-r--r--mm/mmu_notifier.c20
-rw-r--r--mm/mmzone.c21
-rw-r--r--mm/mprotect.c20
-rw-r--r--mm/mremap.c9
-rw-r--r--mm/nommu.c6
-rw-r--r--mm/page-writeback.c7
-rw-r--r--mm/page_alloc.c163
-rw-r--r--mm/pagewalk.c1
-rw-r--r--mm/percpu-vm.c2
-rw-r--r--mm/pgtable-generic.c123
-rw-r--r--mm/rmap.c91
-rw-r--r--mm/slub.c11
-rw-r--r--mm/sparse.c4
-rw-r--r--mm/swap.c322
-rw-r--r--mm/swap_state.c6
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/vmalloc.c89
-rw-r--r--mm/vmscan.c432
-rw-r--r--mm/vmstat.c51
-rw-r--r--virt/kvm/kvm_main.c39
181 files changed, 9940 insertions, 2132 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 7781857dc940..bac328c232f5 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -385,6 +385,10 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
pgpgin - # of pages paged in (equivalent to # of charging events).
pgpgout - # of pages paged out (equivalent to # of uncharging events).
swap - # of bytes of swap usage
+dirty - # of bytes that are waiting to get written back to the disk.
+writeback - # of bytes that are actively being written back to the disk.
+nfs_unstable - # of bytes sent to the NFS server, but not yet committed to
+ the actual storage.
inactive_anon - # of bytes of anonymous memory and swap cache memory on
LRU list.
active_anon - # of bytes of anonymous and swap cache memory on active
@@ -406,6 +410,9 @@ total_mapped_file - sum of all children's "cache"
total_pgpgin - sum of all children's "pgpgin"
total_pgpgout - sum of all children's "pgpgout"
total_swap - sum of all children's "swap"
+total_dirty - sum of all children's "dirty"
+total_writeback - sum of all children's "writeback"
+total_nfs_unstable - sum of all children's "nfs_unstable"
total_inactive_anon - sum of all children's "inactive_anon"
total_active_anon - sum of all children's "active_anon"
total_inactive_file - sum of all children's "inactive_file"
@@ -453,6 +460,73 @@ memory under it will be reclaimed.
You can reset failcnt by writing 0 to failcnt file.
# echo 0 > .../memory.failcnt
+5.5 dirty memory
+
+Control the maximum amount of dirty pages a cgroup can have at any given time.
+
+Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim)
+page cache used by a cgroup. So, in case of multiple cgroup writers, they will
+not be able to consume more than their designated share of dirty pages and will
+be forced to perform write-out if they cross that limit.
+
+The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*. It
+is possible to configure a limit to trigger both a direct writeback or a
+background writeback performed by per-bdi flusher threads. The root cgroup
+memory.dirty_* control files are read-only and match the contents of
+the /proc/sys/vm/dirty_* files.
+
+Per-cgroup dirty limits can be set using the following files in the cgroupfs:
+
+- memory.dirty_ratio: the amount of dirty memory (expressed as a percentage of
+ cgroup memory) at which a process generating dirty pages will itself start
+ writing out dirty data.
+
+- memory.dirty_limit_in_bytes: the amount of dirty memory (expressed in bytes)
+ in the cgroup at which a process generating dirty pages will start itself
+ writing out dirty data. Suffix (k, K, m, M, g, or G) can be used to indicate
+ that value is kilo, mega or gigabytes.
+
+ Note: memory.dirty_limit_in_bytes is the counterpart of memory.dirty_ratio.
+ Only one of them may be specified at a time. When one is written it is
+ immediately taken into account to evaluate the dirty memory limits and the
+ other appears as 0 when read.
+
+- memory.dirty_background_ratio: the amount of dirty memory of the cgroup
+ (expressed as a percentage of cgroup memory) at which background writeback
+ kernel threads will start writing out dirty data.
+
+- memory.dirty_background_limit_in_bytes: the amount of dirty memory (expressed
+ in bytes) in the cgroup at which background writeback kernel threads will
+ start writing out dirty data. Suffix (k, K, m, M, g, or G) can be used to
+ indicate that value is kilo, mega or gigabytes.
+
+ Note: memory.dirty_background_limit_in_bytes is the counterpart of
+ memory.dirty_background_ratio. Only one of them may be specified at a time.
+ When one is written it is immediately taken into account to evaluate the dirty
+ memory limits and the other appears as 0 when read.
+
+A cgroup may contain more dirty memory than its dirty limit. This is possible
+because of the principle that the first cgroup to touch a page is charged for
+it. Subsequent page counting events (dirty, writeback, nfs_unstable) are also
+counted to the originally charged cgroup.
+
+Example: If page is allocated by a cgroup A task, then the page is charged to
+cgroup A. If the page is later dirtied by a task in cgroup B, then the cgroup A
+dirty count will be incremented. If cgroup A is over its dirty limit but cgroup
+B is not, then dirtying a cgroup A page from a cgroup B task may push cgroup A
+over its dirty limit without throttling the dirtying cgroup B task.
+
+When use_hierarchy=0, each cgroup has dirty memory usage and limits.
+System-wide dirty limits are also consulted. Dirty memory consumption is
+checked against both system-wide and per-cgroup dirty limits.
+
+The current implementation does not enforce per-cgroup dirty limits when
+use_hierarchy=1. System-wide dirty limits are used for processes in such
+cgroups. Attempts to read memory.dirty_* files return the system-wide
+values. Writes to the memory.dirty_* files return error. An enhanced
+implementation is needed to check the chain of parents to ensure that no
+dirty limit is exceeded.
+
6. Hierarchy support
The memory controller supports a deep hierarchy and hierarchical accounting.
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 524de926290d..59293ac4a5d0 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -8,7 +8,7 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
<cipher>
Encryption cipher and an optional IV generation mode.
- (In format cipher-chainmode-ivopts:ivmode).
+ (In format cipher[:keycount]-chainmode-ivopts:ivmode).
Examples:
des
aes-cbc-essiv:sha256
@@ -20,6 +20,11 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
Key used for encryption. It is encoded as a hexadecimal number.
You can only use key sizes that are valid for the selected cipher.
+<keycount>
+ Multi-key compatibility mode. You can define <keycount> keys and
+ then sectors are encrypted according to their offsets (sector 0 uses key0;
+ sector 1 uses key1 etc.). <keycount> must be a power of two.
+
<iv_offset>
The IV offset is a sector count that is added to the sector number
before creating the IV.
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
new file mode 100644
index 000000000000..33b6b7071ac8
--- /dev/null
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -0,0 +1,70 @@
+Device-mapper RAID (dm-raid) is a bridge from DM to MD. It
+provides a way to use device-mapper interfaces to access the MD RAID
+drivers.
+
+As with all device-mapper targets, the nominal public interfaces are the
+constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
+and STATUSTYPE_TABLE). The CTR table looks like the following:
+
+1: <s> <l> raid \
+2: <raid_type> <#raid_params> <raid_params> \
+3: <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
+
+Line 1 contains the standard first three arguments to any device-mapper
+target - the start, length, and target type fields. The target type in
+this case is "raid".
+
+Line 2 contains the arguments that define the particular raid
+type/personality/level, the required arguments for that raid type, and
+any optional arguments. Possible raid types include: raid4, raid5_la,
+raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (raid1 is
+planned for the future.) The list of required and optional parameters
+is the same for all the current raid types. The required parameters are
+positional, while the optional parameters are given as key/value pairs.
+The possible parameters are as follows:
+ <chunk_size> Chunk size in sectors.
+ [[no]sync] Force/Prevent RAID initialization
+ [rebuild <idx>] Rebuild the drive indicated by the index
+ [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
+ [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
+ [stripe_cache <sectors>] Stripe cache size for higher RAIDs
+
+Line 3 contains the list of devices that compose the array in
+metadata/data device pairs. If the metadata is stored separately, a '-'
+is given for the metadata device position. If a drive has failed or is
+missing at creation time, a '-' can be given for both the metadata and
+data drives for a given position.
+
+NB. Currently all metadata devices must be specified as '-'.
+
+Examples:
+# RAID4 - 4 data drives, 1 parity
+# No metadata devices specified to hold superblock/bitmap info
+# Chunk size of 1MiB
+# (Lines separated for easy reading)
+0 1960893648 raid \
+ raid4 1 2048 \
+ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# Chunk size of 1MiB, force RAID initialization,
+# min recovery rate at 20 kiB/sec/disk
+0 1960893648 raid \
+ raid4 4 2048 min_recovery_rate 20 sync\
+ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+Performing a 'dmsetup table' should display the CTR table used to
+construct the mapping (with possible reordering of optional
+parameters).
+
+Performing a 'dmsetup status' will yield information on the state and
+health of the array. The output is as follows:
+1: <s> <l> raid \
+2: <raid_type> <#devices> <1 health char for each dev> <resync_ratio>
+
+Line 1 is standard DM output. Line 2 is best shown by example:
+ 0 1960893648 raid raid4 5 AAAAA 2/490221568
+Here we can see the RAID type is raid4, there are 5 devices - all of
+which are 'A'live, and the array is 2/490221568 complete with recovery.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 9471225212c4..23cae6548d3a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -375,6 +375,7 @@ Anonymous: 0 kB
Swap: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
+Locked: 374 kB
The first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
@@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a
> cat /proc/meminfo
+The "Locked" indicates whether the mapping is locked in memory or not.
+
MemTotal: 16344972 kB
MemFree: 13634064 kB
@@ -1320,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj.
Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
other with its scaled value.
+The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
+value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
+requires CAP_SYS_RESOURCE.
+
NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
Documentation/feature-removal-schedule.txt.
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index a492d92bb098..792faa3c06cf 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction:
int gpio_direction_input(unsigned gpio);
int gpio_direction_output(unsigned gpio, int value);
-The return value is zero for success, else a negative errno. It must
+The return value is zero for success, else a negative errno. It should
be checked, since the get/set calls don't have error returns and since
misconfiguration is possible. You should normally issue these calls from
a task context. However, for spinlock-safe GPIOs it's OK to use them
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
new file mode 100644
index 000000000000..0924aaca3302
--- /dev/null
+++ b/Documentation/vm/transhuge.txt
@@ -0,0 +1,298 @@
+= Transparent Hugepage Support =
+
+== Objective ==
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent Hugepage Support is an alternative means of
+using huge pages for the backing of virtual memory with huge pages
+that supports the automatic promotion and demotion of page sizes and
+without the shortcomings of hugetlbfs.
+
+Currently it only works for anonymous memory mappings but in the
+future it can expand over the pagecache layer starting with tmpfs.
+
+The reason applications are running faster is because of two
+factors. The first factor is almost completely irrelevant and it's not
+of significant interest because it'll also have the downside of
+requiring larger clear-page copy-page in page faults which is a
+potentially negative effect. The first factor consists in taking a
+single page fault for each 2M virtual region touched by userland (so
+reducing the enter/exit kernel frequency by a 512 times factor). This
+only matters the first time the memory is accessed for the lifetime of
+a memory mapping. The second long lasting and much more important
+factor will affect all subsequent accesses to the memory for the whole
+runtime of the application. The second factor consist of two
+components: 1) the TLB miss will run faster (especially with
+virtualization using nested pagetables but almost always also on bare
+metal without virtualization) and 2) a single TLB entry will be
+mapping a much larger amount of virtual memory in turn reducing the
+number of TLB misses. With virtualization and nested pagetables the
+TLB can be mapped of larger size only if both KVM and the Linux guest
+are using hugepages but a significant speedup already happens if only
+one of the two is using hugepages just because of the fact the TLB
+miss is going to run faster.
+
+== Design ==
+
+- "graceful fallback": mm components which don't have transparent
+ hugepage knowledge fall back to breaking a transparent hugepage and
+ working on the regular pages and their respective regular pmd/pte
+ mappings
+
+- if a hugepage allocation fails because of memory fragmentation,
+ regular pages should be gracefully allocated instead and mixed in
+ the same vma without any failure or significant delay and without
+ userland noticing
+
+- if some task quits and more hugepages become available (either
+ immediately in the buddy or through the VM), guest physical memory
+ backed by regular pages should be relocated on hugepages
+ automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+ whenever possible (the only possible reservation here is kernelcore=
+ to avoid unmovable pages to fragment all the memory but such a tweak
+ is not specific to transparent hugepage support and it's a generic
+ feature that applies to all dynamic high order allocations in the
+ kernel)
+
+- this initial support only offers the feature in the anonymous memory
+ regions but it'd be ideal to move it to tmpfs and the pagecache
+ later
+
+Transparent Hugepage Support maximizes the usefulness of free memory
+if compared to the reservation approach of hugetlbfs by allowing all
+unused memory to be used as cache or other movable (or even unmovable
+entities). It doesn't require reservation to prevent hugepage
+allocation failures to be noticeable from userland. It allows paging
+and all other advanced VM features to be available on the
+hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, like for example they've been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing userland
+is by far not mandatory and khugepaged already can take care of long
+lived page allocations even for hugepage unaware applications that
+deals with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, application
+may end up allocating more memory resources. An application may mmap a
+large region but only touch 1 byte of it, in that case a 2M page might
+be allocated instead of a 4k page for no good. This is why it's
+possible to disable hugepages system-wide and to only have them inside
+MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that gets a lot of benefit from hugepages and that don't
+risk to lose memory by using hugepages, should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+
+== sysfs ==
+
+Transparent Hugepage Support can be entirely disabled (mostly for
+debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to
+avoid the risk of consuming more memory resources) or enabled system
+wide. This can be achieved with one of:
+
+echo always >/sys/kernel/mm/transparent_hugepage/enabled
+echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit defrag efforts in the VM to generate
+hugepages in case they're not immediately free to madvise regions or
+to never try to defrag memory and simply fallback to regular pages
+unless hugepages are immediately available. Clearly if we spend CPU
+time to defrag memory, we would expect to gain even more by the fact
+we use hugepages later instead of regular pages. This isn't always
+guaranteed, but it may be more likely in case the allocation is for a
+MADV_HUGEPAGE region.
+
+echo always >/sys/kernel/mm/transparent_hugepage/defrag
+echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise, and it'll
+be automatically shutdown if it's set to "never".
+
+khugepaged runs usually at low frequency so while one may not want to
+invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged:
+
+echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core):
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's an hugepage
+allocation failure to throttle the next allocation attempt.
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+for each pass:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+== Boot parameter ==
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter "transparent_hugepage=always" or
+"transparent_hugepage=madvise" or "transparent_hugepage=never"
+(without "") to the kernel command line.
+
+== Need of application restart ==
+
+The transparent_hugepage/enabled values only affect future
+behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies to
+the regions registered in khugepaged.
+
+== get_user_pages and follow_page ==
+
+get_user_pages and follow_page if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most gup users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge. But
+if any driver is going to mangle over the page structure of the tail
+page (like for checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to jump
+to check head page instead (while serializing properly against
+split_huge_page() to avoid the head and tail pages to disappear from
+under it, see the futex code to see an example of that, hugetlbfs also
+needed special handling in futex code for similar reasons).
+
+NOTE: these aren't new constraints to the GUP API, and they match the
+same constrains that applies to hugetlbfs too, so any driver capable
+of handling GUP on hugetlbfs will also work fine on transparent
+hugepage backed mappings.
+
+In case you can't handle compound pages if they're returned by
+follow_page, the FOLL_SPLIT bit can be specified as parameter to
+follow_page, so that it will split the hugepages before returning
+them. Migration for example passes FOLL_SPLIT as parameter to
+follow_page because it's not hugepage aware and in fact it can't work
+at all on hugetlbfs (but it instead works fine on transparent
+hugepages thanks to FOLL_SPLIT). migration simply can't deal with
+hugepages being returned (as it's not only checking the pfn of the
+page and pinning it during the copy but it pretends to migrate the
+memory in regular page sizes and with regular pte/pmd mappings).
+
+== Optimizing the applications ==
+
+To be guaranteed that the kernel will map a 2M page immediately in any
+memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
+
+== Hugetlbfs ==
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
+
+== Graceful fallback ==
+
+Code walking pagetables but unware about huge pmds can simply call
+split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one liner change, you can avoid to write
+hundred if not thousand of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+but you can't handle it natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swapout the hugepage for example.
+
+Example to make mremap.c transparent hugepage aware with a one liner
+change:
+
+diff --git a/mm/mremap.c b/mm/mremap.c
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+ return NULL;
+
+ pmd = pmd_offset(pud, addr);
++ split_huge_page_pmd(mm, pmd);
+ if (pmd_none_or_clear_bad(pmd))
+ return NULL;
+
+== Locking in hugepage aware code ==
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_page_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure an huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fallback in the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+mm->page_table_lock and re-run pmd_trans_huge. Taking the
+page_table_lock will prevent the huge pmd to be converted into a
+regular pmd from under you (split_huge_page can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page_table_lock and fallback to the old code as
+before. Otherwise you should run pmd_trans_splitting on the pmd. In
+case pmd_trans_splitting returns true, it means split_huge_page is
+already in the middle of splitting the page. So if pmd_trans_splitting
+returns true it's enough to drop the page_table_lock and call
+wait_split_huge_page and then fallback the old code paths. You are
+guaranteed by the time wait_split_huge_page returns, the pmd isn't
+huge anymore. If pmd_trans_splitting returns false, you can proceed to
+process the huge pmd and the hugepage natively. Once finished you can
+drop the page_table_lock.
+
+== compound_lock, get_user_pages and put_page ==
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the
+page structures. It can do that easily for refcounts taken by huge pmd
+mappings. But the GUI API as created by hugetlbfs (that returns head
+and tail pages if running get_user_pages on an address backed by any
+hugepage), requires the refcount to be accounted on the tail pages and
+not only in the head pages, if we want to be able to run
+split_huge_page while there are gup pins established on any tail
+page. Failure to be able to run split_huge_page if there's any gup pin
+on any tail page, would mean having to split all hugepages upfront in
+get_user_pages which is unacceptable as too many gup users are
+performance critical and they must work natively on hugepages like
+they work natively on hugetlbfs already (hugetlbfs is simpler because
+hugetlbfs pages cannot be splitted so there wouldn't be requirement of
+accounting the pins on the tail pages for hugetlbfs). If we wouldn't
+account the gup refcounts on the tail pages during gup, we won't know
+anymore which tail page is pinned by gup and which is not while we run
+split_huge_page. But we still have to add the gup pin to the head page
+too, to know when we can free the compound page in case it's never
+splitted during its lifetime. That requires changing not just
+get_page, but put_page as well so that when put_page runs on a tail
+page (and only on a tail page) it will find its respective head page,
+and then it will decrease the head page refcount in addition to the
+tail page refcount. To obtain a head page reliably and to decrease its
+refcount without race conditions, put_page has to serialize against
+__split_huge_page_refcount using a special per-page lock called
+compound_lock.
diff --git a/MAINTAINERS b/MAINTAINERS
index 3dd5c6fce989..af656ded404e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6592,13 +6592,12 @@ F: Documentation/i2c/busses/i2c-viapro
F: drivers/i2c/busses/i2c-viapro.c
VIA SD/MMC CARD CONTROLLER DRIVER
-M: Joseph Chan <JosephChan@via.com.tw>
+M: Bruce Chang <brucechang@via.com.tw>
M: Harald Welte <HaraldWelte@viatech.com>
S: Maintained
F: drivers/mmc/host/via-sdmmc.c
VIA UNICHROME(PRO)/CHROME9 FRAMEBUFFER DRIVER
-M: Joseph Chan <JosephChan@via.com.tw>
M: Florian Tobias Schandinat <FlorianSchandinat@gmx.de>
L: linux-fbdev@vger.kernel.org
S: Maintained
diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h
index 99c56d47879d..72db984f8781 100644
--- a/arch/alpha/include/asm/mman.h
+++ b/arch/alpha/include/asm/mman.h
@@ -53,6 +53,9 @@
#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
+#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index 0c1bb68ff4a8..2cfe8161b478 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -38,17 +38,9 @@
#ifdef CONFIG_MMU
void *module_alloc(unsigned long size)
{
- struct vm_struct *area;
-
- size = PAGE_ALIGN(size);
- if (!size)
- return NULL;
-
- area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
- if (!area)
- return NULL;
-
- return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+ return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
+ __builtin_return_address(0));
}
#else /* CONFIG_MMU */
void *module_alloc(unsigned long size)
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index 93292a18cf77..709244c66fa3 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
if (!new_pmd)
goto no_pmd;
- new_pte = pte_alloc_map(mm, new_pmd, 0);
+ new_pte = pte_alloc_map(mm, NULL, new_pmd, 0);
if (!new_pte)
goto no_pte;
diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c
index 8c6a2440e345..659d119ce712 100644
--- a/arch/avr32/boards/atngw100/setup.c
+++ b/arch/avr32/boards/atngw100/setup.c
@@ -188,7 +188,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
*/
regs = (void __iomem __force *)res->start;
pclk = clk_get(&pdev->dev, "pclk");
- if (!pclk)
+ if (IS_ERR(pclk))
return;
clk_enable(pclk);
diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c
index 2adc261c9e3d..6ce30fb2ec94 100644
--- a/arch/avr32/boards/atstk1000/atstk1002.c
+++ b/arch/avr32/boards/atstk1000/atstk1002.c
@@ -203,7 +203,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
*/
regs = (void __iomem __force *)res->start;
pclk = clk_get(&pdev->dev, "pclk");
- if (!pclk)
+ if (IS_ERR(pclk))
return;
clk_enable(pclk);
diff --git a/arch/avr32/boards/favr-32/setup.c b/arch/avr32/boards/favr-32/setup.c
index 75f19f47fb2f..86fab77a5a00 100644
--- a/arch/avr32/boards/favr-32/setup.c
+++ b/arch/avr32/boards/favr-32/setup.c
@@ -206,7 +206,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
*/
regs = (void __iomem __force *)res->start;
pclk = clk_get(&pdev->dev, "pclk");
- if (!pclk)
+ if (IS_ERR(pclk))
return;
clk_enable(pclk);
diff --git a/arch/avr32/boards/hammerhead/setup.c b/arch/avr32/boards/hammerhead/setup.c
index dd009875a405..da14fbdd4e8e 100644
--- a/arch/avr32/boards/hammerhead/setup.c
+++ b/arch/avr32/boards/hammerhead/setup.c
@@ -150,7 +150,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
regs = (void __iomem __force *)res->start;
pclk = clk_get(&pdev->dev, "pclk");
- if (!pclk)
+ if (IS_ERR(pclk))
return;
clk_enable(pclk);
diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c
index 623b077594fc..e61bc948f959 100644
--- a/arch/avr32/boards/merisc/setup.c
+++ b/arch/avr32/boards/merisc/setup.c
@@ -134,7 +134,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
regs = (void __iomem __force *)res->start;
pclk = clk_get(&pdev->dev, "pclk");
- if (!pclk)
+ if (IS_ERR(pclk))
return;
clk_enable(pclk);
diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c
index 523d8e183bef..c4da5cba2dbf 100644
--- a/arch/avr32/boards/mimc200/setup.c
+++ b/arch/avr32/boards/mimc200/setup.c
@@ -162,7 +162,7 @@ static void __init set_hw_addr(struct platform_device *pdev)
*/
regs = (void __iomem __force *)res->start;
pclk = clk_get(&pdev->dev, "pclk");
- if (!pclk)
+ if (IS_ERR(pclk))
return;
clk_enable(pclk);
diff --git a/arch/avr32/configs/atngw100_defconfig b/arch/avr32/configs/atngw100_defconfig
index 9854013d2728..6f9ca56de1f6 100644
--- a/arch/avr32/configs/atngw100_defconfig
+++ b/arch/avr32/configs/atngw100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
# CONFIG_BASE_FULL is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
CONFIG_NO_HZ=y
@@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -72,8 +70,8 @@ CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_TCLIB=y
-CONFIG_EEPROM_AT24=m
CONFIG_NETDEVICES=y
CONFIG_TUN=m
CONFIG_NET_ETHERNET=y
@@ -106,6 +104,7 @@ CONFIG_GPIO_SYSFS=y
CONFIG_WATCHDOG=y
CONFIG_AT32AP700X_WDT=y
CONFIG_USB_GADGET=y
+CONFIG_USB_GADGET_VBUS_DRAW=350
CONFIG_USB_ZERO=m
CONFIG_USB_ETH=m
CONFIG_USB_GADGETFS=m
@@ -115,14 +114,12 @@ CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y
CONFIG_LEDS_GPIO=y
CONFIG_LEDS_TRIGGERS=y
CONFIG_LEDS_TRIGGER_TIMER=y
CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_AT32AP700X=y
CONFIG_DMADEVICES=y
@@ -130,21 +127,23 @@ CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=m
+CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
-CONFIG_UFS_FS=y
+CONFIG_UBIFS_FS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
CONFIG_NFSD=m
CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -155,5 +154,3 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_PCBC=m
diff --git a/arch/avr32/configs/atngw100_evklcd100_defconfig b/arch/avr32/configs/atngw100_evklcd100_defconfig
index 7ceda354597b..7eece0af34c9 100644
--- a/arch/avr32/configs/atngw100_evklcd100_defconfig
+++ b/arch/avr32/configs/atngw100_evklcd100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
# CONFIG_BASE_FULL is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
CONFIG_NO_HZ=y
@@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -74,8 +72,10 @@ CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_TCLIB=y
CONFIG_NETDEVICES=y
+CONFIG_TUN=m
CONFIG_NET_ETHERNET=y
CONFIG_MACB=y
# CONFIG_NETDEV_1000 is not set
@@ -104,6 +104,7 @@ CONFIG_I2C_GPIO=m
CONFIG_SPI=y
CONFIG_SPI_ATMEL=y
CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
CONFIG_AT32AP700X_WDT=y
@@ -127,6 +128,7 @@ CONFIG_USB_FILE_STORAGE=m
CONFIG_USB_G_SERIAL=m
CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
+CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y
@@ -141,11 +143,14 @@ CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
@@ -155,7 +160,6 @@ CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
CONFIG_NFSD=m
CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -166,4 +170,3 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100_evklcd101_defconfig b/arch/avr32/configs/atngw100_evklcd101_defconfig
index 7bc5b2ce68d5..387eb9d6e423 100644
--- a/arch/avr32/configs/atngw100_evklcd101_defconfig
+++ b/arch/avr32/configs/atngw100_evklcd101_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
# CONFIG_BASE_FULL is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
CONFIG_NO_HZ=y
@@ -30,6 +27,7 @@ CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -73,8 +71,10 @@ CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_TCLIB=y
CONFIG_NETDEVICES=y
+CONFIG_TUN=m
CONFIG_NET_ETHERNET=y
CONFIG_MACB=y
# CONFIG_NETDEV_1000 is not set
@@ -103,6 +103,7 @@ CONFIG_I2C_GPIO=m
CONFIG_SPI=y
CONFIG_SPI_ATMEL=y
CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
CONFIG_AT32AP700X_WDT=y
@@ -126,6 +127,7 @@ CONFIG_USB_FILE_STORAGE=m
CONFIG_USB_G_SERIAL=m
CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
+CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y
@@ -140,11 +142,14 @@ CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
@@ -154,7 +159,6 @@ CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
CONFIG_NFSD=m
CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -165,4 +169,3 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100mkii_defconfig b/arch/avr32/configs/atngw100mkii_defconfig
index 4bd36821d4a2..f0fe237133a9 100644
--- a/arch/avr32/configs/atngw100mkii_defconfig
+++ b/arch/avr32/configs/atngw100mkii_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
# CONFIG_BASE_FULL is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
CONFIG_NO_HZ=y
@@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -74,6 +72,7 @@ CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_TCLIB=y
CONFIG_NETDEVICES=y
CONFIG_TUN=m
@@ -107,6 +106,7 @@ CONFIG_GPIO_SYSFS=y
CONFIG_WATCHDOG=y
CONFIG_AT32AP700X_WDT=y
CONFIG_USB_GADGET=y
+CONFIG_USB_GADGET_VBUS_DRAW=350
CONFIG_USB_ZERO=m
CONFIG_USB_ETH=m
CONFIG_USB_GADGETFS=m
@@ -116,14 +116,12 @@ CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y
CONFIG_LEDS_GPIO=y
CONFIG_LEDS_TRIGGERS=y
CONFIG_LEDS_TRIGGER_TIMER=y
CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_AT32AP700X=y
CONFIG_DMADEVICES=y
@@ -131,21 +129,23 @@ CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=m
+CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
-CONFIG_UFS_FS=y
+CONFIG_UBIFS_FS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
CONFIG_NFSD=m
CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -156,5 +156,3 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_PCBC=m
diff --git a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
index f8437ef3237f..e4a7c1dc8380 100644
--- a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
+++ b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
# CONFIG_BASE_FULL is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
CONFIG_NO_HZ=y
@@ -32,6 +29,7 @@ CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -77,8 +75,10 @@ CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_TCLIB=y
CONFIG_NETDEVICES=y
+CONFIG_TUN=m
CONFIG_NET_ETHERNET=y
CONFIG_MACB=y
# CONFIG_NETDEV_1000 is not set
@@ -107,6 +107,7 @@ CONFIG_I2C_GPIO=m
CONFIG_SPI=y
CONFIG_SPI_ATMEL=y
CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
CONFIG_AT32AP700X_WDT=y
@@ -130,6 +131,7 @@ CONFIG_USB_FILE_STORAGE=m
CONFIG_USB_G_SERIAL=m
CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
+CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y
@@ -144,11 +146,14 @@ CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
@@ -158,7 +163,6 @@ CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
CONFIG_NFSD=m
CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -169,4 +173,3 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
index 7f58f996d945..6f37f70c2c37 100644
--- a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
+++ b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
@@ -2,20 +2,17 @@ CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
# CONFIG_BASE_FULL is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_IOSCHED_DEADLINE is not set
CONFIG_NO_HZ=y
@@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -76,8 +74,10 @@ CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_TCLIB=y
CONFIG_NETDEVICES=y
+CONFIG_TUN=m
CONFIG_NET_ETHERNET=y
CONFIG_MACB=y
# CONFIG_NETDEV_1000 is not set
@@ -106,6 +106,7 @@ CONFIG_I2C_GPIO=m
CONFIG_SPI=y
CONFIG_SPI_ATMEL=y
CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
CONFIG_AT32AP700X_WDT=y
@@ -129,6 +130,7 @@ CONFIG_USB_FILE_STORAGE=m
CONFIG_USB_G_SERIAL=m
CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
+CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y
@@ -143,11 +145,14 @@ CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
@@ -157,7 +162,6 @@ CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
CONFIG_NFSD=m
CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -168,4 +172,3 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig
index aec4c43a75da..4fb01f5ab42f 100644
--- a/arch/avr32/configs/atstk1002_defconfig
+++ b/arch/avr32/configs/atstk1002_defconfig
@@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
@@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
@@ -26,6 +25,7 @@ CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -35,6 +35,7 @@ CONFIG_INET=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE_DEMUX=m
CONFIG_NET_IPGRE=m
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
@@ -58,16 +59,14 @@ CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=m
-CONFIG_MTD_M25P80=m
CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_PWM=m
CONFIG_ATMEL_TCLIB=y
CONFIG_ATMEL_SSC=m
-CONFIG_EEPROM_AT24=m
# CONFIG_SCSI_PROC_FS is not set
CONFIG_BLK_DEV_SD=m
CONFIG_BLK_DEV_SR=m
@@ -120,7 +119,6 @@ CONFIG_SND_MIXER_OSS=m
CONFIG_SND_PCM_OSS=m
# CONFIG_SND_SUPPORT_OLD_API is not set
# CONFIG_SND_VERBOSE_PROCFS is not set
-# CONFIG_SND_DRIVERS is not set
CONFIG_SND_AT73C213=m
# CONFIG_HID_SUPPORT is not set
CONFIG_USB_GADGET=y
@@ -131,16 +129,15 @@ CONFIG_USB_FILE_STORAGE=m
CONFIG_USB_G_SERIAL=m
CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
+CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=m
+CONFIG_LEDS_CLASS=y
CONFIG_LEDS_ATMEL_PWM=m
CONFIG_LEDS_GPIO=m
CONFIG_LEDS_TRIGGERS=y
CONFIG_LEDS_TRIGGER_TIMER=m
CONFIG_LEDS_TRIGGER_HEARTBEAT=m
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_AT32AP700X=y
CONFIG_DMADEVICES=y
@@ -149,20 +146,23 @@ CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
# CONFIG_EXT3_FS_XATTR is not set
CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=850
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
+CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
-# CONFIG_JFFS2_FS_WRITEBUFFER is not set
CONFIG_UBIFS_FS=y
-CONFIG_MINIX_FS=m
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
+CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_850=m
CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_UTF8=m
CONFIG_MAGIC_SYSRQ=y
@@ -170,6 +170,3 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-# CONFIG_CRYPTO_HW is not set
-CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/atstk1003_defconfig b/arch/avr32/configs/atstk1003_defconfig
index 50ba3db682ca..9faaf9b900f2 100644
--- a/arch/avr32/configs/atstk1003_defconfig
+++ b/arch/avr32/configs/atstk1003_defconfig
@@ -2,22 +2,15 @@ CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_AUDIT=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
# CONFIG_BASE_FULL is not set
-# CONFIG_SLUB_DEBUG is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
@@ -33,6 +26,7 @@ CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -54,18 +48,18 @@ CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=m
-CONFIG_MTD_M25P80=m
+CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_PWM=m
CONFIG_ATMEL_TCLIB=y
CONFIG_ATMEL_SSC=m
-CONFIG_EEPROM_AT24=m
# CONFIG_SCSI_PROC_FS is not set
CONFIG_BLK_DEV_SD=m
CONFIG_BLK_DEV_SR=m
+# CONFIG_SCSI_LOWLEVEL is not set
CONFIG_ATA=m
# CONFIG_SATA_PMP is not set
CONFIG_PATA_AT32=m
@@ -77,6 +71,7 @@ CONFIG_PPP_ASYNC=m
CONFIG_PPP_DEFLATE=m
CONFIG_PPP_BSDCOMP=m
CONFIG_INPUT=m
+CONFIG_INPUT_EVDEV=m
# CONFIG_KEYBOARD_ATKBD is not set
CONFIG_KEYBOARD_GPIO=m
# CONFIG_MOUSE_PS2 is not set
@@ -106,7 +101,6 @@ CONFIG_SND_PCM_OSS=m
CONFIG_SND_AT73C213=m
# CONFIG_HID_SUPPORT is not set
CONFIG_USB_GADGET=y
-CONFIG_USB_GADGET_DEBUG_FS=y
CONFIG_USB_ZERO=m
CONFIG_USB_ETH=m
CONFIG_USB_GADGETFS=m
@@ -116,36 +110,39 @@ CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y
CONFIG_LEDS_ATMEL_PWM=m
-CONFIG_LEDS_GPIO=y
+CONFIG_LEDS_GPIO=m
CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
+CONFIG_LEDS_TRIGGER_TIMER=m
+CONFIG_LEDS_TRIGGER_HEARTBEAT=m
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_AT32AP700X=y
CONFIG_DMADEVICES=y
-CONFIG_DW_DMAC=y
-CONFIG_EXT2_FS=m
-CONFIG_EXT3_FS=m
+CONFIG_EXT2_FS=y
+CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=850
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=m
+CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
+CONFIG_UBIFS_FS=y
# CONFIG_NETWORK_FILESYSTEMS is not set
CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_850=m
CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_UTF8=m
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
+CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/atstk1004_defconfig b/arch/avr32/configs/atstk1004_defconfig
index 329e10ba3b54..3d2a5d85f970 100644
--- a/arch/avr32/configs/atstk1004_defconfig
+++ b/arch/avr32/configs/atstk1004_defconfig
@@ -1,19 +1,32 @@
CONFIG_EXPERIMENTAL=y
# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
+CONFIG_RELAY=y
+CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
# CONFIG_BASE_FULL is not set
-# CONFIG_FUTEX is not set
-# CONFIG_EPOLL is not set
-# CONFIG_SIGNALFD is not set
-# CONFIG_TIMERFD is not set
-# CONFIG_EVENTFD is not set
# CONFIG_COMPAT_BRK is not set
-CONFIG_SLOB=y
-# CONFIG_BLOCK is not set
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=m
+# CONFIG_KPROBES is not set
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
CONFIG_BOARD_ATSTK1004=y
# CONFIG_OWNERSHIP_TRACE is not set
+CONFIG_NMI_DEBUGGING=y
+CONFIG_PM=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_AT32AP=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -31,40 +44,104 @@ CONFIG_MTD=y
CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_CMDLINE_PARTS=y
CONFIG_MTD_CHAR=y
+CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_PHYSMAP=y
-# CONFIG_MISC_DEVICES is not set
-# CONFIG_INPUT is not set
+CONFIG_MTD_UBI=y
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
+CONFIG_ATMEL_PWM=m
+CONFIG_ATMEL_TCLIB=y
+CONFIG_ATMEL_SSC=m
+# CONFIG_SCSI_PROC_FS is not set
+CONFIG_BLK_DEV_SD=m
+CONFIG_BLK_DEV_SR=m
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=m
+# CONFIG_SATA_PMP is not set
+CONFIG_PATA_AT32=m
+CONFIG_NETDEVICES=y
+# CONFIG_NETDEV_1000 is not set
+# CONFIG_NETDEV_10000 is not set
+CONFIG_PPP=m
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_INPUT=m
+CONFIG_INPUT_EVDEV=m
+# CONFIG_KEYBOARD_ATKBD is not set
+CONFIG_KEYBOARD_GPIO=m
+# CONFIG_MOUSE_PS2 is not set
+CONFIG_MOUSE_GPIO=m
# CONFIG_SERIO is not set
# CONFIG_VT is not set
# CONFIG_DEVKMEM is not set
CONFIG_SERIAL_ATMEL=y
CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_SERIAL_ATMEL_PDC is not set
# CONFIG_LEGACY_PTYS is not set
# CONFIG_HW_RANDOM is not set
+CONFIG_I2C=m
+CONFIG_I2C_CHARDEV=m
+CONFIG_I2C_GPIO=m
CONFIG_SPI=y
CONFIG_SPI_ATMEL=y
+CONFIG_SPI_SPIDEV=m
+CONFIG_GPIO_SYSFS=y
# CONFIG_HWMON is not set
CONFIG_WATCHDOG=y
CONFIG_AT32AP700X_WDT=y
CONFIG_FB=y
CONFIG_FB_ATMEL=y
CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LCD_CLASS_DEVICE=y
CONFIG_LCD_LTV350QV=y
# CONFIG_BACKLIGHT_CLASS_DEVICE is not set
CONFIG_USB_GADGET=y
-CONFIG_USB_ETH=y
-# CONFIG_USB_ETH_RNDIS is not set
+CONFIG_USB_ZERO=m
+CONFIG_USB_ETH=m
+CONFIG_USB_GADGETFS=m
+CONFIG_USB_FILE_STORAGE=m
+CONFIG_USB_G_SERIAL=m
+CONFIG_USB_CDC_COMPOSITE=m
+CONFIG_MMC=y
+CONFIG_MMC_TEST=m
+CONFIG_MMC_ATMELMCI=y
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+CONFIG_LEDS_ATMEL_PWM=m
+CONFIG_LEDS_GPIO=m
+CONFIG_LEDS_TRIGGERS=y
+CONFIG_LEDS_TRIGGER_TIMER=m
+CONFIG_LEDS_TRIGGER_HEARTBEAT=m
CONFIG_RTC_CLASS=y
-# CONFIG_RTC_INTF_PROC is not set
CONFIG_RTC_DRV_AT32AP700X=y
+CONFIG_DMADEVICES=y
+CONFIG_EXT2_FS=y
+CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
+# CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
+# CONFIG_EXT4_FS_XATTR is not set
# CONFIG_DNOTIFY is not set
+CONFIG_FUSE_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=850
CONFIG_PROC_KCORE=y
-# CONFIG_PROC_PAGE_MONITOR is not set
CONFIG_TMPFS=y
+CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
-# CONFIG_JFFS2_FS_WRITEBUFFER is not set
+CONFIG_UBIFS_FS=y
# CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_UTF8=m
CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_FRAME_POINTER=y
diff --git a/arch/avr32/configs/atstk1006_defconfig b/arch/avr32/configs/atstk1006_defconfig
index dbcc1b51e506..1ed8f22d4fe2 100644
--- a/arch/avr32/configs/atstk1006_defconfig
+++ b/arch/avr32/configs/atstk1006_defconfig
@@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED_V2=y
CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
# CONFIG_SYSCTL_SYSCALL is not set
@@ -11,7 +10,7 @@ CONFIG_BLK_DEV_INITRD=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
@@ -37,6 +36,7 @@ CONFIG_INET=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE_DEMUX=m
CONFIG_NET_IPGRE=m
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
@@ -60,15 +60,13 @@ CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=m
-CONFIG_MTD_DATAFLASH_OTP=y
-CONFIG_MTD_M25P80=m
CONFIG_MTD_NAND=y
CONFIG_MTD_NAND_ATMEL=y
CONFIG_MTD_UBI=y
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=m
+CONFIG_MISC_DEVICES=y
CONFIG_ATMEL_PWM=m
CONFIG_ATMEL_TCLIB=y
CONFIG_ATMEL_SSC=m
@@ -132,17 +130,17 @@ CONFIG_USB_ETH=m
CONFIG_USB_GADGETFS=m
CONFIG_USB_FILE_STORAGE=m
CONFIG_USB_G_SERIAL=m
+CONFIG_USB_CDC_COMPOSITE=m
CONFIG_MMC=y
+CONFIG_MMC_TEST=m
CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=m
CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=m
+CONFIG_LEDS_CLASS=y
CONFIG_LEDS_ATMEL_PWM=m
CONFIG_LEDS_GPIO=m
CONFIG_LEDS_TRIGGERS=y
CONFIG_LEDS_TRIGGER_TIMER=m
CONFIG_LEDS_TRIGGER_HEARTBEAT=m
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_AT32AP700X=y
CONFIG_DMADEVICES=y
@@ -156,15 +154,18 @@ CONFIG_EXT4_FS=y
CONFIG_FUSE_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=850
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
+CONFIG_CONFIGFS_FS=y
CONFIG_JFFS2_FS=y
CONFIG_UBIFS_FS=y
-CONFIG_MINIX_FS=m
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
+CONFIG_CIFS=m
CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_850=m
CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_UTF8=m
CONFIG_MAGIC_SYSRQ=y
@@ -172,7 +173,3 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_FRAME_POINTER=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-CONFIG_CRYPTO_FIPS=y
-# CONFIG_CRYPTO_HW is not set
-CONFIG_CRC_T10DIF=m
diff --git a/arch/avr32/configs/favr-32_defconfig b/arch/avr32/configs/favr-32_defconfig
index 0c813b661a0a..aeadc955db32 100644
--- a/arch/avr32/configs/favr-32_defconfig
+++ b/arch/avr32/configs/favr-32_defconfig
@@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
diff --git a/arch/avr32/configs/hammerhead_defconfig b/arch/avr32/configs/hammerhead_defconfig
index dcc01f0eb294..1692beeb7ed3 100644
--- a/arch/avr32/configs/hammerhead_defconfig
+++ b/arch/avr32/configs/hammerhead_defconfig
@@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_OPROFILE=m
-CONFIG_KPROBES=y
+# CONFIG_KPROBES is not set
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h
index ab608b70b24d..244f2acab546 100644
--- a/arch/avr32/include/asm/syscalls.h
+++ b/arch/avr32/include/asm/syscalls.h
@@ -15,20 +15,6 @@
#include <linux/types.h>
#include <linux/signal.h>
-/* kernel/process.c */
-asmlinkage int sys_fork(struct pt_regs *);
-asmlinkage int sys_clone(unsigned long, unsigned long,
- unsigned long, unsigned long,
- struct pt_regs *);
-asmlinkage int sys_vfork(struct pt_regs *);
-asmlinkage int sys_execve(const char __user *, char __user *__user *,
- char __user *__user *, struct pt_regs *);
-
-/* kernel/signal.c */
-asmlinkage int sys_sigaltstack(const stack_t __user *, stack_t __user *,
- struct pt_regs *);
-asmlinkage int sys_rt_sigreturn(struct pt_regs *);
-
/* mm/cache.c */
asmlinkage int sys_cacheflush(int, void __user *, size_t);
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 9c46aaad11ce..ef5a2a08fcca 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -367,14 +367,13 @@ asmlinkage int sys_fork(struct pt_regs *regs)
}
asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
- unsigned long parent_tidptr,
- unsigned long child_tidptr, struct pt_regs *regs)
+ void __user *parent_tidptr, void __user *child_tidptr,
+ struct pt_regs *regs)
{
if (!newsp)
newsp = regs->sp;
- return do_fork(clone_flags, newsp, regs, 0,
- (int __user *)parent_tidptr,
- (int __user *)child_tidptr);
+ return do_fork(clone_flags, newsp, regs, 0, parent_tidptr,
+ child_tidptr);
}
asmlinkage int sys_vfork(struct pt_regs *regs)
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index 668ed2817e51..05ad29112ff4 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -35,7 +35,6 @@ static struct clocksource counter = {
.rating = 50,
.read = read_cycle_count,
.mask = CLOCKSOURCE_MASK(32),
- .shift = 16,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -123,9 +122,7 @@ void __init time_init(void)
/* figure rate for counter */
counter_hz = clk_get_rate(boot_cpu_data.clk);
- counter.mult = clocksource_hz2mult(counter_hz, counter.shift);
-
- ret = clocksource_register(&counter);
+ ret = clocksource_register_hz(&counter, counter_hz);
if (ret)
pr_debug("timer: could not register clocksource: %d\n", ret);
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index ac76da099a6d..89accc626b86 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -618,7 +618,7 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
}
/* forward declaration */
-static static const struct dentry_operations pfmfs_dentry_operations;
+static const struct dentry_operations pfmfs_dentry_operations;
static struct dentry *
pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 1841ee7e65f9..5ca674b74737 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
if (pud) {
pmd = pmd_alloc(mm, pud, taddr);
if (pmd)
- pte = pte_alloc_map(mm, pmd, taddr);
+ pte = pte_alloc_map(mm, NULL, pmd, taddr);
}
return pte;
}
diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h
index c892bfb3e2c1..785b4ea4ec3f 100644
--- a/arch/mips/include/asm/mman.h
+++ b/arch/mips/include/asm/mman.h
@@ -77,6 +77,9 @@
#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
#define MADV_HWPOISON 100 /* poison a page for testing */
+#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 6f51dda87fce..d87a72e9fac7 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock);
void *module_alloc(unsigned long size)
{
#ifdef MODULE_START
- struct vm_struct *area;
-
- size = PAGE_ALIGN(size);
- if (!size)
- return NULL;
-
- area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END);
- if (!area)
- return NULL;
-
- return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL);
+ return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
+ GFP_KERNEL, PAGE_KERNEL, -1,
+ __builtin_return_address(0));
#else
if (size == 0)
return NULL;
diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h
index 9749c8afe83a..f5b7bf5fba68 100644
--- a/arch/parisc/include/asm/mman.h
+++ b/arch/parisc/include/asm/mman.h
@@ -59,6 +59,9 @@
#define MADV_MERGEABLE 65 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */
+#define MADV_HUGEPAGE 67 /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE 68 /* Not worth backing with hugepages */
+
/* compatibility flags */
#define MAP_FILE 0
#define MAP_VARIABLE 0
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d7efdbf640c7..fec13200868f 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -16,6 +16,16 @@
#ifdef __HAVE_ARCH_PTE_SPECIAL
+static inline void get_huge_page_tail(struct page *page)
+{
+ /*
+ * __split_huge_page_refcount() cannot run
+ * from under us.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) < 0);
+ atomic_inc(&page->_count);
+}
+
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
@@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
put_page(page);
return 0;
}
+ if (PageTail(page))
+ get_huge_page_tail(page);
pages[*nr] = page;
(*nr)++;
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 9163db3e8d15..d7762349ea48 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
if (pud) {
pmd = pmd_alloc(mm, pud, addr);
if (pmd)
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map(mm, NULL, pmd, addr);
}
}
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index ee3c7dde8d9f..8d348c474a2f 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -23,17 +23,11 @@
static void *module_map(unsigned long size)
{
- struct vm_struct *area;
-
- size = PAGE_ALIGN(size);
- if (!size || size > MODULES_LEN)
- return NULL;
-
- area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
- if (!area)
+ if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
-
- return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL);
+ return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+ GFP_KERNEL, PAGE_KERNEL, -1,
+ __builtin_return_address(0));
}
static char *dot2underscore(char *name)
diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c
index 5edcac184eaf..e6067b75f11c 100644
--- a/arch/sparc/mm/generic_32.c
+++ b/arch/sparc/mm/generic_32.c
@@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned
end = PGDIR_SIZE;
offset -= address;
do {
- pte_t * pte = pte_alloc_map(mm, pmd, address);
+ pte_t *pte = pte_alloc_map(mm, NULL, pmd, address);
if (!pte)
return -ENOMEM;
io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space);
diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c
index 04f2bf4cd571..3cb00dfd4bd6 100644
--- a/arch/sparc/mm/generic_64.c
+++ b/arch/sparc/mm/generic_64.c
@@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned
end = PGDIR_SIZE;
offset -= address;
do {
- pte_t * pte = pte_alloc_map(mm, pmd, address);
+ pte_t *pte = pte_alloc_map(mm, NULL, pmd, address);
if (!pte)
return -ENOMEM;
io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space);
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 5fdddf134caa..f4e97646ce23 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
if (pud) {
pmd = pmd_alloc(mm, pud, addr);
if (pmd)
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map(mm, NULL, pmd, addr);
}
return pte;
}
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 3d099f974785..1aee587e9c5d 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
if (!pmd)
goto out_pmd;
- pte = pte_alloc_map(mm, pmd, proc);
+ pte = pte_alloc_map(mm, NULL, pmd, proc);
if (!pte)
goto out_pte;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21a9fba..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -822,6 +822,7 @@ extern bool kvm_rebooting;
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7709c12431b8..2071a8b2b32f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
{
PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}
+static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
+}
static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}
+static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
+}
+
static inline pte_t __pte(pteval_t val)
{
pteval_t ret;
@@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+#if PAGETABLE_LEVELS >= 3
+ if (sizeof(pmdval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
+ else
+ PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd);
+#endif
+}
+#endif
+
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac975250..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+ void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmdval);
void (*pte_update)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
void (*pte_update_defer)(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
+ void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp);
+ void (*pmd_update_defer)(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp);
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
+#ifdef CONFIG_SMP
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
+{
+ return __pmd(xchg((pmdval_t *)xp, 0));
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
/*
* Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
* split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..94b979d1b58d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
+#ifdef CONFIG_SMP
+union split_pmd {
+ struct {
+ u32 pmd_low;
+ u32 pmd_high;
+ };
+ pmd_t pmd;
+};
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ union split_pmd res, *orig = (union split_pmd *)pmdp;
+
+ /* xchg acts as a barrier before setting of the high bits */
+ res.pmd_low = xchg(&orig->pmd_low, 0);
+ res.pmd_high = orig->pmd_high;
+ orig->pmd_high = 0;
+
+ return res.pmd;
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
/*
* Bits 0, 6 and 7 are taken in the low part of the pte,
* put the 32 bits of offset into the high part.
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7c..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#else /* !CONFIG_PARAVIRT */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
+#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define pte_update(mm, addr, ptep) do { } while (0)
#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#define pmd_update(mm, addr, ptep) do { } while (0)
+#define pmd_update_defer(mm, addr, ptep) do { } while (0)
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+static inline int pmd_young(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
static inline int pte_write(pte_t pte)
{
return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
(_PAGE_PSE | _PAGE_PRESENT);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_SPLITTING;
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_PSE;
+}
+
+static inline int has_transparent_hugepage(void)
+{
+ return cpu_has_pse;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL);
}
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v & ~clear);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
return __pte(val);
}
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmdval_t val = pmd_val(pmd);
+
+ val &= _HPAGE_CHG_MASK;
+ val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+
+ return __pmd(val);
+}
+
/* mprotect needs to preserve PAT bits when updating vm_page_prot */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
+#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
/*
* the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
return res;
}
+static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ pmd_t res = *pmdp;
+
+ native_pmd_clear(pmdp);
+ return res;
+}
+
static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , pte_t pte)
{
native_set_pte(ptep, pte);
}
+static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp , pmd_t pmd)
+{
+ native_set_pmd(pmdp, pmd);
+}
+
#ifndef CONFIG_PARAVIRT
/*
* Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
#define flush_tlb_fix_spurious_fault(vma, address)
+#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+ pmd_update(mm, addr, pmdp);
+ return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+ pmd_update(mm, addr, pmdp);
+}
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f2..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ *pmdp = pmd;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ native_set_pmd(pmd, native_make_pmd(0));
+}
+
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#endif
}
-static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
- *pmdp = pmd;
-}
-
-static inline void native_pmd_clear(pmd_t *pmd)
-{
- native_set_pmd(pmd, native_make_pmd(0));
+#ifdef CONFIG_SMP
+ return native_make_pmd(xchg(&xp->pmd, 0));
+#else
+ /* native_local_pmdp_get_and_clear,
+ but duplicated because of cyclic dependency */
+ pmd_t ret = *xp;
+ native_pmd_clear(xp);
+ return ret;
+#endif
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
#define __HAVE_ARCH_PTE_SAME
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..7db7723d1f32 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8760cc60a21c..f25bdf238a33 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern int m2p_add_override(unsigned long mfn, struct page *page);
+extern int m2p_remove_override(struct page *page);
+extern struct page *m2p_find_override(unsigned long mfn);
+extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
- if (unlikely((mfn >> machine_to_phys_order) != 0))
- return ~0;
-
pfn = 0;
/*
* The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
*/
__get_user(pfn, &machine_to_phys_mapping[mfn]);
+ /*
+ * If this appears to be a foreign mfn (because the pfn
+ * doesn't map back to the mfn), then check the local override
+ * table to see if there's a better pfn to use.
+ */
+ if (get_phys_to_machine(pfn) != mfn)
+ pfn = m2p_find_override_pfn(mfn, pfn);
+
return pfn;
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f2956091735..ab23f1ad4bf1 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
void *module_alloc(unsigned long size)
{
- struct vm_struct *area;
-
- if (!size)
- return NULL;
- size = PAGE_ALIGN(size);
- if (size > MODULES_LEN)
+ if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
-
- area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
- if (!area)
- return NULL;
-
- return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
- PAGE_KERNEL_EXEC);
+ return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+ GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+ -1, __builtin_return_address(0));
}
/* Free memory returned from module_alloc */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd4..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
+ .set_pmd_at = native_set_pmd_at,
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
+ .pmd_update = paravirt_nop,
+ .pmd_update_defer = paravirt_nop,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..998e972f3b1a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pmd = pmd_alloc(&tboot_mm, pud, vaddr);
if (!pmd)
return -1;
- pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+ pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
if (!pte)
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb98519622..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
+ split_huge_page_pmd(mm, pmd);
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb499813..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
- int host_level, level, max_level;
-
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
- return PT_PAGE_TABLE_LEVEL;
+ return true;
+ return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ int host_level, level, max_level;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@@ -941,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
return young;
}
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ u64 *spte;
+ int young = 0;
+
+ /*
+ * If there's no access bit in the secondary pte set by the
+ * hardware it's up to gup-fast/gup to set the access bit in
+ * the primary pte or in the page structure.
+ */
+ if (!shadow_accessed_mask)
+ goto out;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ young = _spte & PT_ACCESSED_MASK;
+ if (young) {
+ young = 1;
+ break;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+out:
+ return young;
+}
+
#define RMAP_RECYCLE_THRESHOLD 1000
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -961,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
@@ -2281,6 +2319,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
return 1;
}
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+ gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+ pfn_t pfn = *pfnp;
+ gfn_t gfn = *gfnp;
+ int level = *levelp;
+
+ /*
+ * Check if it's a transparent hugepage. If this would be an
+ * hugetlbfs page, level wouldn't be set to
+ * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+ * here.
+ */
+ if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+ level == PT_PAGE_TABLE_LEVEL &&
+ PageTransCompound(pfn_to_page(pfn)) &&
+ !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+ unsigned long mask;
+ /*
+ * mmu_notifier_retry was successful and we hold the
+ * mmu_lock here, so the pmd can't become splitting
+ * from under us, and in turn
+ * __split_huge_page_refcount() can't run from under
+ * us and we can safely transfer the refcount from
+ * PG_tail to PG_head as we switch the pfn to tail to
+ * head.
+ */
+ *levelp = level = PT_DIRECTORY_LEVEL;
+ mask = KVM_PAGES_PER_HPAGE(level) - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ if (pfn & mask) {
+ gfn &= ~mask;
+ *gfnp = gfn;
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~mask;
+ if (!get_page_unless_zero(pfn_to_page(pfn)))
+ BUG();
+ *pfnp = pfn;
+ }
+ }
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
@@ -2289,20 +2369,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
{
int r;
int level;
+ int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
bool map_writable;
- level = mapping_level(vcpu, gfn);
-
- /*
- * This path builds a PAE pagetable - so we can map 2mb pages at
- * maximum. Therefore check if the level is larger than that.
- */
- if (level > PT_DIRECTORY_LEVEL)
- level = PT_DIRECTORY_LEVEL;
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (likely(!force_pt_level)) {
+ level = mapping_level(vcpu, gfn);
+ /*
+ * This path builds a PAE pagetable - so we can map
+ * 2mb pages at maximum. Therefore check if the level
+ * is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ } else
+ level = PT_PAGE_TABLE_LEVEL;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -2318,6 +2403,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2742,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
pfn_t pfn;
int r;
int level;
+ int force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2755,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
- level = mapping_level(vcpu, gfn);
-
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (likely(!force_pt_level)) {
+ level = mapping_level(vcpu, gfn);
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ } else
+ level = PT_PAGE_TABLE_LEVEL;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -2684,6 +2775,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, gpa, write, map_writable,
level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 53210f1e94c2..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
+ int force_pt_level;
unsigned long mmu_seq;
bool map_writable;
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
}
- if (walker.level >= PT_DIRECTORY_LEVEL) {
+ if (walker.level >= PT_DIRECTORY_LEVEL)
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+ else
+ force_pt_level = 1;
+ if (!force_pt_level) {
level = min(walker.level, mapping_level(vcpu, walker.gfn));
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
+ if (!force_pt_level)
+ transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
level, &write_pt, pfn, map_writable, prefault);
(void)sptep;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#include <asm/pgtable.h>
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
VM_BUG_ON(page != compound_head(page));
VM_BUG_ON(page_count(page) == 0);
atomic_add(nr, &page->_count);
+ SetPageReferenced(page);
+}
+
+static inline void get_huge_page_tail(struct page *page)
+{
+ /*
+ * __split_huge_page_refcount() cannot run
+ * from under us.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) < 0);
+ atomic_inc(&page->_count);
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ /*
+ * The pmd_trans_splitting() check below explains why
+ * pmdp_splitting_flush has to flush the tlb, to stop
+ * this gup-fast code from running while we set the
+ * splitting bit in the pmd. Returning zero will take
+ * the slow path that will call wait_split_huge_page()
+ * if the pmd is still in splitting state. gup-fast
+ * can't because it has irq disabled and
+ * wait_split_huge_page() would never return as the
+ * tlb flush IPI wouldn't run.
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ int changed = !pmd_same(*pmdp, entry);
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ if (changed && dirty) {
+ *pmdp = entry;
+ pmd_update_defer(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+
+ return changed;
+}
+#endif
+
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ int ret = 0;
+
+ if (pmd_young(*pmdp))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *)pmdp);
+
+ if (ret)
+ pmd_update(vma->vm_mm, addr, pmdp);
+
+ return ret;
+}
+#endif
+
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
return young;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int set;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+ (unsigned long *)pmdp);
+ if (set) {
+ pmd_update(vma->vm_mm, address, pmdp);
+ /* need tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+}
+#endif
+
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
- grant-table.o suspend.o platform-pci-unplug.o
+ grant-table.o suspend.o platform-pci-unplug.o \
+ p2m.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 44924e551fde..7575e55cd52e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
-/*
- * Xen leaves the responsibility for maintaining p2m mappings to the
- * guests themselves, but it must also access and update the p2m array
- * during suspend/resume when all the pages are reallocated.
- *
- * The p2m table is logically a flat array, but we implement it as a
- * three-level tree to allow the address space to be sparse.
- *
- * Xen
- * |
- * p2m_top p2m_top_mfn
- * / \ / \
- * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
- * / \ / \ / /
- * p2m p2m p2m p2m p2m p2m p2m ...
- *
- * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
- *
- * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
- * maximum representable pseudo-physical address space is:
- * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
- *
- * P2M_PER_PAGE depends on the architecture, as a mfn is always
- * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
- * 512 and 1024 entries respectively.
- */
-
-unsigned long xen_max_p2m_pfn __read_mostly;
-
-#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-/* Placeholders for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
-
-RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-
-static inline unsigned p2m_top_index(unsigned long pfn)
-{
- BUG_ON(pfn >= MAX_P2M_PFN);
- return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
-}
-
-static inline unsigned p2m_mid_index(unsigned long pfn)
-{
- return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
-}
-
-static inline unsigned p2m_index(unsigned long pfn)
-{
- return pfn % P2M_PER_PAGE;
-}
-
-static void p2m_top_init(unsigned long ***top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing;
-}
-
-static void p2m_top_mfn_init(unsigned long *top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = virt_to_mfn(p2m_mid_missing_mfn);
-}
-
-static void p2m_top_mfn_p_init(unsigned long **top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing_mfn;
-}
-
-static void p2m_mid_init(unsigned long **mid)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = p2m_missing;
-}
-
-static void p2m_mid_mfn_init(unsigned long *mid)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(p2m_missing);
-}
-
-static void p2m_init(unsigned long *p2m)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- p2m[i] = INVALID_P2M_ENTRY;
-}
-
-/*
- * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
- *
- * This is called both at boot time, and after resuming from suspend:
- * - At boot time we're called very early, and must use extend_brk()
- * to allocate memory.
- *
- * - After resume we're called from within stop_machine, but the mfn
- * tree should alreay be completely allocated.
- */
-void xen_build_mfn_list_list(void)
-{
- unsigned long pfn;
-
- /* Pre-initialize p2m_top_mfn to be completely missing */
- if (p2m_top_mfn == NULL) {
- p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
-
- p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_mfn_p_init(p2m_top_mfn_p);
-
- p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_mfn_init(p2m_top_mfn);
- } else {
- /* Reinitialise, mfn's all change after migration */
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
- }
-
- for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
- unsigned long **mid;
- unsigned long *mid_mfn_p;
-
- mid = p2m_top[topidx];
- mid_mfn_p = p2m_top_mfn_p[topidx];
-
- /* Don't bother allocating any mfn mid levels if
- * they're just missing, just update the stored mfn,
- * since all could have changed over a migrate.
- */
- if (mid == p2m_mid_missing) {
- BUG_ON(mididx);
- BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
- p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
- pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
- continue;
- }
-
- if (mid_mfn_p == p2m_mid_missing_mfn) {
- /*
- * XXX boot-time only! We should never find
- * missing parts of the mfn tree after
- * runtime. extend_brk() will BUG if we call
- * it too late.
- */
- mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
-
- p2m_top_mfn_p[topidx] = mid_mfn_p;
- }
-
- p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
- mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
- }
-}
-
-void xen_setup_mfn_list_list(void)
-{
- BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
-
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn);
- HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
-}
-
-/* Set up p2m_top to point to the domain-builder provided p2m pages */
-void __init xen_build_dynamic_phys_to_machine(void)
-{
- unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
- unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- unsigned long pfn;
-
- xen_max_p2m_pfn = max_pfn;
-
- p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_missing);
-
- p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing);
-
- p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_init(p2m_top);
-
- /*
- * The domain builder gives us a pre-constructed p2m array in
- * mfn_list for all the pages initially given to us, so we just
- * need to graft that into our tree structure.
- */
- for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
-
- if (p2m_top[topidx] == p2m_mid_missing) {
- unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
-
- p2m_top[topidx] = mid;
- }
-
- p2m_top[topidx][mididx] = &mfn_list[pfn];
- }
-}
-
-unsigned long get_phys_to_machine(unsigned long pfn)
-{
- unsigned topidx, mididx, idx;
-
- if (unlikely(pfn >= MAX_P2M_PFN))
- return INVALID_P2M_ENTRY;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- return p2m_top[topidx][mididx][idx];
-}
-EXPORT_SYMBOL_GPL(get_phys_to_machine);
-
-static void *alloc_p2m_page(void)
-{
- return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
-}
-
-static void free_p2m_page(void *p)
-{
- free_page((unsigned long)p);
-}
-
-/*
- * Fully allocate the p2m structure for a given pfn. We need to check
- * that both the top and mid levels are allocated, and make sure the
- * parallel mfn tree is kept in sync. We may race with other cpus, so
- * the new pages are installed with cmpxchg; if we lose the race then
- * simply free the page we allocated and use the one that's there.
- */
-static bool alloc_p2m(unsigned long pfn)
-{
- unsigned topidx, mididx;
- unsigned long ***top_p, **mid;
- unsigned long *top_mfn_p, *mid_mfn;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
-
- top_p = &p2m_top[topidx];
- mid = *top_p;
-
- if (mid == p2m_mid_missing) {
- /* Mid level is missing, allocate a new one */
- mid = alloc_p2m_page();
- if (!mid)
- return false;
-
- p2m_mid_init(mid);
-
- if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
- free_p2m_page(mid);
- }
-
- top_mfn_p = &p2m_top_mfn[topidx];
- mid_mfn = p2m_top_mfn_p[topidx];
-
- BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
-
- if (mid_mfn == p2m_mid_missing_mfn) {
- /* Separately check the mid mfn level */
- unsigned long missing_mfn;
- unsigned long mid_mfn_mfn;
-
- mid_mfn = alloc_p2m_page();
- if (!mid_mfn)
- return false;
-
- p2m_mid_mfn_init(mid_mfn);
-
- missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
- mid_mfn_mfn = virt_to_mfn(mid_mfn);
- if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
- free_p2m_page(mid_mfn);
- else
- p2m_top_mfn_p[topidx] = mid_mfn;
- }
-
- if (p2m_top[topidx][mididx] == p2m_missing) {
- /* p2m leaf page is missing */
- unsigned long *p2m;
-
- p2m = alloc_p2m_page();
- if (!p2m)
- return false;
-
- p2m_init(p2m);
-
- if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
- free_p2m_page(p2m);
- else
- mid_mfn[mididx] = virt_to_mfn(p2m);
- }
-
- return true;
-}
-
-/* Try to install p2m mapping; fail if intermediate bits missing */
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- unsigned topidx, mididx, idx;
-
- if (unlikely(pfn >= MAX_P2M_PFN)) {
- BUG_ON(mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- if (p2m_top[topidx][mididx] == p2m_missing)
- return mfn == INVALID_P2M_ENTRY;
-
- p2m_top[topidx][mididx][idx] = mfn;
-
- return true;
-}
-
-bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
- if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!alloc_p2m(pfn))
- return false;
-
- if (!__set_phys_to_machine(pfn, mfn))
- return false;
- }
-
- return true;
-}
-
unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..8f2251d2a3f8
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,510 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top p2m_top_mfn
+ * / \ / \
+ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
+ * / \ / \ / /
+ * p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+
+#include <asm/cache.h>
+#include <asm/setup.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+static void __init m2p_override_init(void);
+
+unsigned long xen_max_p2m_pfn __read_mostly;
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+ return pfn % P2M_PER_PAGE;
+}
+
+static void p2m_top_init(unsigned long ***top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ * tree should alreay be completely allocated.
+ */
+void xen_build_mfn_list_list(void)
+{
+ unsigned long pfn;
+
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise, mfn's all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ }
+
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned long **mid;
+ unsigned long *mid_mfn_p;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+
+ /* Don't bother allocating any mfn mid levels if
+ * they're just missing, just update the stored mfn,
+ * since all could have changed over a migrate.
+ */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ /*
+ * XXX boot-time only! We should never find
+ * missing parts of the mfn tree after
+ * runtime. extend_brk() will BUG if we call
+ * it too late.
+ */
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ }
+}
+
+void xen_setup_mfn_list_list(void)
+{
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+ unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
+ unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+ unsigned long pfn;
+
+ xen_max_p2m_pfn = max_pfn;
+
+ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_missing);
+
+ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_missing);
+
+ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_init(p2m_top);
+
+ /*
+ * The domain builder gives us a pre-constructed p2m array in
+ * mfn_list for all the pages initially given to us, so we just
+ * need to graft that into our tree structure.
+ */
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(mid);
+
+ p2m_top[topidx] = mid;
+ }
+
+ p2m_top[topidx][mididx] = &mfn_list[pfn];
+ }
+
+ m2p_override_init();
+}
+
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN))
+ return INVALID_P2M_ENTRY;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ return p2m_top[topidx][mididx][idx];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
+
+static void *alloc_p2m_page(void)
+{
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+static void free_p2m_page(void *p)
+{
+ free_page((unsigned long)p);
+}
+
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx;
+ unsigned long ***top_p, **mid;
+ unsigned long *top_mfn_p, *mid_mfn;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ top_p = &p2m_top[topidx];
+ mid = *top_p;
+
+ if (mid == p2m_mid_missing) {
+ /* Mid level is missing, allocate a new one */
+ mid = alloc_p2m_page();
+ if (!mid)
+ return false;
+
+ p2m_mid_init(mid);
+
+ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+ free_p2m_page(mid);
+ }
+
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = p2m_top_mfn_p[topidx];
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
+
+ p2m_mid_mfn_init(mid_mfn);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return false;
+
+ p2m_init(p2m);
+
+ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+ free_p2m_page(p2m);
+ else
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ }
+
+ return true;
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN)) {
+ BUG_ON(mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ if (p2m_top[topidx][mididx] == p2m_missing)
+ return mfn == INVALID_P2M_ENTRY;
+
+ p2m_top[topidx][mididx][idx] = mfn;
+
+ return true;
+}
+
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (!alloc_p2m(pfn))
+ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ return false;
+ }
+
+ return true;
+}
+
+#define M2P_OVERRIDE_HASH_SHIFT 10
+#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
+
+static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static DEFINE_SPINLOCK(m2p_override_lock);
+
+static void __init m2p_override_init(void)
+{
+ unsigned i;
+
+ m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+ sizeof(unsigned long));
+
+ for (i = 0; i < M2P_OVERRIDE_HASH; i++)
+ INIT_LIST_HEAD(&m2p_overrides[i]);
+}
+
+static unsigned long mfn_hash(unsigned long mfn)
+{
+ return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
+}
+
+/* Add an MFN override for a particular page */
+int m2p_add_override(unsigned long mfn, struct page *page)
+{
+ unsigned long flags;
+ unsigned long pfn;
+ unsigned long address;
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_add_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ page->private = mfn;
+ page->index = pfn_to_mfn(pfn);
+
+ __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+ if (!PageHighMem(page))
+ /* Just zap old mapping for now */
+ pte_clear(&init_mm, address, ptep);
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return 0;
+}
+
+int m2p_remove_override(struct page *page)
+{
+ unsigned long flags;
+ unsigned long mfn;
+ unsigned long pfn;
+ unsigned long address;
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+ mfn = get_phys_to_machine(pfn);
+ if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
+ return -EINVAL;
+
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_remove_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_del(&page->lru);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+ __set_phys_to_machine(pfn, page->index);
+
+ if (!PageHighMem(page))
+ set_pte_at(&init_mm, address, ptep,
+ pfn_pte(pfn, PAGE_KERNEL));
+ /* No tlb flush necessary because the caller already
+ * left the pte unmapped. */
+
+ return 0;
+}
+
+struct page *m2p_find_override(unsigned long mfn)
+{
+ unsigned long flags;
+ struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
+ struct page *p, *ret;
+
+ ret = NULL;
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+
+ list_for_each_entry(p, bucket, lru) {
+ if (p->private == mfn) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return ret;
+}
+
+unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
+{
+ struct page *p = m2p_find_override(mfn);
+ unsigned long ret = pfn;
+
+ if (p)
+ ret = page_to_pfn(p);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h
index fca4db425f6e..30789010733d 100644
--- a/arch/xtensa/include/asm/mman.h
+++ b/arch/xtensa/include/asm/mman.h
@@ -83,6 +83,9 @@
#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
+#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/drivers/base/node.c b/drivers/base/node.c
index ce012a9c6201..36b43052001d 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -117,12 +117,21 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
"Node %d WritebackTmp: %8lu kB\n"
"Node %d Slab: %8lu kB\n"
"Node %d SReclaimable: %8lu kB\n"
- "Node %d SUnreclaim: %8lu kB\n",
+ "Node %d SUnreclaim: %8lu kB\n"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "Node %d AnonHugePages: %8lu kB\n"
+#endif
+ ,
nid, K(node_page_state(nid, NR_FILE_DIRTY)),
nid, K(node_page_state(nid, NR_WRITEBACK)),
nid, K(node_page_state(nid, NR_FILE_PAGES)),
nid, K(node_page_state(nid, NR_FILE_MAPPED)),
- nid, K(node_page_state(nid, NR_ANON_PAGES)),
+ nid, K(node_page_state(nid, NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
+ HPAGE_PMD_NR
+#endif
+ ),
nid, K(node_page_state(nid, NR_SHMEM)),
nid, node_page_state(nid, NR_KERNEL_STACK) *
THREAD_SIZE / 1024,
@@ -133,7 +142,13 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
- nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
+ nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ , nid,
+ K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
+ HPAGE_PMD_NR)
+#endif
+ );
n += hugetlb_report_node_meminfo(nid, buf + n);
return n;
}
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..98d9ec85e0eb 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
Allow volume managers to mirror logical volumes, also
needed for live data migration tools such as 'pvmove'.
+config DM_RAID
+ tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ select MD_RAID456
+ select BLK_DEV_MD
+ ---help---
+ A dm target that supports RAID4, RAID5 and RAID6 mappings
+
+ A RAID-5 set of N drives with a capacity of C MB per drive provides
+ the capacity of C * (N - 1) MB, and protects against a failure
+ of a single drive. For a given sector (row) number, (N - 1) drives
+ contain data sectors, and one drive contains the parity protection.
+ For a RAID-4 set, the parity blocks are present on a single drive,
+ while a RAID-5 set distributes the parity across the drives in one
+ of the available parity distribution methods.
+
+ A RAID-6 set of N drives with a capacity of C MB per drive
+ provides the capacity of C * (N - 2) MB, and protects
+ against a failure of any two drives. For a given sector
+ (row) number, (N - 2) drives contain data sectors, and two
+ drives contains two independent redundancy syndromes. Like
+ RAID-5, RAID-6 distributes the syndromes across the drives
+ in one of the available parity distribution methods.
+
config DM_LOG_USERSPACE
tristate "Mirror userspace logging (EXPERIMENTAL)"
depends on DM_MIRROR && EXPERIMENTAL && NET
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..d0138606c2e8 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
+obj-$(CONFIG_DM_RAID) += dm-raid.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a1ffe3527aa..9a35320fb59f 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
|| test_bit(Faulty, &rdev->flags))
continue;
- target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
+ target = offset + index * (PAGE_SIZE/512);
if (sync_page_io(rdev, target,
roundup(size, bdev_logical_block_size(rdev->bdev)),
- page, READ)) {
+ page, READ, true)) {
page->index = index;
attach_page_buffers(page, NULL); /* so that free_buffer will
* quietly no-op */
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
mdk_rdev_t *rdev = NULL;
+ struct block_device *bdev;
mddev_t *mddev = bitmap->mddev;
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
int size = PAGE_SIZE;
loff_t offset = mddev->bitmap_info.offset;
+
+ bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+
if (page->index == bitmap->file_pages-1)
size = roundup(bitmap->last_page_size,
- bdev_logical_block_size(rdev->bdev));
+ bdev_logical_block_size(bdev));
/* Just make sure we aren't corrupting data or
* metadata
*/
@@ -1542,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
wait_event(bitmap->mddev->recovery_wait,
atomic_read(&bitmap->mddev->recovery_active) == 0);
- bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+ bitmap->mddev->curr_resync_completed = sector;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
s = 0;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5b0e4c0e702..4e054bd91664 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@
#include <linux/crypto.h>
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
+#include <linux/percpu.h>
#include <asm/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/md5.h>
+#include <crypto/algapi.h>
#include <linux/device-mapper.h>
@@ -63,6 +67,7 @@ struct dm_crypt_request {
struct convert_context *ctx;
struct scatterlist sg_in;
struct scatterlist sg_out;
+ sector_t iv_sector;
};
struct crypt_config;
@@ -73,11 +78,13 @@ struct crypt_iv_operations {
void (*dtr)(struct crypt_config *cc);
int (*init)(struct crypt_config *cc);
int (*wipe)(struct crypt_config *cc);
- int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
+ int (*generator)(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq);
+ int (*post)(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq);
};
struct iv_essiv_private {
- struct crypto_cipher *tfm;
struct crypto_hash *hash_tfm;
u8 *salt;
};
@@ -86,11 +93,32 @@ struct iv_benbi_private {
int shift;
};
+#define LMK_SEED_SIZE 64 /* hash + 0 */
+struct iv_lmk_private {
+ struct crypto_shash *hash_tfm;
+ u8 *seed;
+};
+
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
*/
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+
+/*
+ * Duplicated per-CPU state for cipher.
+ */
+struct crypt_cpu {
+ struct ablkcipher_request *req;
+ /* ESSIV: struct crypto_cipher *essiv_tfm */
+ void *iv_private;
+ struct crypto_ablkcipher *tfms[0];
+};
+
+/*
+ * The fields in here must be read only after initialization,
+ * changing state should be in crypt_cpu.
+ */
struct crypt_config {
struct dm_dev *dev;
sector_t start;
@@ -108,17 +136,25 @@ struct crypt_config {
struct workqueue_struct *crypt_queue;
char *cipher;
- char *cipher_mode;
+ char *cipher_string;
struct crypt_iv_operations *iv_gen_ops;
union {
struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
+ struct iv_lmk_private lmk;
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
/*
+ * Duplicated per cpu state. Access through
+ * per_cpu_ptr() only.
+ */
+ struct crypt_cpu __percpu *cpu;
+ unsigned tfms_count;
+
+ /*
* Layout of each crypto request:
*
* struct ablkcipher_request
@@ -132,11 +168,10 @@ struct crypt_config {
* correctly aligned.
*/
unsigned int dmreq_start;
- struct ablkcipher_request *req;
- struct crypto_ablkcipher *tfm;
unsigned long flags;
unsigned int key_size;
+ unsigned int key_parts;
u8 key[0];
};
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool;
static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
+static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
+
+static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
+{
+ return this_cpu_ptr(cc->cpu);
+}
+
+/*
+ * Use this to access cipher attributes that are the same for each CPU.
+ */
+static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
+{
+ return __this_cpu_ptr(cc->cpu)->tfms[0];
+}
/*
* Different IV generation algorithms:
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
* null: the initial vector is always zero. Provides compatibility with
* obsolete loop_fish2 devices. Do not use for new devices.
*
+ * lmk: Compatible implementation of the block chaining mode used
+ * by the Loop-AES block device encryption system
+ * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
+ * It operates on full 512 byte sectors and uses CBC
+ * with an IV derived from the sector number, the data and
+ * optionally extra IV seed.
+ * This means that after decryption the first block
+ * of sector must be tweaked according to decrypted data.
+ * Loop-AES can use three encryption schemes:
+ * version 1: is plain aes-cbc mode
+ * version 2: uses 64 multikey scheme with lmk IV generator
+ * version 3: the same as version 2 with additional IV seed
+ * (it uses 65 keys, last key is used as IV seed)
+ *
* plumb: unimplemented, see:
* http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
*/
-static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
memset(iv, 0, cc->iv_size);
- *(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
+ *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
return 0;
}
static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
- sector_t sector)
+ struct dm_crypt_request *dmreq)
{
memset(iv, 0, cc->iv_size);
- *(u64 *)iv = cpu_to_le64(sector);
+ *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
return 0;
}
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
struct hash_desc desc;
struct scatterlist sg;
- int err;
+ struct crypto_cipher *essiv_tfm;
+ int err, cpu;
sg_init_one(&sg, cc->key, cc->key_size);
desc.tfm = essiv->hash_tfm;
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
if (err)
return err;
- return crypto_cipher_setkey(essiv->tfm, essiv->salt,
+ for_each_possible_cpu(cpu) {
+ essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
+
+ err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
crypto_hash_digestsize(essiv->hash_tfm));
+ if (err)
+ return err;
+ }
+
+ return 0;
}
/* Wipe salt and reset key derived from volume key */
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+ struct crypto_cipher *essiv_tfm;
+ int cpu, r, err = 0;
memset(essiv->salt, 0, salt_size);
- return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
+ for_each_possible_cpu(cpu) {
+ essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
+ r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
+ if (r)
+ err = r;
+ }
+
+ return err;
+}
+
+/* Set up per cpu cipher state */
+static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
+ struct dm_target *ti,
+ u8 *salt, unsigned saltsize)
+{
+ struct crypto_cipher *essiv_tfm;
+ int err;
+
+ /* Setup the essiv_tfm with the given salt */
+ essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(essiv_tfm)) {
+ ti->error = "Error allocating crypto tfm for ESSIV";
+ return essiv_tfm;
+ }
+
+ if (crypto_cipher_blocksize(essiv_tfm) !=
+ crypto_ablkcipher_ivsize(any_tfm(cc))) {
+ ti->error = "Block size of ESSIV cipher does "
+ "not match IV size of block cipher";
+ crypto_free_cipher(essiv_tfm);
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
+ if (err) {
+ ti->error = "Failed to set key for ESSIV cipher";
+ crypto_free_cipher(essiv_tfm);
+ return ERR_PTR(err);
+ }
+
+ return essiv_tfm;
}
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
+ int cpu;
+ struct crypt_cpu *cpu_cc;
+ struct crypto_cipher *essiv_tfm;
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
- crypto_free_cipher(essiv->tfm);
- essiv->tfm = NULL;
-
crypto_free_hash(essiv->hash_tfm);
essiv->hash_tfm = NULL;
kzfree(essiv->salt);
essiv->salt = NULL;
+
+ for_each_possible_cpu(cpu) {
+ cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+ essiv_tfm = cpu_cc->iv_private;
+
+ if (essiv_tfm)
+ crypto_free_cipher(essiv_tfm);
+
+ cpu_cc->iv_private = NULL;
+ }
}
static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
struct crypto_cipher *essiv_tfm = NULL;
struct crypto_hash *hash_tfm = NULL;
u8 *salt = NULL;
- int err;
+ int err, cpu;
if (!opts) {
ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
goto bad;
}
- /* Allocate essiv_tfm */
- essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(essiv_tfm)) {
- ti->error = "Error allocating crypto tfm for ESSIV";
- err = PTR_ERR(essiv_tfm);
- goto bad;
- }
- if (crypto_cipher_blocksize(essiv_tfm) !=
- crypto_ablkcipher_ivsize(cc->tfm)) {
- ti->error = "Block size of ESSIV cipher does "
- "not match IV size of block cipher";
- err = -EINVAL;
- goto bad;
- }
-
cc->iv_gen_private.essiv.salt = salt;
- cc->iv_gen_private.essiv.tfm = essiv_tfm;
cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
+ for_each_possible_cpu(cpu) {
+ essiv_tfm = setup_essiv_cpu(cc, ti, salt,
+ crypto_hash_digestsize(hash_tfm));
+ if (IS_ERR(essiv_tfm)) {
+ crypt_iv_essiv_dtr(cc);
+ return PTR_ERR(essiv_tfm);
+ }
+ per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
+ }
+
return 0;
bad:
- if (essiv_tfm && !IS_ERR(essiv_tfm))
- crypto_free_cipher(essiv_tfm);
if (hash_tfm && !IS_ERR(hash_tfm))
crypto_free_hash(hash_tfm);
kfree(salt);
return err;
}
-static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
+ struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
+
memset(iv, 0, cc->iv_size);
- *(u64 *)iv = cpu_to_le64(sector);
- crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
+ *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+ crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
+
return 0;
}
static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
- unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
+ unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
int log = ilog2(bs);
/* we need to calculate how far we must shift the sector count
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
{
}
-static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
__be64 val;
memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
- val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
+ val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
return 0;
}
-static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
memset(iv, 0, cc->iv_size);
return 0;
}
+static void crypt_iv_lmk_dtr(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
+ crypto_free_shash(lmk->hash_tfm);
+ lmk->hash_tfm = NULL;
+
+ kzfree(lmk->seed);
+ lmk->seed = NULL;
+}
+
+static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(lmk->hash_tfm)) {
+ ti->error = "Error initializing LMK hash";
+ return PTR_ERR(lmk->hash_tfm);
+ }
+
+ /* No seed in LMK version 2 */
+ if (cc->key_parts == cc->tfms_count) {
+ lmk->seed = NULL;
+ return 0;
+ }
+
+ lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
+ if (!lmk->seed) {
+ crypt_iv_lmk_dtr(cc);
+ ti->error = "Error kmallocing seed storage in LMK";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int crypt_iv_lmk_init(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ int subkey_size = cc->key_size / cc->key_parts;
+
+ /* LMK seed is on the position of LMK_KEYS + 1 key */
+ if (lmk->seed)
+ memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
+ crypto_shash_digestsize(lmk->hash_tfm));
+
+ return 0;
+}
+
+static int crypt_iv_lmk_wipe(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ if (lmk->seed)
+ memset(lmk->seed, 0, LMK_SEED_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq,
+ u8 *data)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ struct {
+ struct shash_desc desc;
+ char ctx[crypto_shash_descsize(lmk->hash_tfm)];
+ } sdesc;
+ struct md5_state md5state;
+ u32 buf[4];
+ int i, r;
+
+ sdesc.desc.tfm = lmk->hash_tfm;
+ sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ r = crypto_shash_init(&sdesc.desc);
+ if (r)
+ return r;
+
+ if (lmk->seed) {
+ r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
+ if (r)
+ return r;
+ }
+
+ /* Sector is always 512B, block size 16, add data of blocks 1-31 */
+ r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
+ if (r)
+ return r;
+
+ /* Sector is cropped to 56 bits here */
+ buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
+ buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
+ buf[2] = cpu_to_le32(4024);
+ buf[3] = 0;
+ r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
+ if (r)
+ return r;
+
+ /* No MD5 padding here */
+ r = crypto_shash_export(&sdesc.desc, &md5state);
+ if (r)
+ return r;
+
+ for (i = 0; i < MD5_HASH_WORDS; i++)
+ __cpu_to_le32s(&md5state.hash[i]);
+ memcpy(iv, &md5state.hash, cc->iv_size);
+
+ return 0;
+}
+
+static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *src;
+ int r = 0;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
+ r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
+ kunmap_atomic(src, KM_USER0);
+ } else
+ memset(iv, 0, cc->iv_size);
+
+ return r;
+}
+
+static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *dst;
+ int r;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
+ return 0;
+
+ dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
+ r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
+
+ /* Tweak the first block of plaintext sector */
+ if (!r)
+ crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
+
+ kunmap_atomic(dst, KM_USER0);
+ return r;
+}
+
static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
.generator = crypt_iv_null_gen
};
+static struct crypt_iv_operations crypt_iv_lmk_ops = {
+ .ctr = crypt_iv_lmk_ctr,
+ .dtr = crypt_iv_lmk_dtr,
+ .init = crypt_iv_lmk_init,
+ .wipe = crypt_iv_lmk_wipe,
+ .generator = crypt_iv_lmk_gen,
+ .post = crypt_iv_lmk_post
+};
+
static void crypt_convert_init(struct crypt_config *cc,
struct convert_context *ctx,
struct bio *bio_out, struct bio *bio_in,
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
}
+static u8 *iv_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+ crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+}
+
static int crypt_convert_block(struct crypt_config *cc,
struct convert_context *ctx,
struct ablkcipher_request *req)
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc,
int r = 0;
dmreq = dmreq_of_req(cc, req);
- iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
- crypto_ablkcipher_alignmask(cc->tfm) + 1);
+ iv = iv_of_dmreq(cc, dmreq);
+ dmreq->iv_sector = ctx->sector;
dmreq->ctx = ctx;
sg_init_table(&dmreq->sg_in, 1);
sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc,
}
if (cc->iv_gen_ops) {
- r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
+ r = cc->iv_gen_ops->generator(cc, iv, dmreq);
if (r < 0)
return r;
}
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc,
else
r = crypto_ablkcipher_decrypt(req);
+ if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+ r = cc->iv_gen_ops->post(cc, iv, dmreq);
+
return r;
}
static void kcryptd_async_done(struct crypto_async_request *async_req,
int error);
+
static void crypt_alloc_req(struct crypt_config *cc,
struct convert_context *ctx)
{
- if (!cc->req)
- cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
- ablkcipher_request_set_tfm(cc->req, cc->tfm);
- ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
- CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done,
- dmreq_of_req(cc, cc->req));
+ struct crypt_cpu *this_cc = this_crypt_config(cc);
+ unsigned key_index = ctx->sector & (cc->tfms_count - 1);
+
+ if (!this_cc->req)
+ this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+
+ ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
+ ablkcipher_request_set_callback(this_cc->req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
}
/*
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
+ struct crypt_cpu *this_cc = this_crypt_config(cc);
int r;
atomic_set(&ctx->pending, 1);
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->pending);
- r = crypt_convert_block(cc, ctx, cc->req);
+ r = crypt_convert_block(cc, ctx, this_cc->req);
switch (r) {
/* async */
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc,
INIT_COMPLETION(ctx->restart);
/* fall through*/
case -EINPROGRESS:
- cc->req = NULL;
+ this_cc->req = NULL;
ctx->sector++;
continue;
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
* They must be separated as otherwise the final stages could be
* starved by new requests which can block in the first stages due
* to memory allocation.
+ *
+ * The work is done per CPU global for all dm-crypt instances.
+ * They should not depend on each other and do not block.
*/
static void crypt_endio(struct bio *clone, int error)
{
@@ -691,26 +991,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
clone->bi_destructor = dm_crypt_bio_destructor;
}
-static void kcryptd_io_read(struct dm_crypt_io *io)
+static void kcryptd_unplug(struct crypt_config *cc)
+{
+ blk_unplug(bdev_get_queue(cc->dev->bdev));
+}
+
+static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
{
struct crypt_config *cc = io->target->private;
struct bio *base_bio = io->base_bio;
struct bio *clone;
- crypt_inc_pending(io);
-
/*
* The block layer might modify the bvec array, so always
* copy the required bvecs because we need the original
* one in order to decrypt the whole bio data *afterwards*.
*/
- clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
- if (unlikely(!clone)) {
- io->error = -ENOMEM;
- crypt_dec_pending(io);
- return;
+ clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
+ if (!clone) {
+ kcryptd_unplug(cc);
+ return 1;
}
+ crypt_inc_pending(io);
+
clone_init(io, clone);
clone->bi_idx = 0;
clone->bi_vcnt = bio_segments(base_bio);
@@ -720,6 +1024,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
sizeof(struct bio_vec) * clone->bi_vcnt);
generic_make_request(clone);
+ return 0;
}
static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -732,9 +1037,12 @@ static void kcryptd_io(struct work_struct *work)
{
struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
- if (bio_data_dir(io->base_bio) == READ)
- kcryptd_io_read(io);
- else
+ if (bio_data_dir(io->base_bio) == READ) {
+ crypt_inc_pending(io);
+ if (kcryptd_io_read(io, GFP_NOIO))
+ io->error = -ENOMEM;
+ crypt_dec_pending(io);
+ } else
kcryptd_io_write(io);
}
@@ -901,6 +1209,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
return;
}
+ if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
+ error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+
mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
if (!atomic_dec_and_test(&ctx->pending))
@@ -971,34 +1282,84 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
}
}
-static int crypt_set_key(struct crypt_config *cc, char *key)
+static void crypt_free_tfms(struct crypt_config *cc, int cpu)
{
- unsigned key_size = strlen(key) >> 1;
+ struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+ unsigned i;
- if (cc->key_size && cc->key_size != key_size)
+ for (i = 0; i < cc->tfms_count; i++)
+ if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
+ crypto_free_ablkcipher(cpu_cc->tfms[i]);
+ cpu_cc->tfms[i] = NULL;
+ }
+}
+
+static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
+{
+ struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+ unsigned i;
+ int err;
+
+ for (i = 0; i < cc->tfms_count; i++) {
+ cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
+ if (IS_ERR(cpu_cc->tfms[i])) {
+ err = PTR_ERR(cpu_cc->tfms[i]);
+ crypt_free_tfms(cc, cpu);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int crypt_setkey_allcpus(struct crypt_config *cc)
+{
+ unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
+ int cpu, err = 0, i, r;
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < cc->tfms_count; i++) {
+ r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
+ cc->key + (i * subkey_size), subkey_size);
+ if (r)
+ err = r;
+ }
+ }
+
+ return err;
+}
+
+static int crypt_set_key(struct crypt_config *cc, char *key)
+{
+ /* The key size may not be changed. */
+ if (cc->key_size != (strlen(key) >> 1))
return -EINVAL;
- cc->key_size = key_size; /* initial settings */
+ /* Hyphen (which gives a key_size of zero) means there is no key. */
+ if (!cc->key_size && strcmp(key, "-"))
+ return -EINVAL;
- if ((!key_size && strcmp(key, "-")) ||
- (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
+ if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
return -EINVAL;
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+ return crypt_setkey_allcpus(cc);
}
static int crypt_wipe_key(struct crypt_config *cc)
{
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
memset(&cc->key, 0, cc->key_size * sizeof(u8));
- return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+
+ return crypt_setkey_allcpus(cc);
}
static void crypt_dtr(struct dm_target *ti)
{
struct crypt_config *cc = ti->private;
+ struct crypt_cpu *cpu_cc;
+ int cpu;
ti->private = NULL;
@@ -1010,6 +1371,14 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->crypt_queue)
destroy_workqueue(cc->crypt_queue);
+ if (cc->cpu)
+ for_each_possible_cpu(cpu) {
+ cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+ if (cpu_cc->req)
+ mempool_free(cpu_cc->req, cc->req_pool);
+ crypt_free_tfms(cc, cpu);
+ }
+
if (cc->bs)
bioset_free(cc->bs);
@@ -1023,14 +1392,14 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
- if (cc->tfm && !IS_ERR(cc->tfm))
- crypto_free_ablkcipher(cc->tfm);
-
if (cc->dev)
dm_put_device(ti, cc->dev);
+ if (cc->cpu)
+ free_percpu(cc->cpu);
+
kzfree(cc->cipher);
- kzfree(cc->cipher_mode);
+ kzfree(cc->cipher_string);
/* Must zero key material before freeing */
kzfree(cc);
@@ -1040,9 +1409,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
char *cipher_in, char *key)
{
struct crypt_config *cc = ti->private;
- char *tmp, *cipher, *chainmode, *ivmode, *ivopts;
+ char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
char *cipher_api = NULL;
- int ret = -EINVAL;
+ int cpu, ret = -EINVAL;
/* Convert to crypto api definition? */
if (strchr(cipher_in, '(')) {
@@ -1050,23 +1419,31 @@ static int crypt_ctr_cipher(struct dm_target *ti,
return -EINVAL;
}
+ cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
+ if (!cc->cipher_string)
+ goto bad_mem;
+
/*
* Legacy dm-crypt cipher specification
- * cipher-mode-iv:ivopts
+ * cipher[:keycount]-mode-iv:ivopts
*/
tmp = cipher_in;
- cipher = strsep(&tmp, "-");
+ keycount = strsep(&tmp, "-");
+ cipher = strsep(&keycount, ":");
+
+ if (!keycount)
+ cc->tfms_count = 1;
+ else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
+ !is_power_of_2(cc->tfms_count)) {
+ ti->error = "Bad cipher key count specification";
+ return -EINVAL;
+ }
+ cc->key_parts = cc->tfms_count;
cc->cipher = kstrdup(cipher, GFP_KERNEL);
if (!cc->cipher)
goto bad_mem;
- if (tmp) {
- cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
- if (!cc->cipher_mode)
- goto bad_mem;
- }
-
chainmode = strsep(&tmp, "-");
ivopts = strsep(&tmp, "-");
ivmode = strsep(&ivopts, ":");
@@ -1074,10 +1451,19 @@ static int crypt_ctr_cipher(struct dm_target *ti,
if (tmp)
DMWARN("Ignoring unexpected additional cipher options");
- /* Compatibility mode for old dm-crypt mappings */
+ cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
+ cc->tfms_count * sizeof(*(cc->cpu->tfms)),
+ __alignof__(struct crypt_cpu));
+ if (!cc->cpu) {
+ ti->error = "Cannot allocate per cpu state";
+ goto bad_mem;
+ }
+
+ /*
+ * For compatibility with the original dm-crypt mapping format, if
+ * only the cipher name is supplied, use cbc-plain.
+ */
if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
- kfree(cc->cipher_mode);
- cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
chainmode = "cbc";
ivmode = "plain";
}
@@ -1099,11 +1485,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
}
/* Allocate cipher */
- cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
- if (IS_ERR(cc->tfm)) {
- ret = PTR_ERR(cc->tfm);
- ti->error = "Error allocating crypto tfm";
- goto bad;
+ for_each_possible_cpu(cpu) {
+ ret = crypt_alloc_tfms(cc, cpu, cipher_api);
+ if (ret < 0) {
+ ti->error = "Error allocating crypto tfm";
+ goto bad;
+ }
}
/* Initialize and set key */
@@ -1114,7 +1501,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
}
/* Initialize IV */
- cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm);
+ cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
if (cc->iv_size)
/* at least a 64 bit sector number should fit in our buffer */
cc->iv_size = max(cc->iv_size,
@@ -1137,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti,
cc->iv_gen_ops = &crypt_iv_benbi_ops;
else if (strcmp(ivmode, "null") == 0)
cc->iv_gen_ops = &crypt_iv_null_ops;
- else {
+ else if (strcmp(ivmode, "lmk") == 0) {
+ cc->iv_gen_ops = &crypt_iv_lmk_ops;
+ /* Version 2 and 3 is recognised according
+ * to length of provided multi-key string.
+ * If present (version 3), last key is used as IV seed.
+ */
+ if (cc->key_size % cc->key_parts)
+ cc->key_parts++;
+ } else {
ret = -EINVAL;
ti->error = "Invalid IV mode";
goto bad;
@@ -1194,6 +1589,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Cannot allocate encryption context";
return -ENOMEM;
}
+ cc->key_size = key_size;
ti->private = cc;
ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1208,9 +1604,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->dmreq_start = sizeof(struct ablkcipher_request);
- cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm);
+ cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
- cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) &
+ cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
~(crypto_tfm_ctx_alignment() - 1);
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1615,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}
- cc->req = NULL;
cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
if (!cc->page_pool) {
@@ -1252,13 +1647,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->start = tmpll;
ret = -ENOMEM;
- cc->io_queue = create_singlethread_workqueue("kcryptd_io");
+ cc->io_queue = alloc_workqueue("kcryptd_io",
+ WQ_NON_REENTRANT|
+ WQ_MEM_RECLAIM,
+ 1);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
- cc->crypt_queue = create_singlethread_workqueue("kcryptd");
+ cc->crypt_queue = alloc_workqueue("kcryptd",
+ WQ_NON_REENTRANT|
+ WQ_CPU_INTENSIVE|
+ WQ_MEM_RECLAIM,
+ 1);
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
goto bad;
@@ -1286,9 +1688,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
- if (bio_data_dir(io->base_bio) == READ)
- kcryptd_queue_io(io);
- else
+ if (bio_data_dir(io->base_bio) == READ) {
+ if (kcryptd_io_read(io, GFP_NOWAIT))
+ kcryptd_queue_io(io);
+ } else
kcryptd_queue_crypt(io);
return DM_MAPIO_SUBMITTED;
@@ -1306,10 +1709,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- if (cc->cipher_mode)
- DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
- else
- DMEMIT("%s ", cc->cipher);
+ DMEMIT("%s ", cc->cipher_string);
if (cc->key_size > 0) {
if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1421,7 +1821,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 7, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index baa11912cc94..f18375dcedd9 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void)
{
int r = -ENOMEM;
- kdelayd_wq = create_workqueue("kdelayd");
+ kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!kdelayd_wq) {
DMERR("Couldn't start kdelayd");
goto bad_queue;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4b54618b4159..6d12775a1061 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -295,19 +295,55 @@ retry:
DMWARN("remove_all left %d open device(s)", dev_skipped);
}
+/*
+ * Set the uuid of a hash_cell that isn't already set.
+ */
+static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
+{
+ mutex_lock(&dm_hash_cells_mutex);
+ hc->uuid = new_uuid;
+ mutex_unlock(&dm_hash_cells_mutex);
+
+ list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
+}
+
+/*
+ * Changes the name of a hash_cell and returns the old name for
+ * the caller to free.
+ */
+static char *__change_cell_name(struct hash_cell *hc, char *new_name)
+{
+ char *old_name;
+
+ /*
+ * Rename and move the name cell.
+ */
+ list_del(&hc->name_list);
+ old_name = hc->name;
+
+ mutex_lock(&dm_hash_cells_mutex);
+ hc->name = new_name;
+ mutex_unlock(&dm_hash_cells_mutex);
+
+ list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+
+ return old_name;
+}
+
static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
const char *new)
{
- char *new_name, *old_name;
+ char *new_data, *old_name = NULL;
struct hash_cell *hc;
struct dm_table *table;
struct mapped_device *md;
+ unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
/*
* duplicate new.
*/
- new_name = kstrdup(new, GFP_KERNEL);
- if (!new_name)
+ new_data = kstrdup(new, GFP_KERNEL);
+ if (!new_data)
return ERR_PTR(-ENOMEM);
down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
/*
* Is new free ?
*/
- hc = __get_name_cell(new);
+ if (change_uuid)
+ hc = __get_uuid_cell(new);
+ else
+ hc = __get_name_cell(new);
+
if (hc) {
- DMWARN("asked to rename to an already-existing name %s -> %s",
+ DMWARN("Unable to change %s on mapped device %s to one that "
+ "already exists: %s",
+ change_uuid ? "uuid" : "name",
param->name, new);
dm_put(hc->md);
up_write(&_hash_lock);
- kfree(new_name);
+ kfree(new_data);
return ERR_PTR(-EBUSY);
}
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
*/
hc = __get_name_cell(param->name);
if (!hc) {
- DMWARN("asked to rename a non-existent device %s -> %s",
- param->name, new);
+ DMWARN("Unable to rename non-existent device, %s to %s%s",
+ param->name, change_uuid ? "uuid " : "", new);
up_write(&_hash_lock);
- kfree(new_name);
+ kfree(new_data);
return ERR_PTR(-ENXIO);
}
/*
- * rename and move the name cell.
+ * Does this device already have a uuid?
*/
- list_del(&hc->name_list);
- old_name = hc->name;
- mutex_lock(&dm_hash_cells_mutex);
- hc->name = new_name;
- mutex_unlock(&dm_hash_cells_mutex);
- list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+ if (change_uuid && hc->uuid) {
+ DMWARN("Unable to change uuid of mapped device %s to %s "
+ "because uuid is already set to %s",
+ param->name, new, hc->uuid);
+ dm_put(hc->md);
+ up_write(&_hash_lock);
+ kfree(new_data);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (change_uuid)
+ __set_cell_uuid(hc, new_data);
+ else
+ old_name = __change_cell_name(hc, new_data);
/*
* Wake up any dm event waiters.
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
hc = __find_device_hash_cell(param);
if (!hc) {
- DMWARN("device doesn't appear to be in the dev hash table.");
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
up_write(&_hash_lock);
return -ENXIO;
}
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
*/
r = dm_lock_for_deletion(md);
if (r) {
- DMWARN("unable to remove open device %s", hc->name);
+ DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
up_write(&_hash_lock);
dm_put(md);
return r;
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
static int dev_rename(struct dm_ioctl *param, size_t param_size)
{
int r;
- char *new_name = (char *) param + param->data_start;
+ char *new_data = (char *) param + param->data_start;
struct mapped_device *md;
+ unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
- if (new_name < param->data ||
- invalid_str(new_name, (void *) param + param_size) ||
- strlen(new_name) > DM_NAME_LEN - 1) {
- DMWARN("Invalid new logical volume name supplied.");
+ if (new_data < param->data ||
+ invalid_str(new_data, (void *) param + param_size) ||
+ strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
+ DMWARN("Invalid new mapped device name or uuid string supplied.");
return -EINVAL;
}
- r = check_name(new_name);
- if (r)
- return r;
+ if (!change_uuid) {
+ r = check_name(new_data);
+ if (r)
+ return r;
+ }
- md = dm_hash_rename(param, new_name);
+ md = dm_hash_rename(param, new_data);
if (IS_ERR(md))
return PTR_ERR(md);
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param)
hc = __find_device_hash_cell(param);
if (!hc) {
- DMWARN("device doesn't appear to be in the dev hash table.");
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
up_write(&_hash_lock);
return -ENXIO;
}
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
hc = __find_device_hash_cell(param);
if (!hc) {
- DMWARN("device doesn't appear to be in the dev hash table.");
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
up_write(&_hash_lock);
return -ENXIO;
}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d8587bac5682..924f5f0084c2 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,6 +37,13 @@ struct dm_kcopyd_client {
unsigned int nr_pages;
unsigned int nr_free_pages;
+ /*
+ * Block devices to unplug.
+ * Non-NULL pointer means that a block device has some pending requests
+ * and needs to be unplugged.
+ */
+ struct block_device *unplug[2];
+
struct dm_io_client *io_client;
wait_queue_head_t destroyq;
@@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job)
return 0;
}
+/*
+ * Unplug the block device at the specified index.
+ */
+static void unplug(struct dm_kcopyd_client *kc, int rw)
+{
+ if (kc->unplug[rw] != NULL) {
+ blk_unplug(bdev_get_queue(kc->unplug[rw]));
+ kc->unplug[rw] = NULL;
+ }
+}
+
+/*
+ * Prepare block device unplug. If there's another device
+ * to be unplugged at the same array index, we unplug that
+ * device first.
+ */
+static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
+ struct block_device *bdev)
+{
+ if (likely(kc->unplug[rw] == bdev))
+ return;
+ unplug(kc, rw);
+ kc->unplug[rw] = bdev;
+}
+
static void complete_io(unsigned long error, void *context)
{
struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
{
int r;
struct dm_io_request io_req = {
- .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG,
+ .bi_rw = job->rw,
.mem.type = DM_IO_PAGE_LIST,
.mem.ptr.pl = job->pages,
.mem.offset = job->offset,
@@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job)
.client = job->kc->io_client,
};
- if (job->rw == READ)
+ if (job->rw == READ) {
r = dm_io(&io_req, 1, &job->source, NULL);
- else
+ prepare_unplug(job->kc, READ, job->source.bdev);
+ } else {
+ if (job->num_dests > 1)
+ io_req.bi_rw |= REQ_UNPLUG;
r = dm_io(&io_req, job->num_dests, job->dests, NULL);
+ if (!(io_req.bi_rw & REQ_UNPLUG))
+ prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
+ }
return r;
}
@@ -435,10 +473,18 @@ static void do_work(struct work_struct *work)
* Pages jobs when successful will jump onto the io jobs
* list. io jobs call wake when they complete and it all
* starts again.
+ *
+ * Note that io_jobs add block devices to the unplug array,
+ * this array is cleared with "unplug" calls. It is thus
+ * forbidden to run complete_jobs after io_jobs and before
+ * unplug because the block device could be destroyed in
+ * job completion callback.
*/
process_jobs(&kc->complete_jobs, kc, run_complete_job);
process_jobs(&kc->pages_jobs, kc, run_pages_job);
process_jobs(&kc->io_jobs, kc, run_io_job);
+ unplug(kc, READ);
+ unplug(kc, WRITE);
}
/*
@@ -619,12 +665,15 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs);
+ memset(kc->unplug, 0, sizeof(kc->unplug));
+
kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
if (!kc->job_pool)
goto bad_slab;
INIT_WORK(&kc->kcopyd_work, do_work);
- kc->kcopyd_wq = create_singlethread_workqueue("kcopyd");
+ kc->kcopyd_wq = alloc_workqueue("kcopyd",
+ WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
if (!kc->kcopyd_wq)
goto bad_workqueue;
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1ed0094f064b..aa2e0c374ab3 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -12,12 +12,22 @@
#include "dm-log-userspace-transfer.h"
+#define DM_LOG_USERSPACE_VSN "1.1.0"
+
struct flush_entry {
int type;
region_t region;
struct list_head list;
};
+/*
+ * This limit on the number of mark and clear request is, to a degree,
+ * arbitrary. However, there is some basis for the choice in the limits
+ * imposed on the size of data payload by dm-log-userspace-transfer.c:
+ * dm_consult_userspace().
+ */
+#define MAX_FLUSH_GROUP_COUNT 32
+
struct log_c {
struct dm_target *ti;
uint32_t region_size;
@@ -37,8 +47,15 @@ struct log_c {
*/
uint64_t in_sync_hint;
+ /*
+ * Mark and clear requests are held until a flush is issued
+ * so that we can group, and thereby limit, the amount of
+ * network traffic between kernel and userspace. The 'flush_lock'
+ * is used to protect these lists.
+ */
spinlock_t flush_lock;
- struct list_head flush_list; /* only for clear and mark requests */
+ struct list_head mark_list;
+ struct list_head clear_list;
};
static mempool_t *flush_entry_pool;
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
strncpy(lc->uuid, argv[0], DM_UUID_LEN);
spin_lock_init(&lc->flush_lock);
- INIT_LIST_HEAD(&lc->flush_list);
+ INIT_LIST_HEAD(&lc->mark_list);
+ INIT_LIST_HEAD(&lc->clear_list);
str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
if (str_size < 0) {
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
ctr_str, str_size, NULL, NULL);
- if (r == -ESRCH) {
- DMERR("Userspace log server not found");
+ if (r < 0) {
+ if (r == -ESRCH)
+ DMERR("Userspace log server not found");
+ else
+ DMERR("Userspace log server failed to create log");
goto out;
}
@@ -214,10 +235,9 @@ out:
static void userspace_dtr(struct dm_dirty_log *log)
{
- int r;
struct log_c *lc = log->context;
- r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
+ (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
NULL, 0,
NULL, NULL);
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
return (r) ? 0 : (int)in_sync;
}
+static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
+{
+ int r = 0;
+ struct flush_entry *fe;
+
+ list_for_each_entry(fe, flush_list, list) {
+ r = userspace_do_request(lc, lc->uuid, fe->type,
+ (char *)&fe->region,
+ sizeof(fe->region),
+ NULL, NULL);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
+{
+ int r = 0;
+ int count;
+ uint32_t type = 0;
+ struct flush_entry *fe, *tmp_fe;
+ LIST_HEAD(tmp_list);
+ uint64_t group[MAX_FLUSH_GROUP_COUNT];
+
+ /*
+ * Group process the requests
+ */
+ while (!list_empty(flush_list)) {
+ count = 0;
+
+ list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
+ group[count] = fe->region;
+ count++;
+
+ list_del(&fe->list);
+ list_add(&fe->list, &tmp_list);
+
+ type = fe->type;
+ if (count >= MAX_FLUSH_GROUP_COUNT)
+ break;
+ }
+
+ r = userspace_do_request(lc, lc->uuid, type,
+ (char *)(group),
+ count * sizeof(uint64_t),
+ NULL, NULL);
+ if (r) {
+ /* Group send failed. Attempt one-by-one. */
+ list_splice_init(&tmp_list, flush_list);
+ r = flush_one_by_one(lc, flush_list);
+ break;
+ }
+ }
+
+ /*
+ * Must collect flush_entrys that were successfully processed
+ * as a group so that they will be free'd by the caller.
+ */
+ list_splice_init(&tmp_list, flush_list);
+
+ return r;
+}
+
/*
* userspace_flush
*
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log)
int r = 0;
unsigned long flags;
struct log_c *lc = log->context;
- LIST_HEAD(flush_list);
+ LIST_HEAD(mark_list);
+ LIST_HEAD(clear_list);
struct flush_entry *fe, *tmp_fe;
spin_lock_irqsave(&lc->flush_lock, flags);
- list_splice_init(&lc->flush_list, &flush_list);
+ list_splice_init(&lc->mark_list, &mark_list);
+ list_splice_init(&lc->clear_list, &clear_list);
spin_unlock_irqrestore(&lc->flush_lock, flags);
- if (list_empty(&flush_list))
+ if (list_empty(&mark_list) && list_empty(&clear_list))
return 0;
- /*
- * FIXME: Count up requests, group request types,
- * allocate memory to stick all requests in and
- * send to server in one go. Failing the allocation,
- * do it one by one.
- */
+ r = flush_by_group(lc, &mark_list);
+ if (r)
+ goto fail;
- list_for_each_entry(fe, &flush_list, list) {
- r = userspace_do_request(lc, lc->uuid, fe->type,
- (char *)&fe->region,
- sizeof(fe->region),
- NULL, NULL);
- if (r)
- goto fail;
- }
+ r = flush_by_group(lc, &clear_list);
+ if (r)
+ goto fail;
r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
NULL, 0, NULL, NULL);
@@ -395,7 +474,11 @@ fail:
* Calling code will receive an error and will know that
* the log facility has failed.
*/
- list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
+ list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
+ list_del(&fe->list);
+ mempool_free(fe, flush_entry_pool);
+ }
+ list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
list_del(&fe->list);
mempool_free(fe, flush_entry_pool);
}
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
spin_lock_irqsave(&lc->flush_lock, flags);
fe->type = DM_ULOG_MARK_REGION;
fe->region = region;
- list_add(&fe->list, &lc->flush_list);
+ list_add(&fe->list, &lc->mark_list);
spin_unlock_irqrestore(&lc->flush_lock, flags);
return;
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
spin_lock_irqsave(&lc->flush_lock, flags);
fe->type = DM_ULOG_CLEAR_REGION;
fe->region = region;
- list_add(&fe->list, &lc->flush_list);
+ list_add(&fe->list, &lc->clear_list);
spin_unlock_irqrestore(&lc->flush_lock, flags);
return;
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
return r;
}
- DMINFO("version 1.0.0 loaded");
+ DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
return 0;
}
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
dm_ulog_tfr_exit();
mempool_destroy(flush_entry_pool);
- DMINFO("version 1.0.0 unloaded");
+ DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
return;
}
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 075cbcf8a9f5..049eaf12aaab 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -198,6 +198,7 @@ resend:
memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+ tfr->version = DM_ULOG_REQUEST_VERSION;
tfr->luid = luid;
tfr->seq = dm_ulog_seq++;
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 33420e68d153..6951536ea29c 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
r = PTR_ERR(lc->io_req.client);
DMWARN("couldn't allocate disk io client");
kfree(lc);
- return -ENOMEM;
+ return r;
}
lc->disk_header = vmalloc(buf_size);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 487ecda90ad4..b82d28819e2a 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,8 @@
#define DM_MSG_PREFIX "multipath"
#define MESG_STR(x) x, sizeof(x)
+#define DM_PG_INIT_DELAY_MSECS 2000
+#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
/* Path properties */
struct pgpath {
@@ -33,8 +35,7 @@ struct pgpath {
unsigned fail_count; /* Cumulative failure count */
struct dm_path path;
- struct work_struct deactivate_path;
- struct work_struct activate_path;
+ struct delayed_work activate_path;
};
#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -65,11 +66,15 @@ struct multipath {
const char *hw_handler_name;
char *hw_handler_params;
+
unsigned nr_priority_groups;
struct list_head priority_groups;
+
+ wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+
unsigned pg_init_required; /* pg_init needs calling? */
unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
- wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+ unsigned pg_init_delay_retry; /* Delay pg_init retry? */
unsigned nr_valid_paths; /* Total number of usable paths */
struct pgpath *current_pgpath;
@@ -82,6 +87,7 @@ struct multipath {
unsigned saved_queue_if_no_path;/* Saved state during suspension */
unsigned pg_init_retries; /* Number of times to retry pg_init */
unsigned pg_init_count; /* Number of times pg_init called */
+ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
struct work_struct process_queued_ios;
struct list_head queued_ios;
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void process_queued_ios(struct work_struct *work);
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
-static void deactivate_path(struct work_struct *work);
/*-----------------------------------------------
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void)
if (pgpath) {
pgpath->is_active = 1;
- INIT_WORK(&pgpath->deactivate_path, deactivate_path);
- INIT_WORK(&pgpath->activate_path, activate_path);
+ INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
}
return pgpath;
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath)
kfree(pgpath);
}
-static void deactivate_path(struct work_struct *work)
-{
- struct pgpath *pgpath =
- container_of(work, struct pgpath, deactivate_path);
-
- blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
-}
-
static struct priority_group *alloc_priority_group(void)
{
struct priority_group *pg;
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
INIT_LIST_HEAD(&m->queued_ios);
spin_lock_init(&m->lock);
m->queue_io = 1;
+ m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
INIT_WORK(&m->process_queued_ios, process_queued_ios);
INIT_WORK(&m->trigger_event, trigger_event);
init_waitqueue_head(&m->pg_init_wait);
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m)
static void __pg_init_all_paths(struct multipath *m)
{
struct pgpath *pgpath;
+ unsigned long pg_init_delay = 0;
m->pg_init_count++;
m->pg_init_required = 0;
+ if (m->pg_init_delay_retry)
+ pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
+ m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
/* Skip failed paths */
if (!pgpath->is_active)
continue;
- if (queue_work(kmpath_handlerd, &pgpath->activate_path))
+ if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
+ pg_init_delay))
m->pg_init_in_progress++;
}
}
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m)
const char *param_name;
static struct param _params[] = {
- {0, 3, "invalid number of feature args"},
+ {0, 5, "invalid number of feature args"},
{1, 50, "pg_init_retries must be between 1 and 50"},
+ {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
};
r = read_param(_params, shift(as), &argc, &ti->error);
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m)
continue;
}
+ if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
+ (argc >= 1)) {
+ r = read_param(_params + 2, shift(as),
+ &m->pg_init_delay_msecs, &ti->error);
+ argc--;
+ continue;
+ }
+
ti->error = "Unrecognised multipath feature request";
r = -EINVAL;
} while (argc && !r);
@@ -931,7 +942,7 @@ static void flush_multipath_work(struct multipath *m)
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
flush_workqueue(kmultipathd);
- flush_scheduled_work();
+ flush_work_sync(&m->trigger_event);
}
static void multipath_dtr(struct dm_target *ti)
@@ -995,7 +1006,6 @@ static int fail_path(struct pgpath *pgpath)
pgpath->path.dev->name, m->nr_valid_paths);
schedule_work(&m->trigger_event);
- queue_work(kmultipathd, &pgpath->deactivate_path);
out:
spin_unlock_irqrestore(&m->lock, flags);
@@ -1034,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath)
m->current_pgpath = NULL;
queue_work(kmultipathd, &m->process_queued_ios);
} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
- if (queue_work(kmpath_handlerd, &pgpath->activate_path))
+ if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
m->pg_init_in_progress++;
}
@@ -1169,6 +1179,7 @@ static void pg_init_done(void *data, int errors)
struct priority_group *pg = pgpath->pg;
struct multipath *m = pg->m;
unsigned long flags;
+ unsigned delay_retry = 0;
/* device or driver problems */
switch (errors) {
@@ -1193,8 +1204,9 @@ static void pg_init_done(void *data, int errors)
*/
bypass_pg(m, pg, 1);
break;
- /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
case SCSI_DH_RETRY:
+ /* Wait before retrying. */
+ delay_retry = 1;
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
if (pg_init_limit_reached(m, pgpath))
@@ -1227,6 +1239,7 @@ static void pg_init_done(void *data, int errors)
if (!m->pg_init_required)
m->queue_io = 0;
+ m->pg_init_delay_retry = delay_retry;
queue_work(kmultipathd, &m->process_queued_ios);
/*
@@ -1241,7 +1254,7 @@ out:
static void activate_path(struct work_struct *work)
{
struct pgpath *pgpath =
- container_of(work, struct pgpath, activate_path);
+ container_of(work, struct pgpath, activate_path.work);
scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
pg_init_done, pgpath);
@@ -1382,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
else {
DMEMIT("%u ", m->queue_if_no_path +
- (m->pg_init_retries > 0) * 2);
+ (m->pg_init_retries > 0) * 2 +
+ (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
if (m->queue_if_no_path)
DMEMIT("queue_if_no_path ");
if (m->pg_init_retries)
DMEMIT("pg_init_retries %u ", m->pg_init_retries);
+ if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
+ DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
}
if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1655,7 +1671,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 1, 1},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
@@ -1687,7 +1703,7 @@ static int __init dm_multipath_init(void)
return -EINVAL;
}
- kmultipathd = create_workqueue("kmpathd");
+ kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
dm_unregister_target(&multipath_target);
@@ -1701,7 +1717,8 @@ static int __init dm_multipath_init(void)
* old workqueue would also create a bottleneck in the
* path of the storage hardware device activation.
*/
- kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd");
+ kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
+ WQ_MEM_RECLAIM);
if (!kmpath_handlerd) {
DMERR("failed to create workqueue kmpath_handlerd");
destroy_workqueue(kmultipathd);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 000000000000..b9e1e15ef11c
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (C) 2010-2011 Neil Brown
+ * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/slab.h>
+
+#include "md.h"
+#include "raid5.h"
+#include "dm.h"
+#include "bitmap.h"
+
+#define DM_MSG_PREFIX "raid"
+
+/*
+ * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
+ * make it so the flag doesn't set anything.
+ */
+#ifndef MD_SYNC_STATE_FORCED
+#define MD_SYNC_STATE_FORCED 0
+#endif
+
+struct raid_dev {
+ /*
+ * Two DM devices, one to hold metadata and one to hold the
+ * actual data/parity. The reason for this is to not confuse
+ * ti->len and give more flexibility in altering size and
+ * characteristics.
+ *
+ * While it is possible for this device to be associated
+ * with a different physical device than the data_dev, it
+ * is intended for it to be the same.
+ * |--------- Physical Device ---------|
+ * |- meta_dev -|------ data_dev ------|
+ */
+ struct dm_dev *meta_dev;
+ struct dm_dev *data_dev;
+ struct mdk_rdev_s rdev;
+};
+
+/*
+ * Flags for rs->print_flags field.
+ */
+#define DMPF_DAEMON_SLEEP 0x1
+#define DMPF_MAX_WRITE_BEHIND 0x2
+#define DMPF_SYNC 0x4
+#define DMPF_NOSYNC 0x8
+#define DMPF_STRIPE_CACHE 0x10
+#define DMPF_MIN_RECOVERY_RATE 0x20
+#define DMPF_MAX_RECOVERY_RATE 0x40
+
+struct raid_set {
+ struct dm_target *ti;
+
+ uint64_t print_flags;
+
+ struct mddev_s md;
+ struct raid_type *raid_type;
+ struct dm_target_callbacks callbacks;
+
+ struct raid_dev dev[0];
+};
+
+/* Supported raid types and properties. */
+static struct raid_type {
+ const char *name; /* RAID algorithm. */
+ const char *descr; /* Descriptor text for logging. */
+ const unsigned parity_devs; /* # of parity devices. */
+ const unsigned minimal_devs; /* minimal # of devices in set. */
+ const unsigned level; /* RAID level. */
+ const unsigned algorithm; /* RAID algorithm. */
+} raid_types[] = {
+ {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
+ {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
+ {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
+ {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
+ {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
+ {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
+ {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
+ {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+};
+
+static struct raid_type *get_raid_type(char *name)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(raid_types); i++)
+ if (!strcmp(raid_types[i].name, name))
+ return &raid_types[i];
+
+ return NULL;
+}
+
+static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
+{
+ unsigned i;
+ struct raid_set *rs;
+ sector_t sectors_per_dev;
+
+ if (raid_devs <= raid_type->parity_devs) {
+ ti->error = "Insufficient number of devices";
+ return ERR_PTR(-EINVAL);
+ }
+
+ sectors_per_dev = ti->len;
+ if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+ ti->error = "Target length not divisible by number of data devices";
+ return ERR_PTR(-EINVAL);
+ }
+
+ rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
+ if (!rs) {
+ ti->error = "Cannot allocate raid context";
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mddev_init(&rs->md);
+
+ rs->ti = ti;
+ rs->raid_type = raid_type;
+ rs->md.raid_disks = raid_devs;
+ rs->md.level = raid_type->level;
+ rs->md.new_level = rs->md.level;
+ rs->md.dev_sectors = sectors_per_dev;
+ rs->md.layout = raid_type->algorithm;
+ rs->md.new_layout = rs->md.layout;
+ rs->md.delta_disks = 0;
+ rs->md.recovery_cp = 0;
+
+ for (i = 0; i < raid_devs; i++)
+ md_rdev_init(&rs->dev[i].rdev);
+
+ /*
+ * Remaining items to be initialized by further RAID params:
+ * rs->md.persistent
+ * rs->md.external
+ * rs->md.chunk_sectors
+ * rs->md.new_chunk_sectors
+ */
+
+ return rs;
+}
+
+static void context_free(struct raid_set *rs)
+{
+ int i;
+
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev)
+ dm_put_device(rs->ti, rs->dev[i].data_dev);
+
+ kfree(rs);
+}
+
+/*
+ * For every device we have two words
+ * <meta_dev>: meta device name or '-' if missing
+ * <data_dev>: data device name or '-' if missing
+ *
+ * This code parses those words.
+ */
+static int dev_parms(struct raid_set *rs, char **argv)
+{
+ int i;
+ int rebuild = 0;
+ int metadata_available = 0;
+ int ret = 0;
+
+ for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+ rs->dev[i].rdev.raid_disk = i;
+
+ rs->dev[i].meta_dev = NULL;
+ rs->dev[i].data_dev = NULL;
+
+ /*
+ * There are no offsets, since there is a separate device
+ * for data and metadata.
+ */
+ rs->dev[i].rdev.data_offset = 0;
+ rs->dev[i].rdev.mddev = &rs->md;
+
+ if (strcmp(argv[0], "-")) {
+ rs->ti->error = "Metadata devices not supported";
+ return -EINVAL;
+ }
+
+ if (!strcmp(argv[1], "-")) {
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
+ (!rs->dev[i].rdev.recovery_offset)) {
+ rs->ti->error = "Drive designated for rebuild not specified";
+ return -EINVAL;
+ }
+
+ continue;
+ }
+
+ ret = dm_get_device(rs->ti, argv[1],
+ dm_table_get_mode(rs->ti->table),
+ &rs->dev[i].data_dev);
+ if (ret) {
+ rs->ti->error = "RAID device lookup failure";
+ return ret;
+ }
+
+ rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
+ list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+ rebuild++;
+ }
+
+ if (metadata_available) {
+ rs->md.external = 0;
+ rs->md.persistent = 1;
+ rs->md.major_version = 2;
+ } else if (rebuild && !rs->md.recovery_cp) {
+ /*
+ * Without metadata, we will not be able to tell if the array
+ * is in-sync or not - we must assume it is not. Therefore,
+ * it is impossible to rebuild a drive.
+ *
+ * Even if there is metadata, the on-disk information may
+ * indicate that the array is not in-sync and it will then
+ * fail at that time.
+ *
+ * User could specify 'nosync' option if desperate.
+ */
+ DMERR("Unable to rebuild drive while array is not in-sync");
+ rs->ti->error = "RAID device lookup failure";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Possible arguments are...
+ * RAID456:
+ * <chunk_size> [optional_args]
+ *
+ * Optional args:
+ * [[no]sync] Force or prevent recovery of the entire array
+ * [rebuild <idx>] Rebuild the drive indicated by the index
+ * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
+ * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
+ * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
+ */
+static int parse_raid_params(struct raid_set *rs, char **argv,
+ unsigned num_raid_params)
+{
+ unsigned i, rebuild_cnt = 0;
+ unsigned long value;
+ char *key;
+
+ /*
+ * First, parse the in-order required arguments
+ */
+ if ((strict_strtoul(argv[0], 10, &value) < 0) ||
+ !is_power_of_2(value) || (value < 8)) {
+ rs->ti->error = "Bad chunk size";
+ return -EINVAL;
+ }
+
+ rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
+ argv++;
+ num_raid_params--;
+
+ /*
+ * Second, parse the unordered optional arguments
+ */
+ for (i = 0; i < rs->md.raid_disks; i++)
+ set_bit(In_sync, &rs->dev[i].rdev.flags);
+
+ for (i = 0; i < num_raid_params; i++) {
+ if (!strcmp(argv[i], "nosync")) {
+ rs->md.recovery_cp = MaxSector;
+ rs->print_flags |= DMPF_NOSYNC;
+ rs->md.flags |= MD_SYNC_STATE_FORCED;
+ continue;
+ }
+ if (!strcmp(argv[i], "sync")) {
+ rs->md.recovery_cp = 0;
+ rs->print_flags |= DMPF_SYNC;
+ rs->md.flags |= MD_SYNC_STATE_FORCED;
+ continue;
+ }
+
+ /* The rest of the optional arguments come in key/value pairs */
+ if ((i + 1) >= num_raid_params) {
+ rs->ti->error = "Wrong number of raid parameters given";
+ return -EINVAL;
+ }
+
+ key = argv[i++];
+ if (strict_strtoul(argv[i], 10, &value) < 0) {
+ rs->ti->error = "Bad numerical argument given in raid params";
+ return -EINVAL;
+ }
+
+ if (!strcmp(key, "rebuild")) {
+ if (++rebuild_cnt > rs->raid_type->parity_devs) {
+ rs->ti->error = "Too many rebuild drives given";
+ return -EINVAL;
+ }
+ if (value > rs->md.raid_disks) {
+ rs->ti->error = "Invalid rebuild index given";
+ return -EINVAL;
+ }
+ clear_bit(In_sync, &rs->dev[value].rdev.flags);
+ rs->dev[value].rdev.recovery_offset = 0;
+ } else if (!strcmp(key, "max_write_behind")) {
+ rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+
+ /*
+ * In device-mapper, we specify things in sectors, but
+ * MD records this value in kB
+ */
+ value /= 2;
+ if (value > COUNTER_MAX) {
+ rs->ti->error = "Max write-behind limit out of range";
+ return -EINVAL;
+ }
+ rs->md.bitmap_info.max_write_behind = value;
+ } else if (!strcmp(key, "daemon_sleep")) {
+ rs->print_flags |= DMPF_DAEMON_SLEEP;
+ if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
+ rs->ti->error = "daemon sleep period out of range";
+ return -EINVAL;
+ }
+ rs->md.bitmap_info.daemon_sleep = value;
+ } else if (!strcmp(key, "stripe_cache")) {
+ rs->print_flags |= DMPF_STRIPE_CACHE;
+
+ /*
+ * In device-mapper, we specify things in sectors, but
+ * MD records this value in kB
+ */
+ value /= 2;
+
+ if (rs->raid_type->level < 5) {
+ rs->ti->error = "Inappropriate argument: stripe_cache";
+ return -EINVAL;
+ }
+ if (raid5_set_cache_size(&rs->md, (int)value)) {
+ rs->ti->error = "Bad stripe_cache size";
+ return -EINVAL;
+ }
+ } else if (!strcmp(key, "min_recovery_rate")) {
+ rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+ if (value > INT_MAX) {
+ rs->ti->error = "min_recovery_rate out of range";
+ return -EINVAL;
+ }
+ rs->md.sync_speed_min = (int)value;
+ } else if (!strcmp(key, "max_recovery_rate")) {
+ rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+ if (value > INT_MAX) {
+ rs->ti->error = "max_recovery_rate out of range";
+ return -EINVAL;
+ }
+ rs->md.sync_speed_max = (int)value;
+ } else {
+ DMERR("Unable to parse RAID parameter: %s", key);
+ rs->ti->error = "Unable to parse RAID parameters";
+ return -EINVAL;
+ }
+ }
+
+ /* Assume there are no metadata devices until the drives are parsed */
+ rs->md.persistent = 0;
+ rs->md.external = 1;
+
+ return 0;
+}
+
+static void do_table_event(struct work_struct *ws)
+{
+ struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
+
+ dm_table_event(rs->ti->table);
+}
+
+static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
+{
+ struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
+
+ return md_raid5_congested(&rs->md, bits);
+}
+
+static void raid_unplug(struct dm_target_callbacks *cb)
+{
+ struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
+
+ md_raid5_unplug_device(rs->md.private);
+}
+
+/*
+ * Construct a RAID4/5/6 mapping:
+ * Args:
+ * <raid_type> <#raid_params> <raid_params> \
+ * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
+ *
+ * ** metadata devices are not supported yet, use '-' instead **
+ *
+ * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
+ * details on possible <raid_params>.
+ */
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int ret;
+ struct raid_type *rt;
+ unsigned long num_raid_params, num_raid_devs;
+ struct raid_set *rs = NULL;
+
+ /* Must have at least <raid_type> <#raid_params> */
+ if (argc < 2) {
+ ti->error = "Too few arguments";
+ return -EINVAL;
+ }
+
+ /* raid type */
+ rt = get_raid_type(argv[0]);
+ if (!rt) {
+ ti->error = "Unrecognised raid_type";
+ return -EINVAL;
+ }
+ argc--;
+ argv++;
+
+ /* number of RAID parameters */
+ if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
+ ti->error = "Cannot understand number of RAID parameters";
+ return -EINVAL;
+ }
+ argc--;
+ argv++;
+
+ /* Skip over RAID params for now and find out # of devices */
+ if (num_raid_params + 1 > argc) {
+ ti->error = "Arguments do not agree with counts given";
+ return -EINVAL;
+ }
+
+ if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
+ (num_raid_devs >= INT_MAX)) {
+ ti->error = "Cannot understand number of raid devices";
+ return -EINVAL;
+ }
+
+ rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
+ if (IS_ERR(rs))
+ return PTR_ERR(rs);
+
+ ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
+ if (ret)
+ goto bad;
+
+ ret = -EINVAL;
+
+ argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+ argv += num_raid_params + 1;
+
+ if (argc != (num_raid_devs * 2)) {
+ ti->error = "Supplied RAID devices does not match the count given";
+ goto bad;
+ }
+
+ ret = dev_parms(rs, argv);
+ if (ret)
+ goto bad;
+
+ INIT_WORK(&rs->md.event_work, do_table_event);
+ ti->split_io = rs->md.chunk_sectors;
+ ti->private = rs;
+
+ mutex_lock(&rs->md.reconfig_mutex);
+ ret = md_run(&rs->md);
+ rs->md.in_sync = 0; /* Assume already marked dirty */
+ mutex_unlock(&rs->md.reconfig_mutex);
+
+ if (ret) {
+ ti->error = "Fail to run raid array";
+ goto bad;
+ }
+
+ rs->callbacks.congested_fn = raid_is_congested;
+ rs->callbacks.unplug_fn = raid_unplug;
+ dm_table_add_target_callbacks(ti->table, &rs->callbacks);
+
+ return 0;
+
+bad:
+ context_free(rs);
+
+ return ret;
+}
+
+static void raid_dtr(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ list_del_init(&rs->callbacks.list);
+ md_stop(&rs->md);
+ context_free(rs);
+}
+
+static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
+{
+ struct raid_set *rs = ti->private;
+ mddev_t *mddev = &rs->md;
+
+ mddev->pers->make_request(mddev, bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+static int raid_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ struct raid_set *rs = ti->private;
+ unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
+ unsigned sz = 0;
+ int i;
+ sector_t sync;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (test_bit(Faulty, &rs->dev[i].rdev.flags))
+ DMEMIT("D");
+ else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
+ DMEMIT("A");
+ else
+ DMEMIT("a");
+ }
+
+ if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+ sync = rs->md.curr_resync_completed;
+ else
+ sync = rs->md.recovery_cp;
+
+ if (sync > rs->md.resync_max_sectors)
+ sync = rs->md.resync_max_sectors;
+
+ DMEMIT(" %llu/%llu",
+ (unsigned long long) sync,
+ (unsigned long long) rs->md.resync_max_sectors);
+
+ break;
+ case STATUSTYPE_TABLE:
+ /* The string you would use to construct this array */
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev &&
+ !test_bit(In_sync, &rs->dev[i].rdev.flags))
+ raid_param_cnt++; /* for rebuilds */
+
+ raid_param_cnt += (hweight64(rs->print_flags) * 2);
+ if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+ raid_param_cnt--;
+
+ DMEMIT("%s %u %u", rs->raid_type->name,
+ raid_param_cnt, rs->md.chunk_sectors);
+
+ if ((rs->print_flags & DMPF_SYNC) &&
+ (rs->md.recovery_cp == MaxSector))
+ DMEMIT(" sync");
+ if (rs->print_flags & DMPF_NOSYNC)
+ DMEMIT(" nosync");
+
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev &&
+ !test_bit(In_sync, &rs->dev[i].rdev.flags))
+ DMEMIT(" rebuild %u", i);
+
+ if (rs->print_flags & DMPF_DAEMON_SLEEP)
+ DMEMIT(" daemon_sleep %lu",
+ rs->md.bitmap_info.daemon_sleep);
+
+ if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+ DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
+
+ if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+ DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
+
+ if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+ DMEMIT(" max_write_behind %lu",
+ rs->md.bitmap_info.max_write_behind);
+
+ if (rs->print_flags & DMPF_STRIPE_CACHE) {
+ raid5_conf_t *conf = rs->md.private;
+
+ /* convert from kiB to sectors */
+ DMEMIT(" stripe_cache %d",
+ conf ? conf->max_nr_stripes * 2 : 0);
+ }
+
+ DMEMIT(" %d", rs->md.raid_disks);
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ DMEMIT(" -"); /* metadata device */
+
+ if (rs->dev[i].data_dev)
+ DMEMIT(" %s", rs->dev[i].data_dev->name);
+ else
+ DMEMIT(" -");
+ }
+ }
+
+ return 0;
+}
+
+static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+ struct raid_set *rs = ti->private;
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; !ret && i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev)
+ ret = fn(ti,
+ rs->dev[i].data_dev,
+ 0, /* No offset on data devs */
+ rs->md.dev_sectors,
+ data);
+
+ return ret;
+}
+
+static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct raid_set *rs = ti->private;
+ unsigned chunk_size = rs->md.chunk_sectors << 9;
+ raid5_conf_t *conf = rs->md.private;
+
+ blk_limits_io_min(limits, chunk_size);
+ blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
+}
+
+static void raid_presuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ md_stop_writes(&rs->md);
+}
+
+static void raid_postsuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ mddev_suspend(&rs->md);
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ mddev_resume(&rs->md);
+}
+
+static struct target_type raid_target = {
+ .name = "raid",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = raid_ctr,
+ .dtr = raid_dtr,
+ .map = raid_map,
+ .status = raid_status,
+ .iterate_devices = raid_iterate_devices,
+ .io_hints = raid_io_hints,
+ .presuspend = raid_presuspend,
+ .postsuspend = raid_postsuspend,
+ .resume = raid_resume,
+};
+
+static int __init dm_raid_init(void)
+{
+ return dm_register_target(&raid_target);
+}
+
+static void __exit dm_raid_exit(void)
+{
+ dm_unregister_target(&raid_target);
+}
+
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
+MODULE_ALIAS("dm-raid6");
+MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 19a59b041c27..dee326775c60 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
struct dm_io_request io_req = {
.bi_rw = WRITE_FLUSH,
.mem.type = DM_IO_KMEM,
- .mem.ptr.bvec = NULL,
+ .mem.ptr.addr = NULL,
.client = ms->io_client,
};
@@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
.client = ms->io_client,
};
+ if (bio->bi_rw & REQ_DISCARD) {
+ io_req.bi_rw |= REQ_DISCARD;
+ io_req.mem.type = DM_IO_KMEM;
+ io_req.mem.ptr.addr = NULL;
+ }
+
for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
map_region(dest++, m, bio);
@@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
bio_list_init(&requeue);
while ((bio = bio_list_pop(writes))) {
- if (bio->bi_rw & REQ_FLUSH) {
+ if ((bio->bi_rw & REQ_FLUSH) ||
+ (bio->bi_rw & REQ_DISCARD)) {
bio_list_add(&sync, bio);
continue;
}
@@ -1076,8 +1083,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = ms;
ti->split_io = dm_rh_get_region_size(ms->rh);
ti->num_flush_requests = 1;
+ ti->num_discard_requests = 1;
- ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
+ ms->kmirrord_wq = alloc_workqueue("kmirrord",
+ WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
if (!ms->kmirrord_wq) {
DMERR("couldn't start kmirrord");
r = -ENOMEM;
@@ -1130,7 +1139,7 @@ static void mirror_dtr(struct dm_target *ti)
del_timer_sync(&ms->timer);
flush_workqueue(ms->kmirrord_wq);
- flush_scheduled_work();
+ flush_work_sync(&ms->trigger_event);
dm_kcopyd_client_destroy(ms->kcopyd_client);
destroy_workqueue(ms->kmirrord_wq);
free_context(ms, ti, ms->nr_mirrors);
@@ -1406,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 12, 0},
+ .version = {1, 12, 1},
.module = THIS_MODULE,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2129cdb115dc..95891dfcbca0 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
*/
INIT_WORK_ONSTACK(&req.work, do_metadata);
queue_work(ps->metadata_wq, &req.work);
- flush_workqueue(ps->metadata_wq);
+ flush_work(&req.work);
return req.result;
}
@@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store,
atomic_set(&ps->pending_count, 0);
ps->callbacks = NULL;
- ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
+ ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
if (!ps->metadata_wq) {
kfree(ps);
DMERR("couldn't start header metadata update thread");
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 53cf79d8bcbc..fdde53cd12b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
-#include <linux/workqueue.h>
#include "dm-exception-store.h"
@@ -80,9 +79,6 @@ struct dm_snapshot {
/* Origin writes don't trigger exceptions until this is set */
int active;
- /* Whether or not owning mapped_device is suspended */
- int suspended;
-
atomic_t pending_exceptions_count;
mempool_t *pending_pool;
@@ -106,10 +102,6 @@ struct dm_snapshot {
struct dm_kcopyd_client *kcopyd_client;
- /* Queue of snapshot writes for ksnapd to flush */
- struct bio_list queued_bios;
- struct work_struct queued_bios_work;
-
/* Wait for events based on state_bits */
unsigned long state_bits;
@@ -160,9 +152,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
}
EXPORT_SYMBOL(dm_snap_cow);
-static struct workqueue_struct *ksnapd;
-static void flush_queued_bios(struct work_struct *work);
-
static sector_t chunk_to_sector(struct dm_exception_store *store,
chunk_t chunk)
{
@@ -1110,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->ti = ti;
s->valid = 1;
s->active = 0;
- s->suspended = 0;
atomic_set(&s->pending_exceptions_count, 0);
init_rwsem(&s->lock);
INIT_LIST_HEAD(&s->list);
@@ -1153,9 +1141,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
spin_lock_init(&s->tracked_chunk_lock);
- bio_list_init(&s->queued_bios);
- INIT_WORK(&s->queued_bios_work, flush_queued_bios);
-
ti->private = s;
ti->num_flush_requests = num_flush_requests;
@@ -1279,8 +1264,6 @@ static void snapshot_dtr(struct dm_target *ti)
struct dm_snapshot *s = ti->private;
struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
- flush_workqueue(ksnapd);
-
down_read(&_origins_lock);
/* Check whether exception handover must be cancelled */
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
@@ -1342,20 +1325,6 @@ static void flush_bios(struct bio *bio)
}
}
-static void flush_queued_bios(struct work_struct *work)
-{
- struct dm_snapshot *s =
- container_of(work, struct dm_snapshot, queued_bios_work);
- struct bio *queued_bios;
- unsigned long flags;
-
- spin_lock_irqsave(&s->pe_lock, flags);
- queued_bios = bio_list_get(&s->queued_bios);
- spin_unlock_irqrestore(&s->pe_lock, flags);
-
- flush_bios(queued_bios);
-}
-
static int do_origin(struct dm_dev *origin, struct bio *bio);
/*
@@ -1760,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti)
stop_merge(s);
}
-static void snapshot_postsuspend(struct dm_target *ti)
-{
- struct dm_snapshot *s = ti->private;
-
- down_write(&s->lock);
- s->suspended = 1;
- up_write(&s->lock);
-}
-
static int snapshot_preresume(struct dm_target *ti)
{
int r = 0;
@@ -1783,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti)
DMERR("Unable to resume snapshot source until "
"handover completes.");
r = -EINVAL;
- } else if (!snap_src->suspended) {
+ } else if (!dm_suspended(snap_src->ti)) {
DMERR("Unable to perform snapshot handover until "
"source is suspended.");
r = -EINVAL;
@@ -1816,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti)
down_write(&s->lock);
s->active = 1;
- s->suspended = 0;
up_write(&s->lock);
}
@@ -2194,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti,
static struct target_type origin_target = {
.name = "snapshot-origin",
- .version = {1, 7, 0},
+ .version = {1, 7, 1},
.module = THIS_MODULE,
.ctr = origin_ctr,
.dtr = origin_dtr,
@@ -2207,13 +2166,12 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
.map = snapshot_map,
.end_io = snapshot_end_io,
- .postsuspend = snapshot_postsuspend,
.preresume = snapshot_preresume,
.resume = snapshot_resume,
.status = snapshot_status,
@@ -2222,14 +2180,13 @@ static struct target_type snapshot_target = {
static struct target_type merge_target = {
.name = dm_snapshot_merge_target_name,
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
.map = snapshot_merge_map,
.end_io = snapshot_end_io,
.presuspend = snapshot_merge_presuspend,
- .postsuspend = snapshot_postsuspend,
.preresume = snapshot_preresume,
.resume = snapshot_merge_resume,
.status = snapshot_status,
@@ -2291,17 +2248,8 @@ static int __init dm_snapshot_init(void)
goto bad_tracked_chunk_cache;
}
- ksnapd = create_singlethread_workqueue("ksnapd");
- if (!ksnapd) {
- DMERR("Failed to create ksnapd workqueue.");
- r = -ENOMEM;
- goto bad_pending_pool;
- }
-
return 0;
-bad_pending_pool:
- kmem_cache_destroy(tracked_chunk_cache);
bad_tracked_chunk_cache:
kmem_cache_destroy(pending_cache);
bad_pending_cache:
@@ -2322,8 +2270,6 @@ bad_register_snapshot_target:
static void __exit dm_snapshot_exit(void)
{
- destroy_workqueue(ksnapd);
-
dm_unregister_target(&snapshot_target);
dm_unregister_target(&origin_target);
dm_unregister_target(&merge_target);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f0371b4c4fbf..dddfa14f2982 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -39,23 +39,20 @@ struct stripe_c {
struct dm_target *ti;
/* Work struct used for triggering events*/
- struct work_struct kstriped_ws;
+ struct work_struct trigger_event;
struct stripe stripe[0];
};
-static struct workqueue_struct *kstriped;
-
/*
* An event is triggered whenever a drive
* drops out of a stripe volume.
*/
static void trigger_event(struct work_struct *work)
{
- struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws);
-
+ struct stripe_c *sc = container_of(work, struct stripe_c,
+ trigger_event);
dm_table_event(sc->ti->table);
-
}
static inline struct stripe_c *alloc_context(unsigned int stripes)
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
- INIT_WORK(&sc->kstriped_ws, trigger_event);
+ INIT_WORK(&sc->trigger_event, trigger_event);
/* Set pointer to dm target; used in trigger_event */
sc->ti = ti;
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti)
for (i = 0; i < sc->stripes; i++)
dm_put_device(ti, sc->stripe[i].dev);
- flush_workqueue(kstriped);
+ flush_work_sync(&sc->trigger_event);
kfree(sc);
}
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
atomic_inc(&(sc->stripe[i].error_count));
if (atomic_read(&(sc->stripe[i].error_count)) <
DM_IO_ERROR_THRESHOLD)
- queue_work(kstriped, &sc->kstriped_ws);
+ schedule_work(&sc->trigger_event);
}
return error;
@@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti,
static struct target_type stripe_target = {
.name = "striped",
- .version = {1, 3, 0},
+ .version = {1, 3, 1},
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
@@ -422,20 +419,10 @@ int __init dm_stripe_init(void)
return r;
}
- kstriped = create_singlethread_workqueue("kstriped");
- if (!kstriped) {
- DMERR("failed to create workqueue kstriped");
- dm_unregister_target(&stripe_target);
- return -ENOMEM;
- }
-
return r;
}
void dm_stripe_exit(void)
{
dm_unregister_target(&stripe_target);
- destroy_workqueue(kstriped);
-
- return;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 985c20a4f30e..dffa0ac7c4f0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -71,6 +71,8 @@ struct dm_table {
void *event_context;
struct dm_md_mempools *mempools;
+
+ struct list_head target_callbacks;
};
/*
@@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
return -ENOMEM;
INIT_LIST_HEAD(&t->devices);
+ INIT_LIST_HEAD(&t->target_callbacks);
atomic_set(&t->holders, 0);
t->discards_supported = 1;
@@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t)
return 0;
}
+void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
+{
+ list_add(&cb->list, &t->target_callbacks);
+}
+EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
+
int dm_table_any_congested(struct dm_table *t, int bdi_bits)
{
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
+ struct dm_target_callbacks *cb;
int r = 0;
list_for_each_entry(dd, devices, list) {
@@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
bdevname(dd->dm_dev.bdev, b));
}
+ list_for_each_entry(cb, &t->target_callbacks, list)
+ if (cb->congested_fn)
+ r |= cb->congested_fn(cb, bdi_bits);
+
return r;
}
@@ -1264,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t)
{
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
+ struct dm_target_callbacks *cb;
list_for_each_entry(dd, devices, list) {
struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
@@ -1276,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t)
dm_device_name(t->md),
bdevname(dd->dm_dev.bdev, b));
}
+
+ list_for_each_entry(cb, &t->target_callbacks, list)
+ if (cb->unplug_fn)
+ cb->unplug_fn(cb);
}
struct mapped_device *dm_table_get_md(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f48a2f359ac4..eaa3af0e0632 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -32,7 +32,6 @@
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
-static DEFINE_MUTEX(dm_mutex);
static const char *_name = DM_NAME;
static unsigned int major = 0;
@@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mapped_device *md;
- mutex_lock(&dm_mutex);
spin_lock(&_minor_lock);
md = bdev->bd_disk->private_data;
@@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
out:
spin_unlock(&_minor_lock);
- mutex_unlock(&dm_mutex);
return md ? 0 : -ENXIO;
}
@@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
struct mapped_device *md = disk->private_data;
- mutex_lock(&dm_mutex);
+ spin_lock(&_minor_lock);
+
atomic_dec(&md->open_count);
dm_put(md);
- mutex_unlock(&dm_mutex);
+
+ spin_unlock(&_minor_lock);
return 0;
}
@@ -1638,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q)
if (map_request(ti, clone, md))
goto requeued;
- spin_lock_irq(q->queue_lock);
+ BUG_ON(!irqs_disabled());
+ spin_lock(q->queue_lock);
}
goto out;
requeued:
- spin_lock_irq(q->queue_lock);
+ BUG_ON(!irqs_disabled());
+ spin_lock(q->queue_lock);
plug_and_out:
if (!elv_queue_empty(q))
@@ -1884,7 +1885,8 @@ static struct mapped_device *alloc_dev(int minor)
add_disk(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
- md->wq = create_singlethread_workqueue("kdmflush");
+ md->wq = alloc_workqueue("kdmflush",
+ WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
if (!md->wq)
goto bad_thread;
@@ -1992,13 +1994,14 @@ static void event_callback(void *context)
wake_up(&md->eventq);
}
+/*
+ * Protected by md->suspend_lock obtained by dm_swap_table().
+ */
static void __set_size(struct mapped_device *md, sector_t size)
{
set_capacity(md->disk, size);
- mutex_lock(&md->bdev->bd_inode->i_mutex);
i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
- mutex_unlock(&md->bdev->bd_inode->i_mutex);
}
/*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7fc090ac9e28..cf8594c5ea21 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
int rv;
int cpu;
- if (mddev == NULL || mddev->pers == NULL) {
+ if (mddev == NULL || mddev->pers == NULL
+ || !mddev->ready) {
bio_io_error(bio);
return 0;
}
+ smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
if (mddev->suspended) {
DEFINE_WAIT(__wait);
@@ -703,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
}
/* return the offset of the super block in 512byte sectors */
-static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
{
- sector_t num_sectors = i_size_read(bdev->bd_inode) / 512;
+ sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
return MD_NEW_SIZE_SECTORS(num_sectors);
}
@@ -763,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
*/
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
- bio->bi_bdev = rdev->bdev;
+ bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
@@ -793,7 +795,7 @@ static void bi_complete(struct bio *bio, int error)
}
int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
- struct page *page, int rw)
+ struct page *page, int rw, bool metadata_op)
{
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
struct completion event;
@@ -801,8 +803,12 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
rw |= REQ_SYNC | REQ_UNPLUG;
- bio->bi_bdev = rdev->bdev;
- bio->bi_sector = sector;
+ bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+ rdev->meta_bdev : rdev->bdev;
+ if (metadata_op)
+ bio->bi_sector = sector + rdev->sb_start;
+ else
+ bio->bi_sector = sector + rdev->data_offset;
bio_add_page(bio, page, size, 0);
init_completion(&event);
bio->bi_private = &event;
@@ -827,7 +833,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
return 0;
- if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ))
+ if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
goto fail;
rdev->sb_loaded = 1;
return 0;
@@ -989,7 +995,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
*
* It also happens to be a multiple of 4Kb.
*/
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
ret = read_disk_sb(rdev, MD_SB_BYTES);
if (ret) return ret;
@@ -1330,7 +1336,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
return 0; /* component must fit device */
if (rdev->mddev->bitmap_info.offset)
return 0; /* can't move bitmap */
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
if (!num_sectors || num_sectors > rdev->sb_start)
num_sectors = rdev->sb_start;
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -2465,6 +2471,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (rdev2->raid_disk == slot)
return -EEXIST;
+ if (slot >= rdev->mddev->raid_disks &&
+ slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+ return -ENOSPC;
+
rdev->raid_disk = slot;
if (test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = slot;
@@ -2482,7 +2492,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
/* failure here is OK */;
/* don't wakeup anyone, leave that to userspace. */
} else {
- if (slot >= rdev->mddev->raid_disks)
+ if (slot >= rdev->mddev->raid_disks &&
+ slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC;
rdev->raid_disk = slot;
/* assume it is working */
@@ -3107,7 +3118,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
char nm[20];
if (rdev->raid_disk < 0)
continue;
- if (rdev->new_raid_disk > mddev->raid_disks)
+ if (rdev->new_raid_disk >= mddev->raid_disks)
rdev->new_raid_disk = -1;
if (rdev->new_raid_disk == rdev->raid_disk)
continue;
@@ -3736,6 +3747,8 @@ action_show(mddev_t *mddev, char *page)
return sprintf(page, "%s\n", type);
}
+static void reap_sync_thread(mddev_t *mddev);
+
static ssize_t
action_store(mddev_t *mddev, const char *page, size_t len)
{
@@ -3750,9 +3763,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
- mddev->recovery = 0;
+ reap_sync_thread(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3915,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
static ssize_t
sync_completed_show(mddev_t *mddev, char *page)
{
- unsigned long max_sectors, resync;
+ unsigned long long max_sectors, resync;
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
@@ -3915,7 +3926,7 @@ sync_completed_show(mddev_t *mddev, char *page)
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync_completed;
- return sprintf(page, "%lu / %lu\n", resync, max_sectors);
+ return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}
static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -4002,19 +4013,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old = mddev->suspend_lo;
if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
- if (new >= mddev->suspend_hi ||
- (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
- mddev->suspend_lo = new;
+
+ mddev->suspend_lo = new;
+ if (new >= old)
+ /* Shrinking suspended region */
mddev->pers->quiesce(mddev, 2);
- return len;
- } else
- return -EINVAL;
+ else {
+ /* Expanding suspended region - need to wait */
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ return len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4047,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old = mddev->suspend_hi;
if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
- if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
- (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
- mddev->suspend_hi = new;
+
+ mddev->suspend_hi = new;
+ if (new <= old)
+ /* Shrinking suspended region */
+ mddev->pers->quiesce(mddev, 2);
+ else {
+ /* Expanding suspended region - need to wait */
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
- return len;
- } else
- return -EINVAL;
+ }
+ return len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4422,7 +4442,9 @@ int md_run(mddev_t *mddev)
* We don't want the data to overlap the metadata,
* Internal Bitmap issues have been handled elsewhere.
*/
- if (rdev->data_offset < rdev->sb_start) {
+ if (rdev->meta_bdev) {
+ /* Nothing to check */;
+ } else if (rdev->data_offset < rdev->sb_start) {
if (mddev->dev_sectors &&
rdev->data_offset + mddev->dev_sectors
> rdev->sb_start) {
@@ -4556,7 +4578,8 @@ int md_run(mddev_t *mddev)
mddev->safemode_timer.data = (unsigned long) mddev;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
-
+ smp_wmb();
+ mddev->ready = 1;
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0) {
char nm[20];
@@ -4693,13 +4716,12 @@ static void md_clean(mddev_t *mddev)
mddev->plug = NULL;
}
-void md_stop_writes(mddev_t *mddev)
+static void __md_stop_writes(mddev_t *mddev)
{
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
+ reap_sync_thread(mddev);
}
del_timer_sync(&mddev->safemode_timer);
@@ -4713,10 +4735,18 @@ void md_stop_writes(mddev_t *mddev)
md_update_sb(mddev, 1);
}
}
+
+void md_stop_writes(mddev_t *mddev)
+{
+ mddev_lock(mddev);
+ __md_stop_writes(mddev);
+ mddev_unlock(mddev);
+}
EXPORT_SYMBOL_GPL(md_stop_writes);
void md_stop(mddev_t *mddev)
{
+ mddev->ready = 0;
mddev->pers->stop(mddev);
if (mddev->pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
@@ -4736,7 +4766,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
goto out;
}
if (mddev->pers) {
- md_stop_writes(mddev);
+ __md_stop_writes(mddev);
err = -ENXIO;
if (mddev->ro==1)
@@ -4773,7 +4803,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
if (mddev->ro)
set_disk_ro(disk, 0);
- md_stop_writes(mddev);
+ __md_stop_writes(mddev);
md_stop(mddev);
mddev->queue->merge_bvec_fn = NULL;
mddev->queue->unplug_fn = NULL;
@@ -5151,9 +5181,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
/* set saved_raid_disk if appropriate */
if (!mddev->persistent) {
if (info->state & (1<<MD_DISK_SYNC) &&
- info->raid_disk < mddev->raid_disks)
+ info->raid_disk < mddev->raid_disks) {
rdev->raid_disk = info->raid_disk;
- else
+ set_bit(In_sync, &rdev->flags);
+ } else
rdev->raid_disk = -1;
} else
super_types[mddev->major_version].
@@ -5230,7 +5261,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
printk(KERN_INFO "md: nonpersistent superblock ...\n");
rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
} else
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
rdev->sectors = rdev->sb_start;
err = bind_rdev_to_array(rdev, mddev);
@@ -5297,7 +5328,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
}
if (mddev->persistent)
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
else
rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
@@ -5510,7 +5541,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
* sb_start or, if that is <data_offset, it must fit before the size
* of each device. If num_sectors is zero, we find the largest size
* that fits.
-
*/
if (mddev->sync_thread)
return -EBUSY;
@@ -6033,7 +6063,8 @@ static int md_thread(void * arg)
|| kthread_should_stop(),
thread->timeout);
- if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags))
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+ if (!kthread_should_stop())
thread->run(thread->mddev);
}
@@ -6799,7 +6830,7 @@ void md_do_sync(mddev_t *mddev)
desc, mdname(mddev));
mddev->curr_resync = j;
}
- mddev->curr_resync_completed = mddev->curr_resync;
+ mddev->curr_resync_completed = j;
while (j < max_sectors) {
sector_t sectors;
@@ -6817,8 +6848,7 @@ void md_do_sync(mddev_t *mddev)
md_unplug(mddev);
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
- mddev->curr_resync_completed =
- mddev->curr_resync;
+ mddev->curr_resync_completed = j;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
@@ -7023,6 +7053,45 @@ static int remove_and_add_spares(mddev_t *mddev)
}
return spares;
}
+
+static void reap_sync_thread(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ if (mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
+ }
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape(mddev);
+ md_update_sb(mddev, 1);
+
+ /* if array is no-longer degraded, then any saved_raid_disk
+ * information must be scrapped
+ */
+ if (!mddev->degraded)
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev->saved_raid_disk = -1;
+
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ /* flag recovery needed just to double check */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ md_new_event(mddev);
+}
+
/*
* This routine is regularly called by all per-raid-array threads to
* deal with generic issues like resync and super-block update.
@@ -7047,9 +7116,6 @@ static int remove_and_add_spares(mddev_t *mddev)
*/
void md_check_recovery(mddev_t *mddev)
{
- mdk_rdev_t *rdev;
-
-
if (mddev->bitmap)
bitmap_daemon_work(mddev);
@@ -7117,34 +7183,7 @@ void md_check_recovery(mddev_t *mddev)
goto unlock;
}
if (mddev->sync_thread) {
- /* resync has finished, collect result */
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- /* success...*/
- /* activate any spares */
- if (mddev->pers->spare_active(mddev))
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
- }
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- mddev->pers->finish_reshape)
- mddev->pers->finish_reshape(mddev);
- md_update_sb(mddev, 1);
-
- /* if array is no-longer degraded, then any saved_raid_disk
- * information must be scrapped
- */
- if (!mddev->degraded)
- list_for_each_entry(rdev, &mddev->disks, same_set)
- rdev->saved_raid_disk = -1;
-
- mddev->recovery = 0;
- /* flag recovery needed just to double check */
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- md_new_event(mddev);
+ reap_sync_thread(mddev);
goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid
@@ -7202,7 +7241,11 @@ void md_check_recovery(mddev_t *mddev)
" thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
- mddev->recovery = 0;
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
} else
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d05bab55df4e..eec517ced31a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -60,6 +60,12 @@ struct mdk_rdev_s
mddev_t *mddev; /* RAID array if running */
int last_events; /* IO event timestamp */
+ /*
+ * If meta_bdev is non-NULL, it means that a separate device is
+ * being used to store the metadata (superblock/bitmap) which
+ * would otherwise be contained on the same device as the data (bdev).
+ */
+ struct block_device *meta_bdev;
struct block_device *bdev; /* block device handle */
struct page *sb_page;
@@ -148,7 +154,8 @@ struct mddev_s
* are happening, so run/
* takeover/stop are not safe
*/
-
+ int ready; /* See when safe to pass
+ * IO requests down */
struct gendisk *gendisk;
struct kobject kobj;
@@ -497,8 +504,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
-extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
- struct page *page, int rw);
+extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+ struct page *page, int rw, bool metadata_op);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 845cf95b612c..a23ffa397ba9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
} else
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
+ printk(KERN_ALERT
+ "md/raid1:%s: Disk failure on %s, disabling device.\n"
+ "md/raid1:%s: Operation continuing on %d devices.\n",
mdname(mddev), bdevname(rdev->bdev, b),
mdname(mddev), conf->raid_disks - mddev->degraded);
}
@@ -1364,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
*/
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- READ)) {
+ READ, false)) {
success = 1;
break;
}
@@ -1390,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
rdev = conf->mirrors[d].rdev;
atomic_add(s, &rdev->corrected_errors);
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- WRITE) == 0)
+ WRITE, false) == 0)
md_error(mddev, rdev);
}
d = start;
@@ -1405,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- READ) == 0)
+ READ, false) == 0)
md_error(mddev, rdev);
}
} else {
@@ -1488,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
- sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9,
- conf->tmppage, READ))
+ sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false))
success = 1;
else {
d++;
@@ -1514,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE)
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, WRITE, false)
== 0)
/* Well, this device is dead */
md_error(mddev, rdev);
@@ -1531,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, READ)
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false)
== 0)
/* Well, this device is dead */
md_error(mddev, rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0641674827f0..69b659544390 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
}
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
+ printk(KERN_ALERT
+ "md/raid10:%s: Disk failure on %s, disabling device.\n"
+ "md/raid10:%s: Operation continuing on %d devices.\n",
mdname(mddev), bdevname(rdev->bdev, b),
mdname(mddev), conf->raid_disks - mddev->degraded);
}
@@ -1559,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_unlock();
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
+ sect,
s<<9,
- conf->tmppage, READ);
+ conf->tmppage, READ, false);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
if (success)
@@ -1598,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
atomic_add(s, &rdev->corrected_errors);
if (sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE)
+ sect,
+ s<<9, conf->tmppage, WRITE, false)
== 0) {
/* Well, this device is dead */
printk(KERN_NOTICE
@@ -1635,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_unlock();
if (sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
+ sect,
s<<9, conf->tmppage,
- READ) == 0) {
+ READ, false) == 0) {
/* Well, this device is dead */
printk(KERN_NOTICE
"md/raid10:%s: unable to read back "
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc574f303f8b..5044babfcda0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(Faulty, &rdev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT
"md/raid:%s: Operation continuing on %d devices.\n",
mdname(mddev),
bdevname(rdev->bdev, b),
@@ -4237,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->reshape_progress;
- mddev->curr_resync_completed = mddev->curr_resync;
+ mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
@@ -4338,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes) == 0);
mddev->reshape_position = conf->reshape_progress;
- mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
+ mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
@@ -5339,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev)
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
- sysfs_notify_dirent(tmp->rdev->sysfs_state);
+ sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
@@ -5528,8 +5527,8 @@ static int raid5_start_reshape(mddev_t *mddev)
return -ENOSPC;
list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk < 0 &&
- !test_bit(Faulty, &rdev->flags))
+ if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks)
+ && !test_bit(Faulty, &rdev->flags))
spares++;
if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5589,6 +5588,11 @@ static int raid5_start_reshape(mddev_t *mddev)
/* Failure here is OK */;
} else
break;
+ } else if (rdev->raid_disk >= conf->previous_raid_disks
+ && !test_bit(Faulty, &rdev->flags)) {
+ /* This is a spare that was manually added */
+ set_bit(In_sync, &rdev->flags);
+ added_devices++;
}
/* When a reshape changes the number of devices, ->degraded
diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c
index 3892666b5fbd..2a1d52fb4936 100644
--- a/drivers/serial/atmel_serial.c
+++ b/drivers/serial/atmel_serial.c
@@ -1732,6 +1732,11 @@ static int __devinit atmel_serial_probe(struct platform_device *pdev)
device_init_wakeup(&pdev->dev, 1);
platform_set_drvdata(pdev, port);
+ if (port->rs485.flags & SER_RS485_ENABLED) {
+ UART_PUT_MR(&port->uart, ATMEL_US_USMODE_NORMAL);
+ UART_PUT_CR(&port->uart, ATMEL_US_RTSEN);
+ }
+
return 0;
err_add_port:
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 5a48ce996dea..07bec09d1dad 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -71,11 +71,18 @@ config XEN_SYS_HYPERVISOR
but will have no xen contents.
config XEN_XENBUS_FRONTEND
- tristate
+ tristate
+
+config XEN_GNTDEV
+ tristate "userspace grant access device driver"
+ depends on XEN
+ select MMU_NOTIFIER
+ help
+ Allows userspace processes to use grants.
config XEN_PLATFORM_PCI
tristate "xen platform pci device driver"
- depends on XEN_PVHVM
+ depends on XEN_PVHVM && PCI
default m
help
Driver for the Xen PCI Platform device: it is responsible for
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 533a199e7a3f..5088cc2e6fe2 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,11 +9,14 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
obj-$(CONFIG_XEN_BALLOON) += balloon.o
obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
obj-$(CONFIG_XENFS) += xenfs/
obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
-obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o
+obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pci.o
xen-evtchn-y := evtchn.o
+xen-gntdev-y := gntdev.o
+xen-platform-pci-y := platform-pci.o
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 000000000000..1e31cdcdae1e
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,665 @@
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ * (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#undef DEBUG
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+#include <xen/gntdev.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
+ "Gerd Hoffmann <kraxel@redhat.com>");
+MODULE_DESCRIPTION("User-space granted page access driver");
+
+static int limit = 1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
+ "once by a gntdev instance");
+
+struct gntdev_priv {
+ struct list_head maps;
+ uint32_t used;
+ uint32_t limit;
+ /* lock protects maps from concurrent changes */
+ spinlock_t lock;
+ struct mm_struct *mm;
+ struct mmu_notifier mn;
+};
+
+struct grant_map {
+ struct list_head next;
+ struct gntdev_priv *priv;
+ struct vm_area_struct *vma;
+ int index;
+ int count;
+ int flags;
+ int is_mapped;
+ struct ioctl_gntdev_grant_ref *grants;
+ struct gnttab_map_grant_ref *map_ops;
+ struct gnttab_unmap_grant_ref *unmap_ops;
+ struct page **pages;
+};
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_print_maps(struct gntdev_priv *priv,
+ char *text, int text_index)
+{
+#ifdef DEBUG
+ struct grant_map *map;
+
+ pr_debug("maps list (priv %p, usage %d/%d)\n",
+ priv, priv->used, priv->limit);
+
+ list_for_each_entry(map, &priv->maps, next)
+ pr_debug(" index %2d, count %2d %s\n",
+ map->index, map->count,
+ map->index == text_index && text ? text : "");
+#endif
+}
+
+static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
+{
+ struct grant_map *add;
+ int i;
+
+ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
+ if (NULL == add)
+ return NULL;
+
+ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
+ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
+ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
+ add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL);
+ if (NULL == add->grants ||
+ NULL == add->map_ops ||
+ NULL == add->unmap_ops ||
+ NULL == add->pages)
+ goto err;
+
+ for (i = 0; i < count; i++) {
+ add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (add->pages[i] == NULL)
+ goto err;
+ }
+
+ add->index = 0;
+ add->count = count;
+ add->priv = priv;
+
+ if (add->count + priv->used > priv->limit)
+ goto err;
+
+ return add;
+
+err:
+ if (add->pages)
+ for (i = 0; i < count; i++) {
+ if (add->pages[i])
+ __free_page(add->pages[i]);
+ }
+ kfree(add->pages);
+ kfree(add->grants);
+ kfree(add->map_ops);
+ kfree(add->unmap_ops);
+ kfree(add);
+ return NULL;
+}
+
+static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (add->index + add->count < map->index) {
+ list_add_tail(&add->next, &map->next);
+ goto done;
+ }
+ add->index = map->index + map->count;
+ }
+ list_add_tail(&add->next, &priv->maps);
+
+done:
+ priv->used += add->count;
+ gntdev_print_maps(priv, "[new]", add->index);
+}
+
+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
+ int index, int count)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (map->index != index)
+ continue;
+ if (map->count != count)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
+ unsigned long vaddr)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (vaddr < map->vma->vm_start)
+ continue;
+ if (vaddr >= map->vma->vm_end)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static int gntdev_del_map(struct grant_map *map)
+{
+ int i;
+
+ if (map->vma)
+ return -EBUSY;
+ for (i = 0; i < map->count; i++)
+ if (map->unmap_ops[i].handle)
+ return -EBUSY;
+
+ map->priv->used -= map->count;
+ list_del(&map->next);
+ return 0;
+}
+
+static void gntdev_free_map(struct grant_map *map)
+{
+ int i;
+
+ if (!map)
+ return;
+
+ if (map->pages)
+ for (i = 0; i < map->count; i++) {
+ if (map->pages[i])
+ __free_page(map->pages[i]);
+ }
+ kfree(map->pages);
+ kfree(map->grants);
+ kfree(map->map_ops);
+ kfree(map->unmap_ops);
+ kfree(map);
+}
+
+/* ------------------------------------------------------------------ */
+
+static int find_grant_ptes(pte_t *pte, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct grant_map *map = data;
+ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+ u64 pte_maddr;
+
+ BUG_ON(pgnr >= map->count);
+ pte_maddr = arbitrary_virt_to_machine(pte).maddr;
+
+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
+ GNTMAP_contains_pte | map->flags,
+ map->grants[pgnr].ref,
+ map->grants[pgnr].domid);
+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
+ GNTMAP_contains_pte | map->flags,
+ 0 /* handle */);
+ return 0;
+}
+
+static int map_grant_pages(struct grant_map *map)
+{
+ int i, err = 0;
+
+ pr_debug("map %d+%d\n", map->index, map->count);
+ err = gnttab_map_refs(map->map_ops, map->pages, map->count);
+ if (err)
+ return err;
+
+ for (i = 0; i < map->count; i++) {
+ if (map->map_ops[i].status)
+ err = -EINVAL;
+ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ }
+ return err;
+}
+
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+ int i, err = 0;
+
+ pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+ err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
+ if (err)
+ return err;
+
+ for (i = 0; i < pages; i++) {
+ if (map->unmap_ops[offset+i].status)
+ err = -EINVAL;
+ map->unmap_ops[offset+i].handle = 0;
+ }
+ return err;
+}
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+ struct grant_map *map = vma->vm_private_data;
+
+ pr_debug("close %p\n", vma);
+ map->is_mapped = 0;
+ map->vma = NULL;
+ vma->vm_private_data = NULL;
+}
+
+static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
+ vmf->virtual_address, vmf->pgoff);
+ vmf->flags = VM_FAULT_ERROR;
+ return 0;
+}
+
+static struct vm_operations_struct gntdev_vmops = {
+ .close = gntdev_vma_close,
+ .fault = gntdev_vma_fault,
+};
+
+/* ------------------------------------------------------------------ */
+
+static void mn_invl_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ unsigned long mstart, mend;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (!map->is_mapped)
+ continue;
+ if (map->vma->vm_start >= end)
+ continue;
+ if (map->vma->vm_end <= start)
+ continue;
+ mstart = max(start, map->vma->vm_start);
+ mend = min(end, map->vma->vm_end);
+ pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+ map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end,
+ start, end, mstart, mend);
+ err = unmap_grant_pages(map,
+ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
+ (mend - mstart) >> PAGE_SHIFT);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+static void mn_invl_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
+static void mn_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ pr_debug("map %d+%d (%lx %lx)\n",
+ map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end);
+ err = unmap_grant_pages(map, /* offset */ 0, map->count);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+struct mmu_notifier_ops gntdev_mmu_ops = {
+ .release = mn_release,
+ .invalidate_page = mn_invl_page,
+ .invalidate_range_start = mn_invl_range_start,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv;
+ int ret = 0;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&priv->maps);
+ spin_lock_init(&priv->lock);
+ priv->limit = limit;
+
+ priv->mm = get_task_mm(current);
+ if (!priv->mm) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+ priv->mn.ops = &gntdev_mmu_ops;
+ ret = mmu_notifier_register(&priv->mn, priv->mm);
+ mmput(priv->mm);
+
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
+
+ flip->private_data = priv;
+ pr_debug("priv %p\n", priv);
+
+ return 0;
+}
+
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ struct grant_map *map;
+ int err;
+
+ pr_debug("priv %p\n", priv);
+
+ spin_lock(&priv->lock);
+ while (!list_empty(&priv->maps)) {
+ map = list_entry(priv->maps.next, struct grant_map, next);
+ err = gntdev_del_map(map);
+ if (WARN_ON(err))
+ gntdev_free_map(map);
+
+ }
+ spin_unlock(&priv->lock);
+
+ mmu_notifier_unregister(&priv->mn, priv->mm);
+ kfree(priv);
+ return 0;
+}
+
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_map_grant_ref __user *u)
+{
+ struct ioctl_gntdev_map_grant_ref op;
+ struct grant_map *map;
+ int err;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, add %d\n", priv, op.count);
+ if (unlikely(op.count <= 0))
+ return -EINVAL;
+ if (unlikely(op.count > priv->limit))
+ return -EINVAL;
+
+ err = -ENOMEM;
+ map = gntdev_alloc_map(priv, op.count);
+ if (!map)
+ return err;
+ if (copy_from_user(map->grants, &u->refs,
+ sizeof(map->grants[0]) * op.count) != 0) {
+ gntdev_free_map(map);
+ return err;
+ }
+
+ spin_lock(&priv->lock);
+ gntdev_add_map(priv, map);
+ op.index = map->index << PAGE_SHIFT;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0) {
+ spin_lock(&priv->lock);
+ gntdev_del_map(map);
+ spin_unlock(&priv->lock);
+ gntdev_free_map(map);
+ return err;
+ }
+ return 0;
+}
+
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+ struct ioctl_gntdev_unmap_grant_ref op;
+ struct grant_map *map;
+ int err = -ENOENT;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+ if (map)
+ err = gntdev_del_map(map);
+ spin_unlock(&priv->lock);
+ if (!err)
+ gntdev_free_map(map);
+ return err;
+}
+
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+ struct ioctl_gntdev_get_offset_for_vaddr op;
+ struct grant_map *map;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_vaddr(priv, op.vaddr);
+ if (map == NULL ||
+ map->vma->vm_start != op.vaddr) {
+ spin_unlock(&priv->lock);
+ return -EINVAL;
+ }
+ op.offset = map->index << PAGE_SHIFT;
+ op.count = map->count;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
+ struct ioctl_gntdev_set_max_grants __user *u)
+{
+ struct ioctl_gntdev_set_max_grants op;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ pr_debug("priv %p, limit %d\n", priv, op.count);
+ if (op.count > limit)
+ return -E2BIG;
+
+ spin_lock(&priv->lock);
+ priv->limit = op.count;
+ spin_unlock(&priv->lock);
+ return 0;
+}
+
+static long gntdev_ioctl(struct file *flip,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ void __user *ptr = (void __user *)arg;
+
+ switch (cmd) {
+ case IOCTL_GNTDEV_MAP_GRANT_REF:
+ return gntdev_ioctl_map_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
+ case IOCTL_GNTDEV_SET_MAX_GRANTS:
+ return gntdev_ioctl_set_max_grants(priv, ptr);
+
+ default:
+ pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ int index = vma->vm_pgoff;
+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ struct grant_map *map;
+ int err = -EINVAL;
+
+ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ pr_debug("map %d+%d at %lx (pgoff %lx)\n",
+ index, count, vma->vm_start, vma->vm_pgoff);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, index, count);
+ if (!map)
+ goto unlock_out;
+ if (map->vma)
+ goto unlock_out;
+ if (priv->mm != vma->vm_mm) {
+ printk(KERN_WARNING "Huh? Other mm?\n");
+ goto unlock_out;
+ }
+
+ vma->vm_ops = &gntdev_vmops;
+
+ vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;
+
+ vma->vm_private_data = map;
+ map->vma = vma;
+
+ map->flags = GNTMAP_host_map | GNTMAP_application_map;
+ if (!(vma->vm_flags & VM_WRITE))
+ map->flags |= GNTMAP_readonly;
+
+ spin_unlock(&priv->lock);
+
+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start,
+ find_grant_ptes, map);
+ if (err) {
+ printk(KERN_WARNING "find_grant_ptes() failure.\n");
+ return err;
+ }
+
+ err = map_grant_pages(map);
+ if (err) {
+ printk(KERN_WARNING "map_grant_pages() failure.\n");
+ return err;
+ }
+
+ map->is_mapped = 1;
+
+ return 0;
+
+unlock_out:
+ spin_unlock(&priv->lock);
+ return err;
+}
+
+static const struct file_operations gntdev_fops = {
+ .owner = THIS_MODULE,
+ .open = gntdev_open,
+ .release = gntdev_release,
+ .mmap = gntdev_mmap,
+ .unlocked_ioctl = gntdev_ioctl
+};
+
+static struct miscdevice gntdev_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/gntdev",
+ .fops = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int __init gntdev_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&gntdev_miscdev);
+ if (err != 0) {
+ printk(KERN_ERR "Could not register gntdev device\n");
+ return err;
+ }
+ return 0;
+}
+
+static void __exit gntdev_exit(void)
+{
+ misc_deregister(&gntdev_miscdev);
+}
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* ------------------------------------------------------------------ */
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 6c4531816496..9ef54ebc1194 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -447,6 +447,52 @@ unsigned int gnttab_max_grant_frames(void)
}
EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret;
+ pte_t *pte;
+ unsigned long mfn;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < count; i++) {
+ /* m2p override only supported for GNTMAP_contains_pte mappings */
+ if (!(map_ops[i].flags & GNTMAP_contains_pte))
+ continue;
+ pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ mfn = pte_mfn(*pte);
+ ret = m2p_add_override(mfn, pages[i]);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_map_refs);
+
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < count; i++) {
+ ret = m2p_remove_override(pages[i]);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
+
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
struct gnttab_setup_table setup;
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index c01b5ddce529..afbe041f42c5 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -105,7 +105,7 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
int i, ret;
- long ioaddr, iolen;
+ long ioaddr;
long mmio_addr, mmio_len;
unsigned int max_nr_gframes;
@@ -114,7 +114,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
return i;
ioaddr = pci_resource_start(pdev, 0);
- iolen = pci_resource_len(pdev, 0);
mmio_addr = pci_resource_start(pdev, 1);
mmio_len = pci_resource_len(pdev, 1);
@@ -125,19 +124,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
goto pci_out;
}
- if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) {
- dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n",
- mmio_addr, mmio_len);
- ret = -EBUSY;
+ ret = pci_request_region(pdev, 1, DRV_NAME);
+ if (ret < 0)
goto pci_out;
- }
- if (request_region(ioaddr, iolen, DRV_NAME) == NULL) {
- dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n",
- iolen, ioaddr);
- ret = -EBUSY;
+ ret = pci_request_region(pdev, 0, DRV_NAME);
+ if (ret < 0)
goto mem_out;
- }
platform_mmio = mmio_addr;
platform_mmiolen = mmio_len;
@@ -169,9 +162,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
return 0;
out:
- release_region(ioaddr, iolen);
+ pci_release_region(pdev, 0);
mem_out:
- release_mem_region(mmio_addr, mmio_len);
+ pci_release_region(pdev, 1);
pci_out:
pci_disable_device(pdev);
return ret;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9ed476906327..d3b28abdd6aa 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -141,13 +141,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
return rc;
}
-static inode *ecryptfs_get_inode(struct inode *lower_inode,
+static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
struct super_block *sb)
{
struct inode *inode;
int rc = 0;
- lower_inode = lower_dentry->d_inode;
if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
rc = -EXDEV;
goto out;
@@ -202,7 +201,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
{
struct inode *lower_inode = lower_dentry->d_inode;
struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
- if (IS_ERR(inode)
+ if (IS_ERR(inode))
return PTR_ERR(inode);
if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
d_add(dentry, inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953aa..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
return list_entry(head, struct inode, i_wb_list);
}
-static void bdi_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
- trace_writeback_queue(bdi, work);
-
- spin_lock_bh(&bdi->wb_lock);
- list_add_tail(&work->list, &bdi->work_list);
if (bdi->wb.task) {
wake_up_process(bdi->wb.task);
} else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
* The bdi thread isn't there, wake up the forker thread which
* will create and run it.
*/
- trace_writeback_nothread(bdi, work);
wake_up_process(default_backing_dev_info.wb.task);
}
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+ struct wb_writeback_work *work)
+{
+ trace_writeback_queue(bdi, work);
+
+ spin_lock_bh(&bdi->wb_lock);
+ list_add_tail(&work->list, &bdi->work_list);
+ if (!bdi->wb.task)
+ trace_writeback_nothread(bdi, work);
+ bdi_wakeup_flusher(bdi);
spin_unlock_bh(&bdi->wb_lock);
}
static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- bool range_cyclic, bool for_background)
+ bool range_cyclic)
{
struct wb_writeback_work *work;
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->sync_mode = WB_SYNC_NONE;
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
- work->for_background = for_background;
bdi_queue_work(bdi, work);
}
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
*/
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
- __bdi_start_writeback(bdi, nr_pages, true, false);
+ __bdi_start_writeback(bdi, nr_pages, true);
}
/**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
* @bdi: the backing device to write from
*
* Description:
- * This does WB_SYNC_NONE background writeback. The IO is only
- * started when this function returns, we make no guarentees on
- * completion. Caller need not hold sb s_umount semaphore.
+ * This makes sure WB_SYNC_NONE background writeback happens. When
+ * this function returns, it is only guaranteed that for given BDI
+ * some IO is happening if we are over background dirty threshold.
+ * Caller need not hold sb s_umount semaphore.
*/
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
- __bdi_start_writeback(bdi, LONG_MAX, true, true);
+ /*
+ * We just wake up the flusher thread. It will perform background
+ * writeback as soon as there is no other work to do.
+ */
+ trace_writeback_wake_background(bdi);
+ spin_lock_bh(&bdi->wb_lock);
+ bdi_wakeup_flusher(bdi);
+ spin_unlock_bh(&bdi->wb_lock);
}
/*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
};
unsigned long oldest_jif;
long wrote = 0;
+ long write_chunk;
struct inode *inode;
if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
wbc.range_end = LLONG_MAX;
}
+ /*
+ * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+ * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+ * here avoids calling into writeback_inodes_wb() more than once.
+ *
+ * The intended call sequence for WB_SYNC_ALL writeback is:
+ *
+ * wb_writeback()
+ * __writeback_inodes_sb() <== called only once
+ * write_cache_pages() <== called once for each inode
+ * (quickly) tag currently dirty pages
+ * (maybe slowly) sync all tagged pages
+ */
+ if (wbc.sync_mode == WB_SYNC_NONE)
+ write_chunk = MAX_WRITEBACK_PAGES;
+ else
+ write_chunk = LONG_MAX;
+
wbc.wb_start = jiffies; /* livelock avoidance */
for (;;) {
/*
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
break;
/*
+ * Background writeout and kupdate-style writeback may
+ * run forever. Stop them if there is other work to do
+ * so that e.g. sync can proceed. They'll be restarted
+ * after the other works are all done.
+ */
+ if ((work->for_background || work->for_kupdate) &&
+ !list_empty(&wb->bdi->work_list))
+ break;
+
+ /*
* For background writeout, stop when we are below the
* background dirty threshold
*/
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
break;
wbc.more_io = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
writeback_inodes_wb(wb, &wbc);
trace_wbc_writeback_written(&wbc, wb->bdi);
- work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ work->nr_pages -= write_chunk - wbc.nr_to_write;
+ wrote += write_chunk - wbc.nr_to_write;
/*
* If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
/*
* Did we write something? Try for more
*/
- if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+ if (wbc.nr_to_write < write_chunk)
continue;
/*
* Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
get_nr_dirty_inodes();
}
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+ if (over_bground_thresh()) {
+
+ struct wb_writeback_work work = {
+ .nr_pages = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .for_background = 1,
+ .range_cyclic = 1,
+ };
+
+ return wb_writeback(wb, &work);
+ }
+
+ return 0;
+}
+
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
unsigned long expired;
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
+ wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
if (!bdi_has_dirty_io(bdi))
continue;
- __bdi_start_writeback(bdi, nr_pages, false, false);
+ __bdi_start_writeback(bdi, nr_pages, false);
}
rcu_read_unlock();
}
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
* @sb: the superblock
*
* This function writes and waits on any dirty inode belonging to this
- * super_block. The number of pages synced is returned.
+ * super_block.
*/
void sync_inodes_sb(struct super_block *sb)
{
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
EXPORT_SYMBOL(sync_inode);
/**
- * sync_inode - write an inode to disk
+ * sync_inode_metadata - write an inode to disk
* @inode: the inode to sync
* @wait: wait for I/O to complete.
*
- * Write an inode to disk and adjust it's dirty state after completion.
+ * Write an inode to disk and adjust its dirty state after completion.
*
* Note: only writes the actual inode, no associated data or other metadata.
*/
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io_read(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
-
- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- } while (bvec >= bio->bi_io_vec);
- bio_put(bio);
-}
-
-static void mpage_end_io_write(struct bio *bio, int err)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
- do {
- struct page *page = bvec->bv_page;
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (!uptodate){
- SetPageError(page);
- if (page->mapping)
- set_bit(AS_EIO, &page->mapping->flags);
+ if (bio_data_dir(bio) == READ) {
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } else { /* bio_data_dir(bio) == WRITE */
+ if (!uptodate) {
+ SetPageError(page);
+ if (page->mapping)
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+ end_page_writeback(page);
}
- end_page_writeback(page);
} while (bvec >= bio->bi_io_vec);
bio_put(bio);
}
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
- bio->bi_end_io = mpage_end_io_read;
- if (rw == WRITE)
- bio->bi_end_io = mpage_end_io_write;
+ bio->bi_end_io = mpage_end_io;
submit_bio(rw, bio);
return NULL;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 95b081bc9e25..64ee240f3c80 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1579,6 +1579,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
{
struct iattr attr;
int error;
+ int open_flags = 0;
dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1586,7 +1587,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
- error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
+ if ((nd->flags & LOOKUP_CREATE) != 0)
+ open_flags = nd->intent.open.flags;
+
+ error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
if (error != 0)
goto out_err;
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93f1cdd5d3d7..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
goto err_task_lock;
}
- if (oom_score_adj < task->signal->oom_score_adj &&
+ if (oom_score_adj < task->signal->oom_score_adj_min &&
!capable(CAP_SYS_RESOURCE)) {
err = -EACCES;
goto err_sighand;
@@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
atomic_dec(&task->mm->oom_disable_count);
}
task->signal->oom_score_adj = oom_score_adj;
+ if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ task->signal->oom_score_adj_min = oom_score_adj;
/*
* Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
* always attainable.
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_MEMORY_FAILURE
"HardwareCorrupted: %5lu kB\n"
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "AnonHugePages: %8lu kB\n"
+#endif
,
K(i.totalram),
K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.freeswap),
K(global_page_state(NR_FILE_DIRTY)),
K(global_page_state(NR_WRITEBACK)),
- K(global_page_state(NR_ANON_PAGES)),
+ K(global_page_state(NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+ HPAGE_PMD_NR
+#endif
+ ),
K(global_page_state(NR_FILE_MAPPED)),
K(global_page_state(NR_SHMEM)),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_MEMORY_FAILURE
,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+ HPAGE_PMD_NR)
+#endif
);
hugetlb_report_meminfo(m);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b06c674624e6..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
if (PageHuge(page))
u |= 1 << KPF_HUGE;
- u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
/*
- * Caveats on high order pages:
- * PG_buddy will only be set on the head page; SLUB/SLQB do the same
- * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+ * Caveats on high order pages: page->_count will only be set
+ * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+ * SLOB won't set PG_slab at all on compound pages.
*/
+ if (PageBuddy(page))
+ u |= 1 << KPF_BUDDY;
+
+ u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
+
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c3755bd8dd3e..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
"Anonymous: %8lu kB\n"
"Swap: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n",
+ "MMUPageSize: %8lu kB\n"
+ "Locked: %8lu kB\n",
(vma->vm_end - vma->vm_start) >> 10,
mss.resident >> 10,
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
mss.anonymous >> 10,
mss.swap >> 10,
vma_kernel_pagesize(vma) >> 10,
- vma_mmu_pagesize(vma) >> 10);
+ vma_mmu_pagesize(vma) >> 10,
+ (vma->vm_flags & VM_LOCKED) ?
+ (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 6098cae2af8e..ff5c66080c8c 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data,
/* Always use the library code for GPIO management calls,
* or when sleeping may be involved.
*/
-extern int __must_check gpio_request(unsigned gpio, const char *label);
+extern int gpio_request(unsigned gpio, const char *label);
extern void gpio_free(unsigned gpio);
-extern int __must_check gpio_direction_input(unsigned gpio);
-extern int __must_check gpio_direction_output(unsigned gpio, int value);
+extern int gpio_direction_input(unsigned gpio);
+extern int gpio_direction_output(unsigned gpio, int value);
extern int gpio_set_debounce(unsigned gpio, unsigned debounce);
@@ -192,8 +192,8 @@ struct gpio {
const char *label;
};
-extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
-extern int __must_check gpio_request_array(struct gpio *array, size_t num);
+extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
+extern int gpio_request_array(struct gpio *array, size_t num);
extern void gpio_free_array(struct gpio *array, size_t num);
#ifdef CONFIG_GPIO_SYSFS
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index 3da9e2742fa0..787abbb6d867 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -45,6 +45,9 @@
#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
+#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 6f3c6ae4fe03..f1eddf71dd0c 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -5,67 +5,108 @@
#ifdef CONFIG_MMU
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-/*
- * Largely same as above, but only sets the access flags (dirty,
- * accessed, and writable). Furthermore, we know it always gets set
- * to a "more permissive" setting, which allows most architectures
- * to optimize this. We return whether the PTE actually changed, which
- * in turn instructs the caller to do things like update__mmu_cache.
- * This used to be done in the caller, but sparc needs minor faults to
- * force that call on sun4c so we changed this macro slightly
- */
-#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
-({ \
- int __changed = !pte_same(*(__ptep), __entry); \
- if (__changed) { \
- set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \
- flush_tlb_page(__vma, __address); \
- } \
- __changed; \
-})
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
#endif
#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(__vma, __address, __ptep) \
-({ \
- pte_t __pte = *(__ptep); \
- int r = 1; \
- if (!pte_young(__pte)) \
- r = 0; \
- else \
- set_pte_at((__vma)->vm_mm, (__address), \
- (__ptep), pte_mkold(__pte)); \
- r; \
-})
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte = *ptep;
+ int r = 1;
+ if (!pte_young(pte))
+ r = 0;
+ else
+ set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
+ return r;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = *pmdp;
+ int r = 1;
+ if (!pmd_young(pmd))
+ r = 0;
+ else
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
+ return r;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ BUG();
+ return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(__vma, __address, __ptep) \
-({ \
- int __young; \
- __young = ptep_test_and_clear_young(__vma, __address, __ptep); \
- if (__young) \
- flush_tlb_page(__vma, __address); \
- __young; \
-})
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
#endif
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define ptep_get_and_clear(__mm, __address, __ptep) \
-({ \
- pte_t __pte = *(__ptep); \
- pte_clear((__mm), (__address), (__ptep)); \
- __pte; \
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte = *ptep;
+ pte_clear(mm, address, ptep);
+ return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = *pmdp;
+ pmd_clear(mm, address, pmdp);
+ return pmd;
})
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ BUG();
+ return __pmd(0);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \
-({ \
- pte_t __pte; \
- __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \
- __pte; \
-})
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep,
+ int full)
+{
+ pte_t pte;
+ pte = ptep_get_and_clear(mm, address, ptep);
+ return pte;
+}
#endif
/*
@@ -74,20 +115,25 @@
* not present, or in the process of an address space destruction.
*/
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
-#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \
-do { \
- pte_clear((__mm), (__address), (__ptep)); \
-} while (0)
+static inline void pte_clear_not_present_full(struct mm_struct *mm,
+ unsigned long address,
+ pte_t *ptep,
+ int full)
+{
+ pte_clear(mm, address, ptep);
+}
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
-#define ptep_clear_flush(__vma, __address, __ptep) \
-({ \
- pte_t __pte; \
- __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \
- flush_tlb_page(__vma, __address); \
- __pte; \
-})
+extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep);
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp);
#endif
#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
}
#endif
+#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
+{
+ pmd_t old_pmd = *pmdp;
+ set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
+{
+ BUG();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp);
+#endif
+
#ifndef __HAVE_ARCH_PTE_SAME
-#define pte_same(A,B) (pte_val(A) == pte_val(B))
+static inline int pte_same(pte_t pte_a, pte_t pte_b)
+{
+ return pte_val(pte_a) == pte_val(pte_b);
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMD_SAME
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+ return pmd_val(pmd_a) == pmd_val(pmd_b);
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+ BUG();
+ return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PAGE_TEST_DIRTY
@@ -348,6 +435,24 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size);
#endif
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return 0;
+}
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return 0;
+}
+#ifndef __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+ BUG();
+ return 0;
+}
+#endif /* __HAVE_ARCH_PMD_WRITE */
+#endif
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 5ac51552d908..dfa2ed4c0d26 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -11,6 +11,9 @@
/* The full zone was compacted */
#define COMPACT_COMPLETE 3
+#define COMPACT_MODE_DIRECT_RECLAIM 0
+#define COMPACT_MODE_KSWAPD 1
+
#ifdef CONFIG_COMPACTION
extern int sysctl_compact_memory;
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
@@ -21,7 +24,12 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
- int order, gfp_t gfp_mask, nodemask_t *mask);
+ int order, gfp_t gfp_mask, nodemask_t *mask,
+ bool sync);
+extern unsigned long compaction_suitable(struct zone *zone, int order);
+extern unsigned long compact_zone_order(struct zone *zone, int order,
+ gfp_t gfp_mask, bool sync,
+ int compact_mode);
/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6
@@ -54,7 +62,20 @@ static inline bool compaction_deferred(struct zone *zone)
#else
static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
- int order, gfp_t gfp_mask, nodemask_t *nodemask)
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+ bool sync)
+{
+ return COMPACT_CONTINUE;
+}
+
+static inline unsigned long compaction_suitable(struct zone *zone, int order)
+{
+ return COMPACT_SKIPPED;
+}
+
+static inline unsigned long compact_zone_order(struct zone *zone, int order,
+ gfp_t gfp_mask, bool sync,
+ int compact_mode)
{
return COMPACT_CONTINUE;
}
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 2970022faa63..272496d1fae4 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -193,6 +193,13 @@ struct dm_target {
char *error;
};
+/* Each target can link one of these into the table */
+struct dm_target_callbacks {
+ struct list_head list;
+ int (*congested_fn) (struct dm_target_callbacks *, int);
+ void (*unplug_fn)(struct dm_target_callbacks *);
+};
+
int dm_register_target(struct target_type *t);
void dm_unregister_target(struct target_type *t);
@@ -269,6 +276,11 @@ int dm_table_add_target(struct dm_table *t, const char *type,
sector_t start, sector_t len, char *params);
/*
+ * Target_ctr should call this if it needs to add any callbacks.
+ */
+void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
+
+/*
* Finally call this to make the table ready for use.
*/
int dm_table_complete(struct dm_table *t);
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 49eab360d5d4..78bbf47bbb96 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -44,7 +44,7 @@
* Remove a device, destroy any tables.
*
* DM_DEV_RENAME:
- * Rename a device.
+ * Rename a device or set its uuid if none was previously supplied.
*
* DM_SUSPEND:
* This performs both suspend and resume, depending which flag is
@@ -267,9 +267,9 @@ enum {
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4
-#define DM_VERSION_MINOR 18
-#define DM_VERSION_PATCHLEVEL 0
-#define DM_VERSION_EXTRA "-ioctl (2010-06-29)"
+#define DM_VERSION_MINOR 19
+#define DM_VERSION_PATCHLEVEL 1
+#define DM_VERSION_EXTRA "-ioctl (2011-01-07)"
/* Status bits */
#define DM_READONLY_FLAG (1 << 0) /* In/Out */
@@ -322,4 +322,10 @@ enum {
*/
#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */
+/*
+ * If set, rename changes the uuid not the name. Only permitted
+ * if no uuid was previously supplied: an existing uuid cannot be changed.
+ */
+#define DM_UUID_FLAG (1 << 14) /* In */
+
#endif /* _LINUX_DM_IOCTL_H */
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h
index 0c3c3a2110c4..eeace7d3ff15 100644
--- a/include/linux/dm-log-userspace.h
+++ b/include/linux/dm-log-userspace.h
@@ -370,6 +370,16 @@
#define DM_ULOG_REQUEST_TYPE(request_type) \
(DM_ULOG_REQUEST_MASK & (request_type))
+/*
+ * DM_ULOG_REQUEST_VERSION is incremented when there is a
+ * change to the way information is passed between kernel
+ * and userspace. This could be a structure change of
+ * dm_ulog_request or a change in the way requests are
+ * issued/handled. Changes are outlined here:
+ * version 1: Initial implementation
+ */
+#define DM_ULOG_REQUEST_VERSION 1
+
struct dm_ulog_request {
/*
* The local unique identifier (luid) and the universally unique
@@ -383,8 +393,9 @@ struct dm_ulog_request {
*/
uint64_t luid;
char uuid[DM_UUID_LEN];
- char padding[7]; /* Padding because DM_UUID_LEN = 129 */
+ char padding[3]; /* Padding because DM_UUID_LEN = 129 */
+ uint32_t version; /* See DM_ULOG_REQUEST_VERSION */
int32_t error; /* Used to report back processing errors */
uint32_t seq; /* Sequence number for request */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f54adfcbec9c..a3b148a91874 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -34,6 +34,7 @@ struct vm_area_struct;
#else
#define ___GFP_NOTRACK 0
#endif
+#define ___GFP_NO_KSWAPD 0x400000u
/*
* GFP bitmasks..
@@ -81,13 +82,15 @@ struct vm_area_struct;
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */
+#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
+
/*
* This may seem redundant, but it's a way of annotating false positives vs.
* allocations that simply cannot be supported (e.g. page tables).
*/
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
@@ -106,6 +109,9 @@ struct vm_area_struct;
__GFP_HARDWALL | __GFP_HIGHMEM | \
__GFP_MOVABLE)
#define GFP_IOFS (__GFP_IO | __GFP_FS)
+#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
+ __GFP_NO_KSWAPD)
#ifdef CONFIG_NUMA
#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
@@ -325,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
{
return alloc_pages_current(gfp_mask, order);
}
-extern struct page *alloc_page_vma(gfp_t gfp_mask,
+extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
struct vm_area_struct *vma, unsigned long addr);
#else
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
+#define alloc_pages_vma(gfp_mask, order, vma, addr) \
+ alloc_pages(gfp_mask, order)
#endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
+#define alloc_page_vma(gfp_mask, vma, addr) \
+ alloc_pages_vma(gfp_mask, 0, vma, addr)
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index f79d67f413e4..4b47ed96f131 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number)
return 0;
}
-static inline int __must_check gpio_request(unsigned gpio, const char *label)
+static inline int gpio_request(unsigned gpio, const char *label)
{
return -ENOSYS;
}
@@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num)
WARN_ON(1);
}
-static inline int __must_check gpio_direction_input(unsigned gpio)
+static inline int gpio_direction_input(unsigned gpio)
{
return -ENOSYS;
}
-static inline int __must_check gpio_direction_output(unsigned gpio, int value)
+static inline int gpio_direction_output(unsigned gpio, int value)
{
return -ENOSYS;
}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
new file mode 100644
index 000000000000..8e6c8c42bc3c
--- /dev/null
+++ b/include/linux/huge_mm.h
@@ -0,0 +1,179 @@
+#ifndef _LINUX_HUGE_MM_H
+#define _LINUX_HUGE_MM_H
+
+extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags);
+extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *vma);
+extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pmd_t orig_pmd);
+extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
+extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags);
+extern int zap_huge_pmd(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ pmd_t *pmd);
+extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec);
+extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, pgprot_t newprot);
+
+enum transparent_hugepage_flag {
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
+#ifdef CONFIG_DEBUG_VM
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
+#endif
+};
+
+enum page_check_address_pmd_flag {
+ PAGE_CHECK_ADDRESS_PMD_FLAG,
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
+};
+extern pmd_t *page_check_address_pmd(struct page *page,
+ struct mm_struct *mm,
+ unsigned long address,
+ enum page_check_address_pmd_flag flag);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define HPAGE_PMD_SHIFT HPAGE_SHIFT
+#define HPAGE_PMD_MASK HPAGE_MASK
+#define HPAGE_PMD_SIZE HPAGE_SIZE
+
+#define transparent_hugepage_enabled(__vma) \
+ ((transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_FLAG) || \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) && \
+ ((__vma)->vm_flags & VM_HUGEPAGE))) && \
+ !((__vma)->vm_flags & VM_NOHUGEPAGE))
+#define transparent_hugepage_defrag(__vma) \
+ ((transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) || \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \
+ (__vma)->vm_flags & VM_HUGEPAGE))
+#ifdef CONFIG_DEBUG_VM
+#define transparent_hugepage_debug_cow() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG))
+#else /* CONFIG_DEBUG_VM */
+#define transparent_hugepage_debug_cow() 0
+#endif /* CONFIG_DEBUG_VM */
+
+extern unsigned long transparent_hugepage_flags;
+extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end);
+extern int handle_pte_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, unsigned int flags);
+extern int split_huge_page(struct page *page);
+extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
+#define split_huge_page_pmd(__mm, __pmd) \
+ do { \
+ pmd_t *____pmd = (__pmd); \
+ if (unlikely(pmd_trans_huge(*____pmd))) \
+ __split_huge_page_pmd(__mm, ____pmd); \
+ } while (0)
+#define wait_split_huge_page(__anon_vma, __pmd) \
+ do { \
+ pmd_t *____pmd = (__pmd); \
+ spin_unlock_wait(&(__anon_vma)->root->lock); \
+ /* \
+ * spin_unlock_wait() is just a loop in C and so the \
+ * CPU can reorder anything around it. \
+ */ \
+ smp_mb(); \
+ BUG_ON(pmd_trans_splitting(*____pmd) || \
+ pmd_trans_huge(*____pmd)); \
+ } while (0)
+#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
+#if HPAGE_PMD_ORDER > MAX_ORDER
+#error "hugepages can't be allocated by the buddy allocator"
+#endif
+extern int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice);
+extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next);
+static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ return;
+ __vma_adjust_trans_huge(vma, start, end, adjust_next);
+}
+static inline int hpage_nr_pages(struct page *page)
+{
+ if (unlikely(PageTransHuge(page)))
+ return HPAGE_PMD_NR;
+ return 1;
+}
+static inline struct page *compound_trans_head(struct page *page)
+{
+ if (PageTail(page)) {
+ struct page *head;
+ head = page->first_page;
+ smp_rmb();
+ /*
+ * head may be a dangling pointer.
+ * __split_huge_page_refcount clears PageTail before
+ * overwriting first_page, so if PageTail is still
+ * there it means the head pointer isn't dangling.
+ */
+ if (PageTail(page))
+ return head;
+ }
+ return page;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+#define HPAGE_PMD_SHIFT ({ BUG(); 0; })
+#define HPAGE_PMD_MASK ({ BUG(); 0; })
+#define HPAGE_PMD_SIZE ({ BUG(); 0; })
+
+#define hpage_nr_pages(x) 1
+
+#define transparent_hugepage_enabled(__vma) 0
+
+#define transparent_hugepage_flags 0UL
+static inline int split_huge_page(struct page *page)
+{
+ return 0;
+}
+#define split_huge_page_pmd(__mm, __pmd) \
+ do { } while (0)
+#define wait_split_huge_page(__anon_vma, __pmd) \
+ do { } while (0)
+#define compound_trans_head(page) compound_head(page)
+static inline int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ BUG();
+ return 0;
+}
+static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 979c68cc7458..6a64c6fa81af 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -57,7 +57,7 @@ struct irq_desc {
#endif
struct timer_rand_state *timer_rand_state;
- unsigned int *kstat_irqs;
+ unsigned int __percpu *kstat_irqs;
irq_flow_handler_t handle_irq;
struct irqaction *action; /* IRQ action list */
unsigned int status; /* IRQ status */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 57dac7022b63..5a9d9059520b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -600,6 +600,13 @@ struct sysinfo {
#define NUMA_BUILD 0
#endif
+/* This helps us avoid #ifdef CONFIG_COMPACTION */
+#ifdef CONFIG_COMPACTION
+#define COMPACTION_BUILD 1
+#else
+#define COMPACTION_BUILD 0
+#endif
+
/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44e83ba12b5b..0cce2db580c3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
extern unsigned long long nr_context_switches(void);
#ifndef CONFIG_GENERIC_HARDIRQS
-#define kstat_irqs_this_cpu(irq) \
- (this_cpu_read(kstat.irqs[irq])
struct irq_desc;
static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
struct irq_desc *desc)
{
- kstat_this_cpu.irqs[irq]++;
- kstat_this_cpu.irqs_sum++;
+ __this_cpu_inc(kstat.irqs[irq]);
+ __this_cpu_inc(kstat.irqs_sum);
}
static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
@@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
#else
#include <linux/irq.h>
extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
-#define kstat_irqs_this_cpu(DESC) \
- ((DESC)->kstat_irqs[smp_processor_id()])
-#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\
- ((DESC)->kstat_irqs[smp_processor_id()]++);\
- kstat_this_cpu.irqs_sum++; } while (0)
+
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+do { \
+ __this_cpu_inc(*(DESC)->kstat_irqs); \
+ __this_cpu_inc(kstat.irqs_sum); \
+} while (0)
#endif
static inline void kstat_incr_softirqs_this_cpu(unsigned int irq)
{
- kstat_this_cpu.softirqs[irq]++;
+ __this_cpu_inc(kstat.softirqs[irq]);
}
static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
new file mode 100644
index 000000000000..6b394f0b5148
--- /dev/null
+++ b/include/linux/khugepaged.h
@@ -0,0 +1,67 @@
+#ifndef _LINUX_KHUGEPAGED_H
+#define _LINUX_KHUGEPAGED_H
+
+#include <linux/sched.h> /* MMF_VM_HUGEPAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern int __khugepaged_enter(struct mm_struct *mm);
+extern void __khugepaged_exit(struct mm_struct *mm);
+extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
+
+#define khugepaged_enabled() \
+ (transparent_hugepage_flags & \
+ ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
+#define khugepaged_always() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_FLAG))
+#define khugepaged_req_madv() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
+#define khugepaged_defrag() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
+
+static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
+ return __khugepaged_enter(mm);
+ return 0;
+}
+
+static inline void khugepaged_exit(struct mm_struct *mm)
+{
+ if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
+ __khugepaged_exit(mm);
+}
+
+static inline int khugepaged_enter(struct vm_area_struct *vma)
+{
+ if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
+ if ((khugepaged_always() ||
+ (khugepaged_req_madv() &&
+ vma->vm_flags & VM_HUGEPAGE)) &&
+ !(vma->vm_flags & VM_NOHUGEPAGE))
+ if (__khugepaged_enter(vma->vm_mm))
+ return -ENOMEM;
+ return 0;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ return 0;
+}
+static inline void khugepaged_exit(struct mm_struct *mm)
+{
+}
+static inline int khugepaged_enter(struct vm_area_struct *vma)
+{
+ return 0;
+}
+static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+ return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#endif /* _LINUX_KHUGEPAGED_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 159a0762aeaf..6a576f989437 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,6 +25,11 @@ struct page_cgroup;
struct page;
struct mm_struct;
+/* Stats that can be updated by kernel. */
+enum mem_cgroup_page_stat_item {
+ MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+};
+
extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
@@ -93,7 +98,7 @@ extern int
mem_cgroup_prepare_migration(struct page *page,
struct page *newpage, struct mem_cgroup **ptr);
extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage);
+ struct page *oldpage, struct page *newpage, bool migration_ok);
/*
* For memory reclaim.
@@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void)
return false;
}
-void mem_cgroup_update_file_mapped(struct page *page, int val);
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx,
+ int val);
+
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx)
+{
+ mem_cgroup_update_page_stat(page, idx, 1);
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx)
+{
+ mem_cgroup_update_page_stat(page, idx, -1);
+}
+
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask);
u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
@@ -231,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
}
static inline void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage,
- struct page *newpage)
+ struct page *oldpage, struct page *newpage, bool migration_ok)
{
}
@@ -293,8 +312,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}
-static inline void mem_cgroup_update_file_mapped(struct page *page,
- int val)
+static inline void mem_cgroup_inc_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx)
+{
+}
+
+static inline void mem_cgroup_dec_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx)
{
}
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 31c237a00c48..24376fe7ee68 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -13,12 +13,16 @@ struct mem_section;
#ifdef CONFIG_MEMORY_HOTPLUG
/*
- * Types for free bootmem.
- * The normal smallest mapcount is -1. Here is smaller value than it.
+ * Types for free bootmem stored in page->lru.next. These have to be in
+ * some random range in unsigned long space for debugging purposes.
*/
-#define SECTION_INFO (-1 - 1)
-#define MIX_SECTION_INFO (-1 - 2)
-#define NODE_INFO (-1 - 3)
+enum {
+ MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
+ SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
+ MIX_SECTION_INFO,
+ NODE_INFO,
+ MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
+};
/*
* pgdat resizing functions
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 085527fb8261..e39aeecfe9a2 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l);
extern int migrate_page(struct address_space *,
struct page *, struct page *);
extern int migrate_pages(struct list_head *l, new_page_t x,
- unsigned long private, int offlining);
+ unsigned long private, bool offlining,
+ bool sync);
extern int migrate_huge_pages(struct list_head *l, new_page_t x,
- unsigned long private, int offlining);
+ unsigned long private, bool offlining,
+ bool sync);
extern int fail_migrate_page(struct address_space *,
struct page *, struct page *);
@@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
static inline void putback_lru_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t x,
- unsigned long private, int offlining) { return -ENOSYS; }
+ unsigned long private, bool offlining,
+ bool sync) { return -ENOSYS; }
static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
- unsigned long private, int offlining) { return -ENOSYS; }
+ unsigned long private, bool offlining,
+ bool sync) { return -ENOSYS; }
static inline int migrate_prep(void) { return -ENOSYS; }
static inline int migrate_prep_local(void) { return -ENOSYS; }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 721f451c3029..956a35532f47 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -14,6 +14,7 @@
#include <linux/mm_types.h>
#include <linux/range.h>
#include <linux/pfn.h>
+#include <linux/bit_spinlock.h>
struct mempolicy;
struct anon_vma;
@@ -82,6 +83,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_GROWSUP 0x00000200
#else
#define VM_GROWSUP 0x00000000
+#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */
#endif
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
@@ -101,7 +103,11 @@ extern unsigned int kobjsize(const void *objp);
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
+#else
+#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */
+#endif
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */
@@ -242,6 +248,7 @@ struct inode;
* files which need it (119 of them)
*/
#include <linux/page-flags.h>
+#include <linux/huge_mm.h>
/*
* Methods to modify the page usage count.
@@ -305,6 +312,39 @@ static inline int is_vmalloc_or_module_addr(const void *x)
}
#endif
+static inline void compound_lock(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bit_spin_lock(PG_compound_lock, &page->flags);
+#endif
+}
+
+static inline void compound_unlock(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bit_spin_unlock(PG_compound_lock, &page->flags);
+#endif
+}
+
+static inline unsigned long compound_lock_irqsave(struct page *page)
+{
+ unsigned long uninitialized_var(flags);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ local_irq_save(flags);
+ compound_lock(page);
+#endif
+ return flags;
+}
+
+static inline void compound_unlock_irqrestore(struct page *page,
+ unsigned long flags)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ compound_unlock(page);
+ local_irq_restore(flags);
+#endif
+}
+
static inline struct page *compound_head(struct page *page)
{
if (unlikely(PageTail(page)))
@@ -319,9 +359,29 @@ static inline int page_count(struct page *page)
static inline void get_page(struct page *page)
{
- page = compound_head(page);
- VM_BUG_ON(atomic_read(&page->_count) == 0);
+ /*
+ * Getting a normal page or the head of a compound page
+ * requires to already have an elevated page->_count. Only if
+ * we're getting a tail page, the elevated page->_count is
+ * required only in the head page, so for tail pages the
+ * bugcheck only verifies that the page->_count isn't
+ * negative.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
atomic_inc(&page->_count);
+ /*
+ * Getting a tail page will elevate both the head and tail
+ * page->_count(s).
+ */
+ if (unlikely(PageTail(page))) {
+ /*
+ * This is safe only because
+ * __split_huge_page_refcount can't run under
+ * get_page().
+ */
+ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+ atomic_inc(&page->first_page->_count);
+ }
}
static inline struct page *virt_to_head_page(const void *x)
@@ -339,6 +399,27 @@ static inline void init_page_count(struct page *page)
atomic_set(&page->_count, 1);
}
+/*
+ * PageBuddy() indicate that the page is free and in the buddy system
+ * (see mm/page_alloc.c).
+ */
+static inline int PageBuddy(struct page *page)
+{
+ return atomic_read(&page->_mapcount) == -2;
+}
+
+static inline void __SetPageBuddy(struct page *page)
+{
+ VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
+ atomic_set(&page->_mapcount, -2);
+}
+
+static inline void __ClearPageBuddy(struct page *page)
+{
+ VM_BUG_ON(!PageBuddy(page));
+ atomic_set(&page->_mapcount, -1);
+}
+
void put_page(struct page *page);
void put_pages_list(struct list_head *pages);
@@ -370,12 +451,39 @@ static inline int compound_order(struct page *page)
return (unsigned long)page[1].lru.prev;
}
+static inline int compound_trans_order(struct page *page)
+{
+ int order;
+ unsigned long flags;
+
+ if (!PageHead(page))
+ return 0;
+
+ flags = compound_lock_irqsave(page);
+ order = compound_order(page);
+ compound_unlock_irqrestore(page, flags);
+ return order;
+}
+
static inline void set_compound_order(struct page *page, unsigned long order)
{
page[1].lru.prev = (void *)order;
}
/*
+ * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
+ * servicing faults for write access. In the normal case, do always want
+ * pte_mkwrite. But get_user_pages can cause write faults for mappings
+ * that do not have writing enabled, when used by access_process_vm.
+ */
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_flags & VM_WRITE))
+ pte = pte_mkwrite(pte);
+ return pte;
+}
+
+/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
* zeroes, and text pages of executables and shared libraries have
@@ -657,7 +765,7 @@ static inline struct address_space *page_mapping(struct page *page)
VM_BUG_ON(PageSlab(page));
if (unlikely(PageSwapCache(page)))
mapping = &swapper_space;
- else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
+ else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
mapping = NULL;
return mapping;
}
@@ -1064,7 +1172,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
#endif
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long address);
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
/*
@@ -1133,16 +1242,18 @@ static inline void pgtable_page_dtor(struct page *page)
pte_unmap(pte); \
} while (0)
-#define pte_alloc_map(mm, pmd, address) \
- ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
- NULL: pte_offset_map(pmd, address))
+#define pte_alloc_map(mm, vma, pmd, address) \
+ ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \
+ pmd, address))? \
+ NULL: pte_offset_map(pmd, address))
#define pte_alloc_map_lock(mm, pmd, address, ptlp) \
- ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+ ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \
+ pmd, address))? \
NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
#define pte_alloc_kernel(pmd, address) \
- ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
+ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
NULL: pte_offset_kernel(pmd, address))
extern void free_area_init(unsigned long * zones_size);
@@ -1415,6 +1526,8 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
#define FOLL_GET 0x04 /* do get_page on page */
#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
+#define FOLL_MLOCK 0x40 /* mark page as mlocked */
+#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
@@ -1518,5 +1631,14 @@ static inline int is_hwpoison_address(unsigned long addr)
extern void dump_page(struct page *page);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+extern void clear_huge_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page);
+extern void copy_user_huge_page(struct page *dst, struct page *src,
+ unsigned long addr, struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8835b877b8db..8f7d24712dc1 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,6 +1,8 @@
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H
+#include <linux/huge_mm.h>
+
/**
* page_is_file_cache - should the page be on a file LRU or anon LRU?
* @page: the page to test
@@ -20,18 +22,25 @@ static inline int page_is_file_cache(struct page *page)
}
static inline void
-add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l,
+ struct list_head *head)
{
- list_add(&page->lru, &zone->lru[l].list);
- __inc_zone_state(zone, NR_LRU_BASE + l);
+ list_add(&page->lru, head);
+ __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
mem_cgroup_add_lru_list(page, l);
}
static inline void
+add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+{
+ __add_page_to_lru_list(zone, page, l, &zone->lru[l].list);
+}
+
+static inline void
del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
{
list_del(&page->lru);
- __dec_zone_state(zone, NR_LRU_BASE + l);
+ __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
mem_cgroup_del_lru_list(page, l);
}
@@ -66,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
l += LRU_ACTIVE;
}
}
- __dec_zone_state(zone, NR_LRU_BASE + l);
+ __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
mem_cgroup_del_lru_list(page, l);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bb7288a782fd..26bc4e2cd275 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -310,6 +310,9 @@ struct mm_struct {
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ pgtable_t pmd_huge_pte; /* protected by page_table_lock */
+#endif
/* How many tasks sharing this mm are OOM_DISABLE */
atomic_t oom_disable_count;
};
diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h
index bf173502d744..38d393092812 100644
--- a/include/linux/mmc/sh_mmcif.h
+++ b/include/linux/mmc/sh_mmcif.h
@@ -94,12 +94,12 @@ struct sh_mmcif_plat_data {
static inline u32 sh_mmcif_readl(void __iomem *addr, int reg)
{
- return readl(addr + reg);
+ return __raw_readl(addr + reg);
}
static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val)
{
- writel(val, addr + reg);
+ __raw_writel(val, addr + reg);
}
#define SH_MMCIF_BBS 512 /* boot block size */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 43dcfbdc39de..cc2e7dfea9d7 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -62,6 +62,16 @@ struct mmu_notifier_ops {
unsigned long address);
/*
+ * test_young is called to check the young/accessed bitflag in
+ * the secondary pte. This is used to know if the page is
+ * frequently used without actually clearing the flag or tearing
+ * down the secondary mapping on the page.
+ */
+ int (*test_young)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+
+ /*
* change_pte is called in cases that pte mapping to page is changed:
* for example, when ksm remaps pte to point to a new shared page.
*/
@@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long address);
+extern int __mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte);
extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
return 0;
}
+static inline int mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ if (mm_has_notifiers(mm))
+ return __mmu_notifier_test_young(mm, address);
+ return 0;
+}
+
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte)
{
@@ -243,6 +263,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
__pte; \
})
+#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \
+({ \
+ pmd_t __pmd; \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \
+ mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \
+ (__address)+HPAGE_PMD_SIZE);\
+ __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \
+ mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \
+ (__address)+HPAGE_PMD_SIZE); \
+ __pmd; \
+})
+
+#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \
+({ \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \
+ mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \
+ (__address)+HPAGE_PMD_SIZE);\
+ pmdp_splitting_flush(___vma, ___address, __pmdp); \
+ mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \
+ (__address)+HPAGE_PMD_SIZE); \
+})
+
#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
({ \
int __young; \
@@ -254,6 +300,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
__young; \
})
+#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \
+({ \
+ int __young; \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \
+ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
+ ___address); \
+ __young; \
+})
+
#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
({ \
struct mm_struct *___mm = __mm; \
@@ -276,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
return 0;
}
+static inline int mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ return 0;
+}
+
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte)
{
@@ -305,7 +368,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
}
#define ptep_clear_flush_young_notify ptep_clear_flush_young
+#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_flush_notify ptep_clear_flush
+#define pmdp_clear_flush_notify pmdp_clear_flush
+#define pmdp_splitting_flush_notify pmdp_splitting_flush
#define set_pte_at_notify set_pte_at
#endif /* CONFIG_MMU_NOTIFIER */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39c24ebe9cfd..02ecb0189b1d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,6 +114,7 @@ enum zone_stat_item {
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
#endif
+ NR_ANON_TRANSPARENT_HUGEPAGES,
NR_VM_ZONE_STAT_ITEMS };
/*
@@ -458,12 +459,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
return test_bit(ZONE_OOM_LOCKED, &zone->flags);
}
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
/*
* The "priority" of VM scanning is how much of the queues we will scan in one
* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -645,6 +640,7 @@ typedef struct pglist_data {
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
int kswapd_max_order;
+ enum zone_type classzone_idx;
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -660,8 +656,10 @@ typedef struct pglist_data {
extern struct mutex zonelists_mutex;
void build_all_zonelists(void *data);
-void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags);
enum memmap_context {
MEMMAP_EARLY,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5f38c460367e..0db8037e2725 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -48,9 +48,6 @@
* struct page (these bits with information) are always mapped into kernel
* address space...
*
- * PG_buddy is set to indicate that the page is free and in the buddy system
- * (see mm/page_alloc.c).
- *
* PG_hwpoison indicates that a page got corrupted in hardware and contains
* data with incorrect ECC bits that triggered a machine check. Accessing is
* not safe since it may cause another machine check. Don't touch!
@@ -96,7 +93,6 @@ enum pageflags {
PG_swapcache, /* Swap page: swp_entry_t in private */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
- PG_buddy, /* Page is free, on buddy lists */
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
#ifdef CONFIG_MMU
@@ -108,6 +104,9 @@ enum pageflags {
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ PG_compound_lock,
+#endif
__NR_PAGEFLAGS,
/* Filesystems */
@@ -198,7 +197,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
struct page; /* forward declaration */
TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
-PAGEFLAG(Error, error)
+PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
@@ -230,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
* risky: they bypass page accounting.
*/
TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-__PAGEFLAG(Buddy, buddy)
PAGEFLAG(MappedToDisk, mappedtodisk)
/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
@@ -344,7 +342,7 @@ static inline void set_page_writeback(struct page *page)
* tests can be used in performance sensitive paths. PageCompound is
* generally not used in hot code paths.
*/
-__PAGEFLAG(Head, head)
+__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
__PAGEFLAG(Tail, tail)
static inline int PageCompound(struct page *page)
@@ -352,6 +350,13 @@ static inline int PageCompound(struct page *page)
return page->flags & ((1L << PG_head) | (1L << PG_tail));
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void ClearPageCompound(struct page *page)
+{
+ BUG_ON(!PageHead(page));
+ ClearPageHead(page);
+}
+#endif
#else
/*
* Reduce page flag use as much as possible by overlapping
@@ -389,14 +394,61 @@ static inline void __ClearPageTail(struct page *page)
page->flags &= ~PG_head_tail_mask;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void ClearPageCompound(struct page *page)
+{
+ BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound));
+ clear_bit(PG_compound, &page->flags);
+}
+#endif
+
#endif /* !PAGEFLAGS_EXTENDED */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for
+ * normal or transparent huge pages.
+ *
+ * PageTransHuge() returns true for both transparent huge and
+ * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
+ * called only in the core VM paths where hugetlbfs pages can't exist.
+ */
+static inline int PageTransHuge(struct page *page)
+{
+ VM_BUG_ON(PageTail(page));
+ return PageHead(page);
+}
+
+static inline int PageTransCompound(struct page *page)
+{
+ return PageCompound(page);
+}
+
+#else
+
+static inline int PageTransHuge(struct page *page)
+{
+ return 0;
+}
+
+static inline int PageTransCompound(struct page *page)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_MMU
#define __PG_MLOCKED (1 << PG_mlocked)
#else
#define __PG_MLOCKED 0
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define __PG_COMPOUND_LOCK (1 << PG_compound_lock)
+#else
+#define __PG_COMPOUND_LOCK 0
+#endif
+
/*
* Flags checked when a page is freed. Pages being freed should not have
* these flags set. It they are, there is a problem.
@@ -404,9 +456,10 @@ static inline void __ClearPageTail(struct page *page)
#define PAGE_FLAGS_CHECK_AT_FREE \
(1 << PG_lru | 1 << PG_locked | \
1 << PG_private | 1 << PG_private_2 | \
- 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \
+ 1 << PG_writeback | 1 << PG_reserved | \
1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
- 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
+ 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
+ __PG_COMPOUND_LOCK)
/*
* Flags checked when a page is prepped for return by the page allocator.
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index b02195dfc1b0..5b0c971d7cae 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -35,12 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page);
enum {
/* flags for mem_cgroup */
- PCG_LOCK, /* page cgroup is locked */
+ PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
- PCG_ACCT_LRU, /* page has been accounted for */
- PCG_FILE_MAPPED, /* page is accounted as "mapped" */
PCG_MIGRATION, /* under page migration */
+ /* flags for mem_cgroup and file and I/O status */
+ PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
+ PCG_FILE_MAPPED, /* page is accounted as "mapped" */
+ PCG_FILE_DIRTY, /* page is dirty */
+ PCG_FILE_WRITEBACK, /* page is under writeback */
+ PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */
+ /* No lock in page_cgroup */
+ PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
};
#define TESTPCGFLAG(uname, lname) \
@@ -59,6 +65,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
{ return test_and_clear_bit(PCG_##lname, &pc->flags); }
+#define TESTSETPCGFLAG(uname, lname) \
+static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \
+ { return test_and_set_bit(PCG_##lname, &pc->flags); }
+
/* Cache flag is set only once (at allocation) */
TESTPCGFLAG(Cache, CACHE)
CLEARPCGFLAG(Cache, CACHE)
@@ -78,6 +88,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED)
CLEARPCGFLAG(FileMapped, FILE_MAPPED)
TESTPCGFLAG(FileMapped, FILE_MAPPED)
+SETPCGFLAG(FileDirty, FILE_DIRTY)
+CLEARPCGFLAG(FileDirty, FILE_DIRTY)
+TESTPCGFLAG(FileDirty, FILE_DIRTY)
+TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY)
+TESTSETPCGFLAG(FileDirty, FILE_DIRTY)
+
+SETPCGFLAG(FileWriteback, FILE_WRITEBACK)
+CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK)
+TESTPCGFLAG(FileWriteback, FILE_WRITEBACK)
+
+SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS)
+
SETPCGFLAG(Migration, MIGRATION)
CLEARPCGFLAG(Migration, MIGRATION)
TESTPCGFLAG(Migration, MIGRATION)
@@ -94,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
static inline void lock_page_cgroup(struct page_cgroup *pc)
{
+ /*
+ * Don't take this lock in IRQ context.
+ * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION
+ */
bit_spin_lock(PCG_LOCK, &pc->flags);
}
@@ -107,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc)
return bit_spin_is_locked(PCG_LOCK, &pc->flags);
}
+static inline void move_lock_page_cgroup(struct page_cgroup *pc,
+ unsigned long *flags)
+{
+ /*
+ * We know updates to pc->flags of page cache's stats are from both of
+ * usual context or IRQ context. Disable IRQ to avoid deadlock.
+ */
+ local_irq_save(*flags);
+ bit_spin_lock(PCG_MOVE_LOCK, &pc->flags);
+}
+
+static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
+ unsigned long *flags)
+{
+ bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
+ local_irq_restore(*flags);
+}
+
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2d1ffe3cf1ee..9c66e994540f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping)
static inline int mapping_unevictable(struct address_space *mapping)
{
- if (likely(mapping))
+ if (mapping)
return test_bit(AS_UNEVICTABLE, &mapping->flags);
return !!mapping;
}
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index ab2baa5c4884..23241c2fecce 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slot(void **pslot)
}
/**
+ * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held
+ * @pslot: pointer to slot, returned by radix_tree_lookup_slot
+ * Returns: item that was stored in that slot with any direct pointer flag
+ * removed.
+ *
+ * Similar to radix_tree_deref_slot but only used during migration when a pages
+ * mapping is being moved. The caller does not hold the RCU read lock but it
+ * must hold the tree lock to prevent parallel updates.
+ */
+static inline void *radix_tree_deref_slot_protected(void **pslot,
+ spinlock_t *treelock)
+{
+ return rcu_dereference_protected(*pslot, lockdep_is_held(treelock));
+}
+
+/**
* radix_tree_deref_retry - check radix_tree_deref_slot
* @arg: pointer returned by radix_tree_deref_slot
* Returns: 0 if retry is not required, otherwise retry is required
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bb83c0da2071..e9fd04ca1e51 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -198,6 +198,8 @@ enum ttu_flags {
};
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
+bool is_vma_temporary_stack(struct vm_area_struct *vma);
+
int try_to_unmap(struct page *, enum ttu_flags flags);
int try_to_unmap_one(struct page *, struct vm_area_struct *,
unsigned long address, enum ttu_flags flags);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 96e23215e276..d747f948b34e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -21,7 +21,8 @@
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
-#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
+/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
+ and is now available for re-use. */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
@@ -433,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm);
#endif
/* leave room for more dump flags */
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
+#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
@@ -633,6 +635,8 @@ struct signal_struct {
int oom_adj; /* OOM kill score adjustment (bit shift) */
int oom_score_adj; /* OOM kill score adjustment */
+ int oom_score_adj_min; /* OOM kill score adjustment minimum value.
+ * Only settable by CAP_SYS_RESOURCE. */
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
diff --git a/include/linux/swap.h b/include/linux/swap.h
index eba53e71d2cc..4d559325d919 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void);
/* linux/mm/swap.c */
extern void __lru_cache_add(struct page *, enum lru_list lru);
extern void lru_cache_add_lru(struct page *, enum lru_list lru);
+extern void lru_add_page_tail(struct zone* zone,
+ struct page *page, struct page *page_tail);
extern void activate_page(struct page *);
extern void mark_page_accessed(struct page *);
extern void lru_add_drain(void);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 44b54f619ac6..4ed6fcd6b726 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size);
extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size);
extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
-extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot);
+extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, int node, void *caller);
extern void vfree(const void *addr);
extern void *vmap(struct page **pages, unsigned int count,
@@ -90,9 +91,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
unsigned long flags,
unsigned long start, unsigned long end,
void *caller);
-extern struct vm_struct *get_vm_area_node(unsigned long size,
- unsigned long flags, int node,
- gfp_t gfp_mask);
extern struct vm_struct *remove_vm_area(const void *addr);
extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
@@ -120,7 +118,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
#ifdef CONFIG_SMP
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
- size_t align, gfp_t gfp_mask);
+ size_t align);
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
#endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37b3b75..833e676d6d92 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
void refresh_cpu_vm_stats(int);
+
+int calculate_pressure_threshold(struct zone *zone);
+int calculate_normal_threshold(struct zone *zone);
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */
/*
@@ -298,6 +303,8 @@ static inline void __dec_zone_page_state(struct page *page,
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state
+#define set_pgdat_percpu_threshold(pgdat, callback) { }
+
static inline void refresh_cpu_vm_stats(int cpu) { }
#endif
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
new file mode 100644
index 000000000000..388bcdd26d46
--- /dev/null
+++ b/include/trace/events/compaction.h
@@ -0,0 +1,74 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM compaction
+
+#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_COMPACTION_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+#include "gfpflags.h"
+
+DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
+
+ TP_PROTO(unsigned long nr_scanned,
+ unsigned long nr_taken),
+
+ TP_ARGS(nr_scanned, nr_taken),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr_scanned)
+ __field(unsigned long, nr_taken)
+ ),
+
+ TP_fast_assign(
+ __entry->nr_scanned = nr_scanned;
+ __entry->nr_taken = nr_taken;
+ ),
+
+ TP_printk("nr_scanned=%lu nr_taken=%lu",
+ __entry->nr_scanned,
+ __entry->nr_taken)
+);
+
+DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages,
+
+ TP_PROTO(unsigned long nr_scanned,
+ unsigned long nr_taken),
+
+ TP_ARGS(nr_scanned, nr_taken)
+);
+
+DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
+ TP_PROTO(unsigned long nr_scanned,
+ unsigned long nr_taken),
+
+ TP_ARGS(nr_scanned, nr_taken)
+);
+
+TRACE_EVENT(mm_compaction_migratepages,
+
+ TP_PROTO(unsigned long nr_migrated,
+ unsigned long nr_failed),
+
+ TP_ARGS(nr_migrated, nr_failed),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr_migrated)
+ __field(unsigned long, nr_failed)
+ ),
+
+ TP_fast_assign(
+ __entry->nr_migrated = nr_migrated;
+ __entry->nr_failed = nr_failed;
+ ),
+
+ TP_printk("nr_migrated=%lu nr_failed=%lu",
+ __entry->nr_migrated,
+ __entry->nr_failed)
+);
+
+
+#endif /* _TRACE_COMPACTION_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index c255fcc587bf..ea422aaa23e1 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -25,13 +25,13 @@
#define trace_reclaim_flags(page, sync) ( \
(page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
- (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
+ (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
)
#define trace_shrink_flags(file, sync) ( \
- (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \
+ (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \
(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \
- (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
+ (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
)
TRACE_EVENT(mm_vmscan_kswapd_sleep,
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 89a2b2db4375..4e249b927eaa 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \
TP_ARGS(bdi))
DEFINE_WRITEBACK_EVENT(writeback_nowork);
+DEFINE_WRITEBACK_EVENT(writeback_wake_background);
DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
new file mode 100644
index 000000000000..eb23f4188f5a
--- /dev/null
+++ b/include/xen/gntdev.h
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * gntdev.h
+ *
+ * Interface to /dev/xen/gntdev.
+ *
+ * Copyright (c) 2007, D G Murray
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_GNTDEV_H__
+#define __LINUX_PUBLIC_GNTDEV_H__
+
+struct ioctl_gntdev_grant_ref {
+ /* The domain ID of the grant to be mapped. */
+ uint32_t domid;
+ /* The grant reference of the grant to be mapped. */
+ uint32_t ref;
+};
+
+/*
+ * Inserts the grant references into the mapping table of an instance
+ * of gntdev. N.B. This does not perform the mapping, which is deferred
+ * until mmap() is called with @index as the offset.
+ */
+#define IOCTL_GNTDEV_MAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
+struct ioctl_gntdev_map_grant_ref {
+ /* IN parameters */
+ /* The number of grants to be mapped. */
+ uint32_t count;
+ uint32_t pad;
+ /* OUT parameters */
+ /* The offset to be used on a subsequent call to mmap(). */
+ uint64_t index;
+ /* Variable IN parameter. */
+ /* Array of grant references, of size @count. */
+ struct ioctl_gntdev_grant_ref refs[1];
+};
+
+/*
+ * Removes the grant references from the mapping table of an instance of
+ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
+ * before this ioctl is called, or an error will result.
+ */
+#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
+struct ioctl_gntdev_unmap_grant_ref {
+ /* IN parameters */
+ /* The offset was returned by the corresponding map operation. */
+ uint64_t index;
+ /* The number of pages to be unmapped. */
+ uint32_t count;
+ uint32_t pad;
+};
+
+/*
+ * Returns the offset in the driver's address space that corresponds
+ * to @vaddr. This can be used to perform a munmap(), followed by an
+ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
+ * the caller. The number of pages that were allocated at the same time as
+ * @vaddr is returned in @count.
+ *
+ * N.B. Where more than one page has been mapped into a contiguous range, the
+ * supplied @vaddr must correspond to the start of the range; otherwise
+ * an error will result. It is only possible to munmap() the entire
+ * contiguously-allocated range at once, and not any subrange thereof.
+ */
+#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
+_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
+struct ioctl_gntdev_get_offset_for_vaddr {
+ /* IN parameters */
+ /* The virtual address of the first mapped page in a range. */
+ uint64_t vaddr;
+ /* OUT parameters */
+ /* The offset that was used in the initial mmap() operation. */
+ uint64_t offset;
+ /* The number of pages mapped in the VM area that begins at @vaddr. */
+ uint32_t count;
+ uint32_t pad;
+};
+
+/*
+ * Sets the maximum number of grants that may mapped at once by this gntdev
+ * instance.
+ *
+ * N.B. This must be called before any other ioctl is performed on the device.
+ */
+#define IOCTL_GNTDEV_SET_MAX_GRANTS \
+_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
+struct ioctl_gntdev_set_max_grants {
+ /* IN parameter */
+ /* The maximum number of grants that may be mapped at once. */
+ uint32_t count;
+};
+
+#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 9a731706a016..b1fab6b5b3ef 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -37,10 +37,16 @@
#ifndef __ASM_GNTTAB_H__
#define __ASM_GNTTAB_H__
-#include <asm/xen/hypervisor.h>
+#include <asm/page.h>
+
+#include <xen/interface/xen.h>
#include <xen/interface/grant_table.h>
+
+#include <asm/xen/hypervisor.h>
#include <asm/xen/grant_table.h>
+#include <xen/features.h>
+
/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
#define NR_GRANT_FRAMES 4
@@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
unsigned long pfn);
+static inline void
+gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
+ uint32_t flags, grant_ref_t ref, domid_t domid)
+{
+ if (flags & GNTMAP_contains_pte)
+ map->host_addr = addr;
+ else if (xen_feature(XENFEAT_auto_translated_physmap))
+ map->host_addr = __pa(addr);
+ else
+ map->host_addr = addr;
+
+ map->flags = flags;
+ map->ref = ref;
+ map->dom = domid;
+}
+
+static inline void
+gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
+ uint32_t flags, grant_handle_t handle)
+{
+ if (flags & GNTMAP_contains_pte)
+ unmap->host_addr = addr;
+ else if (xen_feature(XENFEAT_auto_translated_physmap))
+ unmap->host_addr = __pa(addr);
+ else
+ unmap->host_addr = addr;
+
+ unmap->handle = handle;
+ unmap->dev_bus_addr = 0;
+}
+
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
struct grant_entry **__shared);
@@ -118,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void);
#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+ struct page **pages, unsigned int count);
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct page **pages, unsigned int count);
+
#endif /* __ASM_GNTTAB_H__ */
diff --git a/kernel/fork.c b/kernel/fork.c
index d9b44f20b6b0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
+#include <linux/khugepaged.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
+ retval = khugepaged_fork(mm, oldmm);
+ if (retval)
+ goto out;
prev = NULL;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(mm->pmd_huge_pte);
+#endif
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
exit_aio(mm);
ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
@@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
mm->token_priority = 0;
mm->last_interval = 0;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ mm->pmd_huge_pte = NULL;
+#endif
+
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
+ sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
@@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
}
/*
- * We hope to recycle these flags after 2.6.26
- */
- if (unlikely(clone_flags & CLONE_STOPPED)) {
- static int __read_mostly count = 100;
-
- if (count > 0 && printk_ratelimit()) {
- char comm[TASK_COMM_LEN];
-
- count--;
- printk(KERN_INFO "fork(): process `%s' used deprecated "
- "clone flags 0x%lx\n",
- get_task_comm(comm, current),
- clone_flags & CLONE_STOPPED);
- }
- }
-
- /*
* When called from kernel_thread, don't do user tracing stuff.
*/
if (likely(user_mode(regs)))
@@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
*/
p->flags &= ~PF_STARTING;
- if (unlikely(clone_flags & CLONE_STOPPED)) {
- /*
- * We'll start up with an immediate SIGSTOP.
- */
- sigaddset(&p->pending.signal, SIGSTOP);
- set_tsk_thread_flag(p, TIF_SIGPENDING);
- __set_task_state(p, TASK_STOPPED);
- } else {
- wake_up_new_task(p, clone_flags);
- }
+ wake_up_new_task(p, clone_flags);
tracehook_report_clone_complete(trace, regs,
clone_flags, nr, p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..52075633373f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page;
+ struct page *page, *page_head;
int err;
/*
@@ -265,11 +265,46 @@ again:
if (err < 0)
return err;
- page = compound_head(page);
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ page_head = page;
+ if (unlikely(PageTail(page))) {
put_page(page);
+ /* serialize against __split_huge_page_splitting() */
+ local_irq_disable();
+ if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+ page_head = compound_head(page);
+ /*
+ * page_head is valid pointer but we must pin
+ * it before taking the PG_lock and/or
+ * PG_compound_lock. The moment we re-enable
+ * irqs __split_huge_page_splitting() can
+ * return and the head page can be freed from
+ * under us. We can't take the PG_lock and/or
+ * PG_compound_lock on a page that could be
+ * freed from under us.
+ */
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ goto again;
+ }
+ }
+#else
+ page_head = compound_head(page);
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+#endif
+
+ lock_page(page_head);
+ if (!page_head->mapping) {
+ unlock_page(page_head);
+ put_page(page_head);
goto again;
}
@@ -280,20 +315,20 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (PageAnon(page_head)) {
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page->mapping->host;
- key->shared.pgoff = page->index;
+ key->shared.inode = page_head->mapping->host;
+ key->shared.pgoff = page_head->index;
}
get_futex_key_refs(key);
- unlock_page(page);
- put_page(page);
+ unlock_page(page_head);
+ put_page(page_head);
return 0;
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
{
+ int cpu;
+
desc->irq_data.irq = irq;
desc->irq_data.chip = &no_irq_chip;
desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
desc->irq_count = 0;
desc->irqs_unhandled = 0;
desc->name = NULL;
- memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
desc_smp_init(desc, node);
}
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
if (!desc)
return NULL;
/* allocate based on nr_cpu_ids */
- desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
- gfp, node);
+ desc->kstat_irqs = alloc_percpu(unsigned int);
if (!desc->kstat_irqs)
goto err_desc;
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
return desc;
err_kstat:
- kfree(desc->kstat_irqs);
+ free_percpu(desc->kstat_irqs);
err_desc:
kfree(desc);
return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
mutex_unlock(&sparse_irq_lock);
free_masks(desc);
- kfree(desc->kstat_irqs);
+ free_percpu(desc->kstat_irqs);
kfree(desc);
}
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
}
};
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
int __init early_irq_init(void)
{
int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
for (i = 0; i < count; i++) {
desc[i].irq_data.irq = i;
desc[i].irq_data.chip = &no_irq_chip;
- desc[i].kstat_irqs = kstat_irqs_all[i];
+ /* TODO : do this allocation on-demand ... */
+ desc[i].kstat_irqs = alloc_percpu(unsigned int);
alloc_masks(desc + i, GFP_KERNEL, node);
desc_smp_init(desc + i, node);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
{
+#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
+ struct irq_desc *desc;
+ unsigned int i;
+
+ for (i = 0; i < cnt; i++) {
+ desc = irq_to_desc(start + i);
+ if (desc && !desc->kstat_irqs) {
+ unsigned int __percpu *stats = alloc_percpu(unsigned int);
+
+ if (!stats)
+ return -1;
+ if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
+ free_percpu(stats);
+ }
+ }
+#endif
return start;
}
#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
- return desc ? desc->kstat_irqs[cpu] : 0;
+
+ return desc && desc->kstat_irqs ?
+ *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
int cpu;
int sum = 0;
- if (!desc)
+ if (!desc || !desc->kstat_irqs)
return 0;
for_each_possible_cpu(cpu)
- sum += desc->kstat_irqs[cpu];
+ sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
}
#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..3ad483bdf505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
See Documentation/nommu-mmap.txt for more information.
+config TRANSPARENT_HUGEPAGE
+ bool "Transparent Hugepage Support"
+ depends on X86 && MMU
+ select COMPACTION
+ help
+ Transparent Hugepages allows the kernel to use huge pages and
+ huge tlb transparently to the applications whenever possible.
+ This feature can improve computing performance to certain
+ applications by speeding up page faults during memory
+ allocation, by reducing the number of tlb misses and by speeding
+ up the pagetable walking.
+
+ If memory constrained on embedded, you may want to say N.
+
+choice
+ prompt "Transparent Hugepage Support sysfs defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_ALWAYS
+ help
+ Selects the sysfs defaults for Transparent Hugepage Support.
+
+ config TRANSPARENT_HUGEPAGE_ALWAYS
+ bool "always"
+ help
+ Enabling Transparent Hugepage always, can increase the
+ memory footprint of applications without a guaranteed
+ benefit but it will work automatically for all applications.
+
+ config TRANSPARENT_HUGEPAGE_MADVISE
+ bool "madvise"
+ help
+ Enabling Transparent Hugepage madvise, will only provide a
+ performance improvement benefit to the applications using
+ madvise(MADV_HUGEPAGE) but it won't risk to increase the
+ memory footprint of applications without a guaranteed
+ benefit.
+endchoice
+
#
# UP and nommu archs use km based percpu allocator
#
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o
+ vmalloc.o pagewalk.o pgtable-generic.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 1a8894eadf72..6d592a021072 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
#include <linux/sysfs.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
/*
* compact_control is used to track pages being migrated and the free pages
* they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
+ bool sync; /* Synchronous migration */
/* Account for isolated anon and file pages */
unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
+
+ int compact_mode;
};
static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
struct list_head *freelist)
{
unsigned long zone_end_pfn, end_pfn;
- int total_isolated = 0;
+ int nr_scanned = 0, total_isolated = 0;
struct page *cursor;
/* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
if (!pfn_valid_within(blockpfn))
continue;
+ nr_scanned++;
if (!PageBuddy(page))
continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
}
}
+ trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
return total_isolated;
}
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
+ unsigned long last_pageblock_nr = 0, pageblock_nr;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
/* Do not scan outside zone boundaries */
@@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
struct page *page;
if (!pfn_valid_within(low_pfn))
continue;
+ nr_scanned++;
/* Get the page and skip if free */
page = pfn_to_page(low_pfn);
if (PageBuddy(page))
continue;
+ /*
+ * For async migration, also only scan in MOVABLE blocks. Async
+ * migration is optimistic to see if the minimum amount of work
+ * satisfies the allocation
+ */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+ low_pfn += pageblock_nr_pages;
+ low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
+ continue;
+ }
+
+ if (!PageLRU(page))
+ continue;
+
+ /*
+ * PageLRU is set, and lru_lock excludes isolation,
+ * splitting and collapsing (collapsing has already
+ * happened if PageLRU is set).
+ */
+ if (PageTransHuge(page)) {
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
/* Try isolate the page */
if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
continue;
+ VM_BUG_ON(PageTransCompound(page));
+
/* Successfully isolated */
del_page_from_lru_list(zone, page, page_lru(page));
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
+ nr_isolated++;
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
cc->migrate_pfn = low_pfn;
+ trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
return cc->nr_migratepages;
}
@@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
}
static int compact_finished(struct zone *zone,
- struct compact_control *cc)
+ struct compact_control *cc)
{
unsigned int order;
- unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+ unsigned long watermark;
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
@@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone,
return COMPACT_COMPLETE;
/* Compaction run is not finished if the watermark is not met */
+ if (cc->compact_mode != COMPACT_MODE_KSWAPD)
+ watermark = low_wmark_pages(zone);
+ else
+ watermark = high_wmark_pages(zone);
+ watermark += (1 << cc->order);
+
if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
return COMPACT_CONTINUE;
if (cc->order == -1)
return COMPACT_CONTINUE;
+ /*
+ * Generating only one page of the right order is not enough
+ * for kswapd, we must continue until we're above the high
+ * watermark as a pool for high order GFP_ATOMIC allocations
+ * too.
+ */
+ if (cc->compact_mode == COMPACT_MODE_KSWAPD)
+ return COMPACT_CONTINUE;
+
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
/* Job done if page is free of the right migratetype */
@@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
}
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_PARTIAL - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+ int fragindex;
+ unsigned long watermark;
+
+ /*
+ * Watermarks for order-0 must be met for compaction. Note the 2UL.
+ * This is because during migration, copies of pages need to be
+ * allocated and for a short time, the footprint is higher
+ */
+ watermark = low_wmark_pages(zone) + (2UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return COMPACT_SKIPPED;
+
+ /*
+ * fragmentation index determines if allocation failures are due to
+ * low memory or external fragmentation
+ *
+ * index of -1 implies allocations might succeed dependingon watermarks
+ * index towards 0 implies failure is due to lack of memory
+ * index towards 1000 implies failure is due to fragmentation
+ *
+ * Only compact if a failure would be due to fragmentation.
+ */
+ fragindex = fragmentation_index(zone, order);
+ if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+ return COMPACT_SKIPPED;
+
+ if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+ return COMPACT_PARTIAL;
+
+ return COMPACT_CONTINUE;
+}
+
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
+ ret = compaction_suitable(zone, cc->order);
+ switch (ret) {
+ case COMPACT_PARTIAL:
+ case COMPACT_SKIPPED:
+ /* Compaction is likely to fail */
+ return ret;
+ case COMPACT_CONTINUE:
+ /* Fall through to compaction */
+ ;
+ }
+
/* Setup to move all movable pages to the end of the zone */
cc->migrate_pfn = zone->zone_start_pfn;
cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)cc, 0);
+ (unsigned long)cc, false,
+ cc->sync);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
if (nr_remaining)
count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+ trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+ nr_remaining);
/* Release LRU pages not migrated */
if (!list_empty(&cc->migratepages)) {
@@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone,
- int order, gfp_t gfp_mask)
+unsigned long compact_zone_order(struct zone *zone,
+ int order, gfp_t gfp_mask,
+ bool sync,
+ int compact_mode)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
+ .sync = sync,
+ .compact_mode = compact_mode,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
- int order, gfp_t gfp_mask, nodemask_t *nodemask)
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+ bool sync)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
- unsigned long watermark;
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
@@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
* made because an assumption is made that the page allocator can satisfy
* the "cheaper" orders without taking special steps
*/
- if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+ if (!order || !may_enter_fs || !may_perform_io)
return rc;
count_vm_event(COMPACTSTALL);
@@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
- int fragindex;
int status;
- /*
- * Watermarks for order-0 must be met for compaction. Note
- * the 2UL. This is because during migration, copies of
- * pages need to be allocated and for a short time, the
- * footprint is higher
- */
- watermark = low_wmark_pages(zone) + (2UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
- continue;
-
- /*
- * fragmentation index determines if allocation failures are
- * due to low memory or external fragmentation
- *
- * index of -1 implies allocations might succeed depending
- * on watermarks
- * index towards 0 implies failure is due to lack of memory
- * index towards 1000 implies failure is due to fragmentation
- *
- * Only compact if a failure would be due to fragmentation.
- */
- fragindex = fragmentation_index(zone, order);
- if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- continue;
-
- if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
- rc = COMPACT_PARTIAL;
- break;
- }
-
- status = compact_zone_order(zone, order, gfp_mask);
+ status = compact_zone_order(zone, order, gfp_mask, sync,
+ COMPACT_MODE_DIRECT_RECLAIM);
rc = max(status, rc);
- if (zone_watermark_ok(zone, order, watermark, 0, 0))
+ /* If a normal allocation would succeed, stop compacting */
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
break;
}
@@ -531,6 +620,7 @@ static int compact_node(int nid)
.nr_freepages = 0,
.nr_migratepages = 0,
.order = -1,
+ .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
};
zone = &pgdat->node_zones[zoneid];
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
if (mem_flags & __GFP_WAIT) {
DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
__add_wait_queue(&pool->waitq, &wait);
spin_unlock_irqrestore(&pool->lock, flags);
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
{
- unsigned long flags;
struct dma_page *page;
- spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry(page, &pool->page_list, page_list) {
if (dma < page->dma)
continue;
if (dma < (page->dma + pool->allocation))
- goto done;
+ return page;
}
- page = NULL;
- done:
- spin_unlock_irqrestore(&pool->lock, flags);
- return page;
+ return NULL;
}
/**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
unsigned long flags;
unsigned int offset;
+ spin_lock_irqsave(&pool->lock, flags);
page = pool_find_page(pool, dma);
if (!page) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
offset = vaddr - page->vaddr;
#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
chain = *(int *)(page->vaddr + chain);
continue;
}
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
"already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
memset(vaddr, POOL_POISON_FREED, pool->size);
#endif
- spin_lock_irqsave(&pool->lock, flags);
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
diff --git a/mm/filemap.c b/mm/filemap.c
index ca389394fa2a..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
continue;
wait_on_page_writeback(page);
- if (PageError(page))
+ if (TestClearPageError(page))
ret = -EIO;
}
pagevec_release(&pvec);
@@ -837,9 +837,6 @@ repeat:
if (radix_tree_deref_retry(page))
goto restart;
- if (page->mapping == NULL || page->index != index)
- break;
-
if (!page_cache_get_speculative(page))
goto repeat;
@@ -849,6 +846,16 @@ repeat:
goto repeat;
}
+ /*
+ * must check mapping and index after taking the ref.
+ * otherwise we can get both false positives and false
+ * negatives, which is just confusing to the caller.
+ */
+ if (page->mapping == NULL || page->index != index) {
+ page_cache_release(page);
+ break;
+ }
+
pages[ret] = page;
ret++;
index++;
@@ -2220,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);
- if (likely(page))
+ if (page)
return page;
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..004c9c2aac78
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2346 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+ (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+
+/* default scan 8*512 pte (or vmas) every 30 second */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
+static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+} khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+
+static int set_recommended_min_free_kbytes(void)
+{
+ struct zone *zone;
+ int nr_zones = 0;
+ unsigned long recommended_min;
+ extern int min_free_kbytes;
+
+ if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) &&
+ !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ return 0;
+
+ for_each_populated_zone(zone)
+ nr_zones++;
+
+ /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+ recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+ /*
+ * Make sure that on average at least two pageblocks are almost free
+ * of another type, one for a migratetype to fall back to and a
+ * second to avoid subsequent fallbacks of other types There are 3
+ * MIGRATE_TYPES we care about.
+ */
+ recommended_min += pageblock_nr_pages * nr_zones *
+ MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+ /* don't ever allow to reserve more than 5% of the lowmem */
+ recommended_min = min(recommended_min,
+ (unsigned long) nr_free_buffer_pages() / 20);
+ recommended_min <<= (PAGE_SHIFT-10);
+
+ if (recommended_min > min_free_kbytes)
+ min_free_kbytes = recommended_min;
+ setup_per_zone_wmarks();
+ return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
+static int start_khugepaged(void)
+{
+ int err = 0;
+ if (khugepaged_enabled()) {
+ int wakeup;
+ if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ mutex_lock(&khugepaged_mutex);
+ if (!khugepaged_thread)
+ khugepaged_thread = kthread_run(khugepaged, NULL,
+ "khugepaged");
+ if (unlikely(IS_ERR(khugepaged_thread))) {
+ printk(KERN_ERR
+ "khugepaged: kthread_run(khugepaged) failed\n");
+ err = PTR_ERR(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+ wakeup = !list_empty(&khugepaged_scan.mm_head);
+ mutex_unlock(&khugepaged_mutex);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ set_recommended_min_free_kbytes();
+ } else
+ /* wakeup to exit */
+ wake_up_interruptible(&khugepaged_wait);
+out:
+ return err;
+}
+
+#ifdef CONFIG_SYSFS
+
+static ssize_t double_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag req_madv)
+{
+ if (test_bit(enabled, &transparent_hugepage_flags)) {
+ VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
+ return sprintf(buf, "[always] madvise never\n");
+ } else if (test_bit(req_madv, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
+}
+static ssize_t double_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag req_madv)
+{
+ if (!memcmp("always", buf,
+ min(sizeof("always")-1, count))) {
+ set_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ } else if (!memcmp("madvise", buf,
+ min(sizeof("madvise")-1, count))) {
+ clear_bit(enabled, &transparent_hugepage_flags);
+ set_bit(req_madv, &transparent_hugepage_flags);
+ } else if (!memcmp("never", buf,
+ min(sizeof("never")-1, count))) {
+ clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return double_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+}
+static ssize_t enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+
+ ret = double_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+
+ if (ret > 0) {
+ int err = start_khugepaged();
+ if (err)
+ ret = err;
+ }
+
+ if (ret > 0 &&
+ (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) ||
+ test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags)))
+ set_recommended_min_free_kbytes();
+
+ return ret;
+}
+static struct kobj_attribute enabled_attr =
+ __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static ssize_t single_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag flag)
+{
+ if (test_bit(flag, &transparent_hugepage_flags))
+ return sprintf(buf, "[yes] no\n");
+ else
+ return sprintf(buf, "yes [no]\n");
+}
+static ssize_t single_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag flag)
+{
+ if (!memcmp("yes", buf,
+ min(sizeof("yes")-1, count))) {
+ set_bit(flag, &transparent_hugepage_flags);
+ } else if (!memcmp("no", buf,
+ min(sizeof("no")-1, count))) {
+ clear_bit(flag, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+/*
+ * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
+ * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
+ * memory just to allocate one more hugepage.
+ */
+static ssize_t defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return double_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static ssize_t defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return double_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static struct kobj_attribute defrag_attr =
+ __ATTR(defrag, 0644, defrag_show, defrag_store);
+
+#ifdef CONFIG_DEBUG_VM
+static ssize_t debug_cow_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static ssize_t debug_cow_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static struct kobj_attribute debug_cow_attr =
+ __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
+#endif /* CONFIG_DEBUG_VM */
+
+static struct attribute *hugepage_attr[] = {
+ &enabled_attr.attr,
+ &defrag_attr.attr,
+#ifdef CONFIG_DEBUG_VM
+ &debug_cow_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group hugepage_attr_group = {
+ .attrs = hugepage_attr,
+};
+
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_scan_sleep_millisecs = msecs;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+ __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+ scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_alloc_sleep_millisecs = msecs;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+ __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+ alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long pages;
+
+ err = strict_strtoul(buf, 10, &pages);
+ if (err || !pages || pages > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_pages_to_scan = pages;
+
+ return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+ __ATTR(pages_to_scan, 0644, pages_to_scan_show,
+ pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+ __ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+ __ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+ __ATTR(defrag, 0644, khugepaged_defrag_show,
+ khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_none;
+
+ err = strict_strtoul(buf, 10, &max_ptes_none);
+ if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_none = max_ptes_none;
+
+ return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+ __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+ khugepaged_max_ptes_none_store);
+
+static struct attribute *khugepaged_attr[] = {
+ &khugepaged_defrag_attr.attr,
+ &khugepaged_max_ptes_none_attr.attr,
+ &pages_to_scan_attr.attr,
+ &pages_collapsed_attr.attr,
+ &full_scans_attr.attr,
+ &scan_sleep_millisecs_attr.attr,
+ &alloc_sleep_millisecs_attr.attr,
+ NULL,
+};
+
+static struct attribute_group khugepaged_attr_group = {
+ .attrs = khugepaged_attr,
+ .name = "khugepaged",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init hugepage_init(void)
+{
+ int err;
+#ifdef CONFIG_SYSFS
+ static struct kobject *hugepage_kobj;
+#endif
+
+ err = -EINVAL;
+ if (!has_transparent_hugepage()) {
+ transparent_hugepage_flags = 0;
+ goto out;
+ }
+
+#ifdef CONFIG_SYSFS
+ err = -ENOMEM;
+ hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+ if (unlikely(!hugepage_kobj)) {
+ printk(KERN_ERR "hugepage: failed kobject create\n");
+ goto out;
+ }
+
+ err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+ if (err) {
+ printk(KERN_ERR "hugepage: failed register hugeage group\n");
+ goto out;
+ }
+
+ err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+ if (err) {
+ printk(KERN_ERR "hugepage: failed register hugeage group\n");
+ goto out;
+ }
+#endif
+
+ err = khugepaged_slab_init();
+ if (err)
+ goto out;
+
+ err = mm_slots_hash_init();
+ if (err) {
+ khugepaged_slab_free();
+ goto out;
+ }
+
+ /*
+ * By default disable transparent hugepages on smaller systems,
+ * where the extra memory used could hurt more than TLB overhead
+ * is likely to save. The admin can still enable it through /sys.
+ */
+ if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ transparent_hugepage_flags = 0;
+
+ start_khugepaged();
+
+ set_recommended_min_free_kbytes();
+
+out:
+ return err;
+}
+module_init(hugepage_init)
+
+static int __init setup_transparent_hugepage(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ if (!strcmp(str, "always")) {
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "madvise")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "never")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ printk(KERN_WARNING
+ "transparent_hugepage= cannot parse, ignored\n");
+ return ret;
+}
+__setup("transparent_hugepage=", setup_transparent_hugepage);
+
+static void prepare_pmd_huge_pte(pgtable_t pgtable,
+ struct mm_struct *mm)
+{
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ if (!mm->pmd_huge_pte)
+ INIT_LIST_HEAD(&pgtable->lru);
+ else
+ list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+ mm->pmd_huge_pte = pgtable;
+}
+
+static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_flags & VM_WRITE))
+ pmd = pmd_mkwrite(pmd);
+ return pmd;
+}
+
+static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ struct page *page)
+{
+ int ret = 0;
+ pgtable_t pgtable;
+
+ VM_BUG_ON(!PageCompound(page));
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable)) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ return VM_FAULT_OOM;
+ }
+
+ clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ __SetPageUptodate(page);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_none(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ pte_free(mm, pgtable);
+ } else {
+ pmd_t entry;
+ entry = mk_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ /*
+ * The spinlocking to take the lru_lock inside
+ * page_add_new_anon_rmap() acts as a full memory
+ * barrier to be sure clear_huge_page writes become
+ * visible after the set_pmd_at() write.
+ */
+ page_add_new_anon_rmap(page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ prepare_pmd_huge_pte(pgtable, mm);
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ spin_unlock(&mm->page_table_lock);
+ }
+
+ return ret;
+}
+
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+ return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+ struct vm_area_struct *vma,
+ unsigned long haddr)
+{
+ return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+ HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag),
+ HPAGE_PMD_ORDER);
+}
+#endif
+
+int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pte_t *pte;
+
+ if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr);
+ if (unlikely(!page))
+ goto out;
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ goto out;
+ }
+
+ return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
+ }
+out:
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if an huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ return VM_FAULT_OOM;
+ /* if an huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+}
+
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *vma)
+{
+ struct page *src_page;
+ pmd_t pmd;
+ pgtable_t pgtable;
+ int ret;
+
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+
+ spin_lock(&dst_mm->page_table_lock);
+ spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pmd = *src_pmd;
+ if (unlikely(!pmd_trans_huge(pmd))) {
+ pte_free(dst_mm, pgtable);
+ goto out_unlock;
+ }
+ if (unlikely(pmd_trans_splitting(pmd))) {
+ /* split huge page running from under us */
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+ pte_free(dst_mm, pgtable);
+
+ wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
+ goto out;
+ }
+ src_page = pmd_page(pmd);
+ VM_BUG_ON(!PageHead(src_page));
+ get_page(src_page);
+ page_dup_rmap(src_page);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+ pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ pmd = pmd_mkold(pmd_wrprotect(pmd));
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ prepare_pmd_huge_pte(pgtable, dst_mm);
+
+ ret = 0;
+out_unlock:
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+out:
+ return ret;
+}
+
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
+{
+ pgtable_t pgtable;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ pgtable = mm->pmd_huge_pte;
+ if (list_empty(&pgtable->lru))
+ mm->pmd_huge_pte = NULL;
+ else {
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+ struct page, lru);
+ list_del(&pgtable->lru);
+ }
+ return pgtable;
+}
+
+static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ struct page *page,
+ unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int ret = 0, i;
+ struct page **pages;
+
+ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
+ GFP_KERNEL);
+ if (unlikely(!pages)) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+ vma, address);
+ if (unlikely(!pages[i] ||
+ mem_cgroup_newpage_charge(pages[i], mm,
+ GFP_KERNEL))) {
+ if (pages[i])
+ put_page(pages[i]);
+ mem_cgroup_uncharge_start();
+ while (--i >= 0) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ copy_user_highpage(pages[i], page + i,
+ haddr + PAGE_SHIFT*i, vma);
+ __SetPageUptodate(pages[i]);
+ cond_resched();
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_pages;
+ VM_BUG_ON(!PageHead(page));
+
+ pmdp_clear_flush_notify(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = mk_pte(pages[i], vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(pages[i], vma, haddr);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ kfree(pages);
+
+ mm->nr_ptes++;
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ page_remove_rmap(page);
+ spin_unlock(&mm->page_table_lock);
+
+ ret |= VM_FAULT_WRITE;
+ put_page(page);
+
+out:
+ return ret;
+
+out_free_pages:
+ spin_unlock(&mm->page_table_lock);
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ goto out;
+}
+
+int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+{
+ int ret = 0;
+ struct page *page, *new_page;
+ unsigned long haddr;
+
+ VM_BUG_ON(!vma->anon_vma);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_unlock;
+
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+ haddr = address & HPAGE_PMD_MASK;
+ if (page_mapcount(page) == 1) {
+ pmd_t entry;
+ entry = pmd_mkyoung(orig_pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+ update_mmu_cache(vma, address, entry);
+ ret |= VM_FAULT_WRITE;
+ goto out_unlock;
+ }
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ if (transparent_hugepage_enabled(vma) &&
+ !transparent_hugepage_debug_cow())
+ new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr);
+ else
+ new_page = NULL;
+
+ if (unlikely(!new_page)) {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ put_page(page);
+ goto out;
+ }
+
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ put_page(new_page);
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ __SetPageUptodate(new_page);
+
+ spin_lock(&mm->page_table_lock);
+ put_page(page);
+ if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+ mem_cgroup_uncharge_page(new_page);
+ put_page(new_page);
+ } else {
+ pmd_t entry;
+ VM_BUG_ON(!PageHead(page));
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
+ page_add_new_anon_rmap(new_page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache(vma, address, entry);
+ page_remove_rmap(page);
+ put_page(page);
+ ret |= VM_FAULT_WRITE;
+ }
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+out:
+ return ret;
+}
+
+struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page = NULL;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ goto out;
+
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!PageHead(page));
+ if (flags & FOLL_TOUCH) {
+ pmd_t _pmd;
+ /*
+ * We should set the dirty bit only for FOLL_WRITE but
+ * for now the dirty bit in the pmd is meaningless.
+ * And if the dirty bit will become meaningful and
+ * we'll only set it with FOLL_WRITE, an atomic
+ * set_bit will be required on the pmd to set the
+ * young bit, instead of the current set_pmd_at.
+ */
+ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+ }
+ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ VM_BUG_ON(!PageCompound(page));
+ if (flags & FOLL_GET)
+ get_page(page);
+
+out:
+ return page;
+}
+
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd)
+{
+ int ret = 0;
+
+ spin_lock(&tlb->mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&tlb->mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma,
+ pmd);
+ } else {
+ struct page *page;
+ pgtable_t pgtable;
+ pgtable = get_pmd_huge_pte(tlb->mm);
+ page = pmd_page(*pmd);
+ pmd_clear(pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ pte_free(tlb->mm, pgtable);
+ ret = 1;
+ }
+ } else
+ spin_unlock(&tlb->mm->page_table_lock);
+
+ return ret;
+}
+
+int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ int ret = 0;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ ret = !pmd_trans_splitting(*pmd);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ if (unlikely(!ret))
+ wait_split_huge_page(vma->anon_vma, pmd);
+ else {
+ /*
+ * All logical pages in the range are present
+ * if backed by a huge page.
+ */
+ memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+ }
+ } else
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ return ret;
+}
+
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, pgprot_t newprot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int ret = 0;
+
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ pmd_t entry;
+
+ entry = pmdp_get_and_clear(mm, addr, pmd);
+ entry = pmd_modify(entry, newprot);
+ set_pmd_at(mm, addr, pmd, entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ ret = 1;
+ }
+ } else
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ return ret;
+}
+
+pmd_t *page_check_address_pmd(struct page *page,
+ struct mm_struct *mm,
+ unsigned long address,
+ enum page_check_address_pmd_flag flag)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, *ret = NULL;
+
+ if (address & ~HPAGE_PMD_MASK)
+ goto out;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ goto out;
+ if (pmd_page(*pmd) != page)
+ goto out;
+ /*
+ * split_vma() may create temporary aliased mappings. There is
+ * no risk as long as all huge pmd are found and have their
+ * splitting bit set before __split_huge_page_refcount
+ * runs. Finding the same huge pmd more than once during the
+ * same rmap walk is not a problem.
+ */
+ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+ pmd_trans_splitting(*pmd))
+ goto out;
+ if (pmd_trans_huge(*pmd)) {
+ VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
+ !pmd_trans_splitting(*pmd));
+ ret = pmd;
+ }
+out:
+ return ret;
+}
+
+static int __split_huge_page_splitting(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd;
+ int ret = 0;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ if (pmd) {
+ /*
+ * We can't temporarily set the pmd to null in order
+ * to split it, the pmd must remain marked huge at all
+ * times or the VM won't take the pmd_trans_huge paths
+ * and it won't wait on the anon_vma->root->lock to
+ * serialize against split_huge_page*.
+ */
+ pmdp_splitting_flush_notify(vma, address, pmd);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
+static void __split_huge_page_refcount(struct page *page)
+{
+ int i;
+ unsigned long head_index = page->index;
+ struct zone *zone = page_zone(page);
+ int zonestat;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ compound_lock(page);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+
+ /* tail_page->_count cannot change */
+ atomic_sub(atomic_read(&page_tail->_count), &page->_count);
+ BUG_ON(page_count(page) <= 0);
+ atomic_add(page_mapcount(page) + 1, &page_tail->_count);
+ BUG_ON(atomic_read(&page_tail->_count) <= 0);
+
+ /* after clearing PageTail the gup refcount can be released */
+ smp_mb();
+
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (page->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ /*
+ * 1) clear PageTail before overwriting first_page
+ * 2) clear PageTail before clearing PageHead for VM_BUG_ON
+ */
+ smp_wmb();
+
+ /*
+ * __split_huge_page_splitting() already set the
+ * splitting bit in all pmd that could map this
+ * hugepage, that will ensure no CPU can alter the
+ * mapcount on the head page. The mapcount is only
+ * accounted in the head page and it has to be
+ * transferred to all tail pages in the below code. So
+ * for this code to be safe, the split the mapcount
+ * can't change. But that doesn't mean userland can't
+ * keep changing and reading the page contents while
+ * we transfer the mapcount, so the pmd splitting
+ * status is achieved setting a reserved bit in the
+ * pmd, not by clearing the present bit.
+ */
+ BUG_ON(page_mapcount(page_tail));
+ page_tail->_mapcount = page->_mapcount;
+
+ BUG_ON(page_tail->mapping);
+ page_tail->mapping = page->mapping;
+
+ page_tail->index = ++head_index;
+
+ BUG_ON(!PageAnon(page_tail));
+ BUG_ON(!PageUptodate(page_tail));
+ BUG_ON(!PageDirty(page_tail));
+ BUG_ON(!PageSwapBacked(page_tail));
+
+ lru_add_page_tail(zone, page, page_tail);
+ }
+
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+
+ /*
+ * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
+ * so adjust those appropriately if this page is on the LRU.
+ */
+ if (PageLRU(page)) {
+ zonestat = NR_LRU_BASE + page_lru(page);
+ __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
+ }
+
+ ClearPageCompound(page);
+ compound_unlock(page);
+ spin_unlock_irq(&zone->lru_lock);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+ BUG_ON(page_count(page_tail) <= 0);
+ /*
+ * Tail pages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(page_tail);
+ }
+
+ /*
+ * Only the head page (now become a regular page) is required
+ * to be pinned by the caller.
+ */
+ BUG_ON(page_count(page) <= 0);
+}
+
+static int __split_huge_page_map(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd, _pmd;
+ int ret = 0, i;
+ pgtable_t pgtable;
+ unsigned long haddr;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ if (pmd) {
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, haddr = address; i < HPAGE_PMD_NR;
+ i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!pmd_write(*pmd))
+ entry = pte_wrprotect(entry);
+ else
+ BUG_ON(page_mapcount(page) != 1);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+
+ mm->nr_ptes++;
+ smp_wmb(); /* make pte visible before pmd */
+ /*
+ * Up to this point the pmd is present and huge and
+ * userland has the whole access to the hugepage
+ * during the split (which happens in place). If we
+ * overwrite the pmd with the not-huge version
+ * pointing to the pte here (which of course we could
+ * if all CPUs were bug free), userland could trigger
+ * a small page size TLB miss on the small sized TLB
+ * while the hugepage TLB entry is still established
+ * in the huge TLB. Some CPU doesn't like that. See
+ * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
+ * Erratum 383 on page 93. Intel should be safe but is
+ * also warns that it's only safe if the permission
+ * and cache attributes of the two entries loaded in
+ * the two TLB is identical (which should be the case
+ * here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual
+ * address to be loaded simultaneously. So instead of
+ * doing "pmd_populate(); flush_tlb_range();" we first
+ * mark the current pmd notpresent (atomically because
+ * here the pmd_trans_huge and pmd_trans_splitting
+ * must remain set at all times on the pmd until the
+ * split is complete for this pmd), then we flush the
+ * SMP TLB and finally we write the non-huge version
+ * of the pmd entry with pmd_populate.
+ */
+ set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
+/* must be called with anon_vma->root->lock hold */
+static void __split_huge_page(struct page *page,
+ struct anon_vma *anon_vma)
+{
+ int mapcount, mapcount2;
+ struct anon_vma_chain *avc;
+
+ BUG_ON(!PageHead(page));
+ BUG_ON(PageTail(page));
+
+ mapcount = 0;
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ if (addr == -EFAULT)
+ continue;
+ mapcount += __split_huge_page_splitting(page, vma, addr);
+ }
+ /*
+ * It is critical that new vmas are added to the tail of the
+ * anon_vma list. This guarantes that if copy_huge_pmd() runs
+ * and establishes a child pmd before
+ * __split_huge_page_splitting() freezes the parent pmd (so if
+ * we fail to prevent copy_huge_pmd() from running until the
+ * whole __split_huge_page() is complete), we will still see
+ * the newly established pmd of the child later during the
+ * walk, to be able to set it as pmd_trans_splitting too.
+ */
+ if (mapcount != page_mapcount(page))
+ printk(KERN_ERR "mapcount %d page_mapcount %d\n",
+ mapcount, page_mapcount(page));
+ BUG_ON(mapcount != page_mapcount(page));
+
+ __split_huge_page_refcount(page);
+
+ mapcount2 = 0;
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ if (addr == -EFAULT)
+ continue;
+ mapcount2 += __split_huge_page_map(page, vma, addr);
+ }
+ if (mapcount != mapcount2)
+ printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
+ mapcount, mapcount2, page_mapcount(page));
+ BUG_ON(mapcount != mapcount2);
+}
+
+int split_huge_page(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ int ret = 1;
+
+ BUG_ON(!PageAnon(page));
+ anon_vma = page_lock_anon_vma(page);
+ if (!anon_vma)
+ goto out;
+ ret = 0;
+ if (!PageCompound(page))
+ goto out_unlock;
+
+ BUG_ON(!PageSwapBacked(page));
+ __split_huge_page(page, anon_vma);
+
+ BUG_ON(PageCompound(page));
+out_unlock:
+ page_unlock_anon_vma(anon_vma);
+out:
+ return ret;
+}
+
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ switch (advice) {
+ case MADV_HUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_HUGEPAGE |
+ VM_SHARED | VM_MAYSHARE |
+ VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+ VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+ VM_MIXEDMAP | VM_SAO))
+ return -EINVAL;
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+ * If the vma become good for khugepaged to scan,
+ * register it here without waiting a page fault that
+ * may not happen any time soon.
+ */
+ if (unlikely(khugepaged_enter_vma_merge(vma)))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_NOHUGEPAGE |
+ VM_SHARED | VM_MAYSHARE |
+ VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+ VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+ VM_MIXEDMAP | VM_SAO))
+ return -EINVAL;
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+ * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+ * this vma even if we leave the mm registered in khugepaged if
+ * it got registered before VM_NOHUGEPAGE was set.
+ */
+ break;
+ }
+
+ return 0;
+}
+
+static int __init khugepaged_slab_init(void)
+{
+ mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+ sizeof(struct mm_slot),
+ __alignof__(struct mm_slot), 0, NULL);
+ if (!mm_slot_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __init khugepaged_slab_free(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+ mm_slot_cache = NULL;
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static int __init mm_slots_hash_init(void)
+{
+ mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!mm_slots_hash)
+ return -ENOMEM;
+ return 0;
+}
+
+#if 0
+static void __init mm_slots_hash_free(void)
+{
+ kfree(mm_slots_hash);
+ mm_slots_hash = NULL;
+}
+#endif
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ struct hlist_head *bucket;
+ struct hlist_node *node;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ hlist_for_each_entry(mm_slot, node, bucket, hash) {
+ if (mm == mm_slot->mm)
+ return mm_slot;
+ }
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ struct hlist_head *bucket;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ mm_slot->mm = mm;
+ hlist_add_head(&mm_slot->hash, bucket);
+}
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+int __khugepaged_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON(khugepaged_test_exit(mm));
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+ free_mm_slot(mm_slot);
+ return 0;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * Insert just behind the scanning cursor, to let the area settle
+ * down a little.
+ */
+ wakeup = list_empty(&khugepaged_scan.mm_head);
+ list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ spin_unlock(&khugepaged_mm_lock);
+
+ atomic_inc(&mm->mm_count);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ return 0;
+}
+
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+ unsigned long hstart, hend;
+ if (!vma->anon_vma)
+ /*
+ * Not yet faulted in so we will register later in the
+ * page fault if needed.
+ */
+ return 0;
+ if (vma->vm_file || vma->vm_ops)
+ /* khugepaged not yet working on file or special mappings */
+ return 0;
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart < hend)
+ return khugepaged_enter(vma);
+ return 0;
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int free = 0;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ hlist_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+ free = 1;
+ }
+
+ if (free) {
+ spin_unlock(&khugepaged_mm_lock);
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ spin_unlock(&khugepaged_mm_lock);
+ /*
+ * This is required to serialize against
+ * khugepaged_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we
+ * return all pagetables will be destroyed) until
+ * khugepaged has finished working on the pagetables
+ * under the mmap_sem.
+ */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ } else
+ spin_unlock(&khugepaged_mm_lock);
+}
+
+static void release_pte_page(struct page *page)
+{
+ /* 0 stands for page_is_file_cache(page) == false */
+ dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ unlock_page(page);
+ putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+ while (--_pte >= pte) {
+ pte_t pteval = *_pte;
+ if (!pte_none(pteval))
+ release_pte_page(pte_page(pteval));
+ }
+}
+
+static void release_all_pte_pages(pte_t *pte)
+{
+ release_pte_pages(pte, pte + HPAGE_PMD_NR);
+}
+
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte)
+{
+ struct page *page;
+ pte_t *_pte;
+ int referenced = 0, isolated = 0, none = 0;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ }
+ if (!pte_present(pteval) || !pte_write(pteval)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ VM_BUG_ON(PageCompound(page));
+ BUG_ON(!PageAnon(page));
+ VM_BUG_ON(!PageSwapBacked(page));
+
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /*
+ * Isolate the page to avoid collapsing an hugepage
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /* 0 stands for page_is_file_cache(page) == false */
+ inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageLRU(page));
+
+ /* If there is no mapped pte young don't collapse the page */
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (unlikely(!referenced))
+ release_all_pte_pages(pte);
+ else
+ isolated = 1;
+out:
+ return isolated;
+}
+
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl)
+{
+ pte_t *_pte;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+ pte_t pteval = *_pte;
+ struct page *src_page;
+
+ if (pte_none(pteval)) {
+ clear_user_highpage(page, address);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ } else {
+ src_page = pte_page(pteval);
+ copy_user_highpage(page, src_page, address, vma);
+ VM_BUG_ON(page_mapcount(src_page) != 1);
+ VM_BUG_ON(page_count(src_page) != 2);
+ release_pte_page(src_page);
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page);
+ spin_unlock(ptl);
+ free_page_and_swap_cache(src_page);
+ }
+
+ address += PAGE_SIZE;
+ page++;
+ }
+}
+
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ struct vm_area_struct *vma)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *ptl;
+ int isolated;
+ unsigned long hstart, hend;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
+ VM_BUG_ON(!*hpage);
+ new_page = *hpage;
+#else
+ VM_BUG_ON(*hpage);
+ /*
+ * Allocate the page while the vma is still valid and under
+ * the mmap_sem read mode so there is no memory allocation
+ * later when we take the mmap_sem in write mode. This is more
+ * friendly behavior (OTOH it may actually hide bugs) to
+ * filesystems in userland with daemons allocating memory in
+ * the userland I/O paths. Allocating memory with the
+ * mmap_sem in read mode is good idea also to allow greater
+ * scalability.
+ */
+ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+ if (unlikely(!new_page)) {
+ up_read(&mm->mmap_sem);
+ *hpage = ERR_PTR(-ENOMEM);
+ return;
+ }
+#endif
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ up_read(&mm->mmap_sem);
+ put_page(new_page);
+ return;
+ }
+
+ /* after allocating the hugepage upgrade to mmap_sem write mode */
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Prevent all access to pagetables with the exception of
+ * gup_fast later hanlded by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ vma = find_vma(mm, address);
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ goto out;
+
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
+ goto out;
+
+ /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+ if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ goto out;
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ /* pmd can't go away or become huge under us */
+ if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ goto out;
+
+ anon_vma_lock(vma->anon_vma);
+
+ pte = pte_offset_map(pmd, address);
+ ptl = pte_lockptr(mm, pmd);
+
+ spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ /*
+ * After this gup_fast can't run anymore. This also removes
+ * any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address
+ * to avoid the risk of CPU bugs in that area.
+ */
+ _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+ spin_unlock(&mm->page_table_lock);
+
+ spin_lock(ptl);
+ isolated = __collapse_huge_page_isolate(vma, address, pte);
+ spin_unlock(ptl);
+ pte_unmap(pte);
+
+ if (unlikely(!isolated)) {
+ spin_lock(&mm->page_table_lock);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd_at(mm, address, pmd, _pmd);
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock(vma->anon_vma);
+ mem_cgroup_uncharge_page(new_page);
+ goto out;
+ }
+
+ /*
+ * All pages are isolated and locked so anon_vma rmap
+ * can't run anymore.
+ */
+ anon_vma_unlock(vma->anon_vma);
+
+ __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+ __SetPageUptodate(new_page);
+ pgtable = pmd_pgtable(_pmd);
+ VM_BUG_ON(page_count(pgtable) != 1);
+ VM_BUG_ON(page_mapcount(pgtable) != 0);
+
+ _pmd = mk_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+ _pmd = pmd_mkhuge(_pmd);
+
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), so
+ * this is needed to avoid the copy_huge_page writes to become
+ * visible after the set_pmd_at() write.
+ */
+ smp_wmb();
+
+ spin_lock(&mm->page_table_lock);
+ BUG_ON(!pmd_none(*pmd));
+ page_add_new_anon_rmap(new_page, vma, address);
+ set_pmd_at(mm, address, pmd, _pmd);
+ update_mmu_cache(vma, address, entry);
+ prepare_pmd_huge_pte(pgtable, mm);
+ mm->nr_ptes--;
+ spin_unlock(&mm->page_table_lock);
+
+#ifndef CONFIG_NUMA
+ *hpage = NULL;
+#endif
+ khugepaged_pages_collapsed++;
+out_up_write:
+ up_write(&mm->mmap_sem);
+ return;
+
+out:
+#ifdef CONFIG_NUMA
+ put_page(new_page);
+#endif
+ goto out_up_write;
+}
+
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, *_pte;
+ int ret = 0, referenced = 0, none = 0;
+ struct page *page;
+ unsigned long _address;
+ spinlock_t *ptl;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ goto out;
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, _address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else
+ goto out_unmap;
+ }
+ if (!pte_present(pteval) || !pte_write(pteval))
+ goto out_unmap;
+ page = vm_normal_page(vma, _address, pteval);
+ if (unlikely(!page))
+ goto out_unmap;
+ VM_BUG_ON(PageCompound(page));
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ goto out_unmap;
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1)
+ goto out_unmap;
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (referenced)
+ ret = 1;
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+ if (ret)
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma);
+out:
+ return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+
+ VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
+ hlist_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+
+ /*
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ */
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+ struct page **hpage)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ VM_BUG_ON(!pages);
+ VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else {
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ vma = NULL;
+ else
+ vma = find_vma(mm, khugepaged_scan.address);
+
+ progress++;
+ for (; vma; vma = vma->vm_next) {
+ unsigned long hstart, hend;
+
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm))) {
+ progress++;
+ break;
+ }
+
+ if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+ !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE)) {
+ progress++;
+ continue;
+ }
+
+ /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+ if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
+ khugepaged_scan.address = vma->vm_end;
+ progress++;
+ continue;
+ }
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart >= hend) {
+ progress++;
+ continue;
+ }
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+ if (khugepaged_scan.address > hend) {
+ khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
+ progress++;
+ continue;
+ }
+ BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+ while (khugepaged_scan.address < hend) {
+ int ret;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + HPAGE_PMD_SIZE >
+ hend);
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage);
+ /* move to next address */
+ khugepaged_scan.address += HPAGE_PMD_SIZE;
+ progress += HPAGE_PMD_NR;
+ if (ret)
+ /* we released mmap_sem so break loop */
+ goto breakouterloop_mmap_sem;
+ if (progress >= pages)
+ goto breakouterloop;
+ }
+ }
+breakouterloop:
+ up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+ spin_lock(&khugepaged_mm_lock);
+ BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ /*
+ * Release the current mm_slot if this mm is about to die, or
+ * if we scanned all vmas of this mm.
+ */
+ if (khugepaged_test_exit(mm) || !vma) {
+ /*
+ * Make sure that if mm_users is reaching zero while
+ * khugepaged runs here, khugepaged_exit will find
+ * mm_slot not pointing to the exiting mm.
+ */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ } else {
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+
+ collect_mm_slot(mm_slot);
+ }
+
+ return progress;
+}
+
+static int khugepaged_has_work(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) &&
+ khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) ||
+ !khugepaged_enabled();
+}
+
+static void khugepaged_do_scan(struct page **hpage)
+{
+ unsigned int progress = 0, pass_through_head = 0;
+ unsigned int pages = khugepaged_pages_to_scan;
+
+ barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+ while (progress < pages) {
+ cond_resched();
+
+#ifndef CONFIG_NUMA
+ if (!*hpage) {
+ *hpage = alloc_hugepage(khugepaged_defrag());
+ if (unlikely(!*hpage))
+ break;
+ }
+#else
+ if (IS_ERR(*hpage))
+ break;
+#endif
+
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
+
+ spin_lock(&khugepaged_mm_lock);
+ if (!khugepaged_scan.mm_slot)
+ pass_through_head++;
+ if (khugepaged_has_work() &&
+ pass_through_head < 2)
+ progress += khugepaged_scan_mm_slot(pages - progress,
+ hpage);
+ else
+ progress = pages;
+ spin_unlock(&khugepaged_mm_lock);
+ }
+}
+
+static void khugepaged_alloc_sleep(void)
+{
+ DEFINE_WAIT(wait);
+ add_wait_queue(&khugepaged_wait, &wait);
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(
+ khugepaged_alloc_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
+static struct page *khugepaged_alloc_hugepage(void)
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_hugepage(khugepaged_defrag());
+ if (!hpage)
+ khugepaged_alloc_sleep();
+ } while (unlikely(!hpage) &&
+ likely(khugepaged_enabled()));
+ return hpage;
+}
+#endif
+
+static void khugepaged_loop(void)
+{
+ struct page *hpage;
+
+#ifdef CONFIG_NUMA
+ hpage = NULL;
+#endif
+ while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
+ hpage = khugepaged_alloc_hugepage();
+ if (unlikely(!hpage))
+ break;
+#else
+ if (IS_ERR(hpage)) {
+ khugepaged_alloc_sleep();
+ hpage = NULL;
+ }
+#endif
+
+ khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
+ if (hpage)
+ put_page(hpage);
+#endif
+ try_to_freeze();
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (khugepaged_has_work()) {
+ DEFINE_WAIT(wait);
+ if (!khugepaged_scan_sleep_millisecs)
+ continue;
+ add_wait_queue(&khugepaged_wait, &wait);
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(
+ khugepaged_scan_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+ } else if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait,
+ khugepaged_wait_event());
+ }
+}
+
+static int khugepaged(void *none)
+{
+ struct mm_slot *mm_slot;
+
+ set_freezable();
+ set_user_nice(current, 19);
+
+ /* serialize with start_khugepaged() */
+ mutex_lock(&khugepaged_mutex);
+
+ for (;;) {
+ mutex_unlock(&khugepaged_mutex);
+ BUG_ON(khugepaged_thread != current);
+ khugepaged_loop();
+ BUG_ON(khugepaged_thread != current);
+
+ mutex_lock(&khugepaged_mutex);
+ if (!khugepaged_enabled())
+ break;
+ if (unlikely(kthread_should_stop()))
+ break;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = khugepaged_scan.mm_slot;
+ khugepaged_scan.mm_slot = NULL;
+ if (mm_slot)
+ collect_mm_slot(mm_slot);
+ spin_unlock(&khugepaged_mm_lock);
+
+ khugepaged_thread = NULL;
+ mutex_unlock(&khugepaged_mutex);
+
+ return 0;
+}
+
+void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct page *page;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ return;
+ }
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!page_count(page));
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ split_huge_page(page);
+
+ put_page(page);
+ BUG_ON(pmd_trans_huge(*pmd));
+}
+
+static void split_huge_page_address(struct mm_struct *mm,
+ unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return;
+ /*
+ * Caller holds the mmap_sem write mode, so a huge pmd cannot
+ * materialize from under us.
+ */
+ split_huge_page_pmd(mm, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ /*
+ * If the new start address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (start & ~HPAGE_PMD_MASK &&
+ (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, start);
+
+ /*
+ * If the new end address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (end & ~HPAGE_PMD_MASK &&
+ (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, end);
+
+ /*
+ * If we're also updating the vma->vm_next->vm_start, if the new
+ * vm_next->vm_start isn't page aligned and it could previously
+ * contain an hugepage: check if we need to split an huge pmd.
+ */
+ if (adjust_next > 0) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long nstart = next->vm_start;
+ nstart += adjust_next << PAGE_SHIFT;
+ if (nstart & ~HPAGE_PMD_MASK &&
+ (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+ (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+ split_huge_page_address(next->vm_mm, nstart);
+ }
+}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85855240933d..bb0b7c128015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma)
return 0;
}
-static void clear_gigantic_page(struct page *page,
- unsigned long addr, unsigned long sz)
-{
- int i;
- struct page *p = page;
-
- might_sleep();
- for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
- cond_resched();
- clear_user_highpage(p, addr + i * PAGE_SIZE);
- }
-}
-static void clear_huge_page(struct page *page,
- unsigned long addr, unsigned long sz)
-{
- int i;
-
- if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
- clear_gigantic_page(page, addr, sz);
- return;
- }
-
- might_sleep();
- for (i = 0; i < sz/PAGE_SIZE; i++) {
- cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
- }
-}
-
-static void copy_user_gigantic_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma)
-{
- int i;
- struct hstate *h = hstate_vma(vma);
- struct page *dst_base = dst;
- struct page *src_base = src;
-
- for (i = 0; i < pages_per_huge_page(h); ) {
- cond_resched();
- copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
- }
-}
-
-static void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma)
-{
- int i;
- struct hstate *h = hstate_vma(vma);
-
- if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
- copy_user_gigantic_page(dst, src, addr, vma);
- return;
- }
-
- might_sleep();
- for (i = 0; i < pages_per_huge_page(h); i++) {
- cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
- }
-}
-
static void copy_gigantic_page(struct page *dst, struct page *src)
{
int i;
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
return sprintf(buf, "%lu\n", nr_huge_pages);
}
+
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
err = strict_strtoul(buf, 10, &count);
if (err)
- return 0;
+ goto out;
h = kobj_to_hstate(kobj, &nid);
+ if (h->order >= MAX_ORDER) {
+ err = -EINVAL;
+ goto out;
+ }
+
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
NODEMASK_FREE(nodes_allowed);
return len;
+out:
+ NODEMASK_FREE(nodes_allowed);
+ return err;
}
static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct hstate *h = kobj_to_hstate(kobj, NULL);
return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
+
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
+ if (h->order >= MAX_ORDER)
+ return -EINVAL;
+
err = strict_strtoul(buf, 10, &input);
if (err)
- return 0;
+ return err;
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
{
struct hstate *h = &default_hstate;
unsigned long tmp;
+ int ret;
if (!write)
tmp = h->max_huge_pages;
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
+
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
if (write) {
NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (nodes_allowed != &node_states[N_HIGH_MEMORY])
NODEMASK_FREE(nodes_allowed);
}
-
- return 0;
+out:
+ return ret;
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
{
struct hstate *h = &default_hstate;
unsigned long tmp;
+ int ret;
if (!write)
tmp = h->nr_overcommit_huge_pages;
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
+
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
if (write) {
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
spin_unlock(&hugetlb_lock);
}
-
- return 0;
+out:
+ return ret;
}
#endif /* CONFIG_SYSCTL */
@@ -2454,7 +2414,8 @@ retry_avoidcopy:
return VM_FAULT_OOM;
}
- copy_user_huge_page(new_page, old_page, address, vma);
+ copy_user_huge_page(new_page, old_page, address, vma,
+ pages_per_huge_page(h));
__SetPageUptodate(new_page);
/*
@@ -2558,7 +2519,7 @@ retry:
ret = -PTR_ERR(page);
goto out;
}
- clear_huge_page(page, address, huge_page_size(h));
+ clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
if (vma->vm_flags & VM_MAYSHARE) {
diff --git a/mm/internal.h b/mm/internal.h
index dedb0aff673f..4c98630f0f77 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
extern unsigned long highest_memmap_pfn;
+#ifdef CONFIG_SMP
+extern int putback_active_lru_page(struct zone *zone, struct page *page);
+#else
+static inline int putback_active_lru_page(struct zone *zone, struct page *page)
+{
+ return 0;
+}
+#endif
+
/*
* in mm/vmscan.c:
*/
@@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
}
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long vma_address(struct page *page,
+ struct vm_area_struct *vma);
+#endif
#else /* !CONFIG_MMU */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
@@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas);
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking);
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
diff --git a/mm/ksm.c b/mm/ksm.c
index 43bc893470b4..c2b2a94f9d67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hash.h>
+#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -411,6 +412,20 @@ out:
up_read(&mm->mmap_sem);
}
+static struct page *page_trans_compound_anon(struct page *page)
+{
+ if (PageTransCompound(page)) {
+ struct page *head = compound_trans_head(page);
+ /*
+ * head may actually be splitted and freed from under
+ * us but it's ok here.
+ */
+ if (PageAnon(head))
+ return head;
+ }
+ return NULL;
+}
+
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page)) {
+ if (PageAnon(page) || page_trans_compound_anon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (addr == -EFAULT)
goto out;
+ BUG_ON(PageTransCompound(page));
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
goto out;
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
goto out;
pmd = pmd_offset(pud, addr);
+ BUG_ON(pmd_trans_huge(*pmd));
if (!pmd_present(*pmd))
goto out;
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
put_page(page);
pte_unmap_unlock(ptep, ptl);
@@ -808,6 +827,33 @@ out:
return err;
}
+static int page_trans_compound_anon_split(struct page *page)
+{
+ int ret = 0;
+ struct page *transhuge_head = page_trans_compound_anon(page);
+ if (transhuge_head) {
+ /* Get the reference on the head to split it. */
+ if (get_page_unless_zero(transhuge_head)) {
+ /*
+ * Recheck we got the reference while the head
+ * was still anonymous.
+ */
+ if (PageAnon(transhuge_head))
+ ret = split_huge_page(transhuge_head);
+ else
+ /*
+ * Retry later if split_huge_page run
+ * from under us.
+ */
+ ret = 1;
+ put_page(transhuge_head);
+ } else
+ /* Retry later if split_huge_page run from under us. */
+ ret = 1;
+ }
+ return ret;
+}
+
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+ goto out;
+ BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
+ /*
+ * A number of pages can hang around indefinitely on per-cpu
+ * pagevecs, raised page count preventing write_protect_page
+ * from merging them. Though it doesn't really matter much,
+ * it is puzzling to see some stuck in pages_volatile until
+ * other activity jostles them out, and they also prevented
+ * LTP's KSM test from succeeding deterministically; so drain
+ * them here (here rather than on entry to ksm_do_scan(),
+ * so we don't IPI too often when pages_to_scan is set low).
+ */
+ lru_add_drain_all();
+
root_unstable_tree = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1338,13 @@ next_mm:
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
+ if (IS_ERR_OR_NULL(*page)) {
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ continue;
+ }
+ if (PageAnon(*page) ||
+ page_trans_compound_anon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1358,7 @@ next_mm:
up_read(&mm->mmap_sem);
return rmap_item;
}
- if (!IS_ERR_OR_NULL(*page))
- put_page(*page);
+ put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages)
struct rmap_item *rmap_item;
struct page *uninitialized_var(page);
- while (scan_npages--) {
+ while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void)
static int ksm_scan_thread(void *nothing)
{
+ set_freezable();
set_user_nice(current, 5);
while (!kthread_should_stop()) {
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
+ try_to_freeze();
+
if (ksmd_should_run()) {
schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
- wait_event_interruptible(ksm_thread_wait,
+ wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
}
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
if (error)
goto out;
break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out;
+ break;
}
if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+#endif
return 1;
default:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00bb8a64d028..8ab841031436 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
- struct mm_struct *mm;
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
- return;
}
void mem_cgroup_del_lru(struct page *page)
@@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
@@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
u64 limit;
u64 memsw;
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
- total_swap_pages;
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ limit += total_swap_pages << PAGE_SHIFT;
+
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
@@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
* possibility of race condition. If there is, we take a lock.
*/
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
+ unsigned long uninitialized_var(flags);
if (unlikely(!pc))
return;
@@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
/* pc->mem_cgroup is unstable ? */
if (unlikely(mem_cgroup_stealed(mem))) {
/* take a lock against to access pc->mem_cgroup */
- lock_page_cgroup(pc);
+ move_lock_page_cgroup(pc, &flags);
need_unlock = true;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
goto out;
}
- this_cpu_add(mem->stat->count[idx], val);
-
switch (idx) {
- case MEM_CGROUP_STAT_FILE_MAPPED:
+ case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
}
+ this_cpu_add(mem->stat->count[idx], val);
+
out:
if (unlikely(need_unlock))
- unlock_page_cgroup(pc);
+ move_unlock_page_cgroup(pc, &flags);
rcu_read_unlock();
return;
}
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
- mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask,
+ struct mem_cgroup **memcg, bool oom,
+ int page_size)
{
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem = NULL;
int ret;
- int csize = CHARGE_SIZE;
+ int csize = max(CHARGE_SIZE, (unsigned long) page_size);
/*
* Unlike gloval-vm's OOM-kill, we're not in memory shortage
@@ -1917,7 +1917,7 @@ again:
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
- if (consume_stock(mem))
+ if (page_size == PAGE_SIZE && consume_stock(mem))
goto done;
css_get(&mem->css);
} else {
@@ -1940,7 +1940,7 @@ again:
rcu_read_unlock();
goto done;
}
- if (consume_stock(mem)) {
+ if (page_size == PAGE_SIZE && consume_stock(mem)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
@@ -1981,7 +1981,7 @@ again:
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
- csize = PAGE_SIZE;
+ csize = page_size;
css_put(&mem->css);
mem = NULL;
goto again;
@@ -2002,8 +2002,8 @@ again:
}
} while (ret != CHARGE_OK);
- if (csize > PAGE_SIZE)
- refill_stock(mem, csize - PAGE_SIZE);
+ if (csize > page_size)
+ refill_stock(mem, csize - page_size);
css_put(&mem->css);
done:
*memcg = mem;
@@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
}
}
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+ int page_size)
{
- __mem_cgroup_cancel_charge(mem, 1);
+ __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
}
/*
@@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
* commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
* USED state. If already USED, uncharge and return.
*/
-
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- enum charge_type ctype)
+static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page_cgroup *pc,
+ enum charge_type ctype)
{
- /* try_charge() can return NULL to *memcg, taking care of it. */
- if (!mem)
- return;
-
- lock_page_cgroup(pc);
- if (unlikely(PageCgroupUsed(pc))) {
- unlock_page_cgroup(pc);
- mem_cgroup_cancel_charge(mem);
- return;
- }
-
pc->mem_cgroup = mem;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
}
mem_cgroup_charge_statistics(mem, pc, true);
+}
+
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page_cgroup *pc,
+ enum charge_type ctype,
+ int page_size)
+{
+ int i;
+ int count = page_size >> PAGE_SHIFT;
+
+ /* try_charge() can return NULL to *memcg, taking care of it. */
+ if (!mem)
+ return;
+
+ lock_page_cgroup(pc);
+ if (unlikely(PageCgroupUsed(pc))) {
+ unlock_page_cgroup(pc);
+ mem_cgroup_cancel_charge(mem, page_size);
+ return;
+ }
+
+ /*
+ * we don't need page_cgroup_lock about tail pages, becase they are not
+ * accessed by any other context at this point.
+ */
+ for (i = 0; i < count; i++)
+ ____mem_cgroup_commit_charge(mem, pc + i, ctype);
unlock_page_cgroup(pc);
/*
@@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
mem_cgroup_charge_statistics(from, pc, false);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
- mem_cgroup_cancel_charge(from);
+ mem_cgroup_cancel_charge(from, PAGE_SIZE);
/* caller should have done css_get */
pc->mem_cgroup = to;
@@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
int ret = -EINVAL;
+ unsigned long flags;
+
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+ move_lock_page_cgroup(pc, &flags);
__mem_cgroup_move_account(pc, from, to, uncharge);
+ move_unlock_page_cgroup(pc, &flags);
ret = 0;
}
unlock_page_cgroup(pc);
@@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
goto put;
parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
+ PAGE_SIZE);
if (ret || !parent)
goto put_back;
ret = mem_cgroup_move_account(pc, child, parent, true);
if (ret)
- mem_cgroup_cancel_charge(parent);
+ mem_cgroup_cancel_charge(parent, PAGE_SIZE);
put_back:
putback_lru_page(page);
put:
@@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
struct mem_cgroup *mem = NULL;
struct page_cgroup *pc;
int ret;
+ int page_size = PAGE_SIZE;
+
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
pc = lookup_page_cgroup(page);
/* can happen at boot */
@@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
return 0;
prefetchw(pc);
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
if (ret || !mem)
return ret;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
return 0;
}
@@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- if (PageCompound(page))
- return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
@@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true);
+ return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
}
static void
@@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype);
+ __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
mem_cgroup_lru_add_after_commit_swapcache(page);
/*
* Now swap is on-memory. This means this page may be
@@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, PAGE_SIZE);
}
static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+ int page_size)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
@@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
+ if (page_size != PAGE_SIZE)
+ goto direct_uncharge;
+
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
@@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
batch->memsw_bytes += PAGE_SIZE;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, page_size);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ res_counter_uncharge(&mem->memsw, page_size);
if (unlikely(batch->memcg != mem))
memcg_oom_recover(mem);
return;
@@ -2517,8 +2546,11 @@ direct_uncharge:
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
+ int i;
+ int count;
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ int page_size = PAGE_SIZE;
if (mem_cgroup_disabled())
return NULL;
@@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (PageSwapCache(page))
return NULL;
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
+
+ count = page_size >> PAGE_SHIFT;
/*
* Check if our page_cgroup is valid
*/
@@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(mem, pc, false);
+ for (i = 0; i < count; i++)
+ mem_cgroup_charge_statistics(mem, pc + i, false);
ClearPageCgroupUsed(pc);
/*
@@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
mem_cgroup_get(mem);
}
if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
+ __do_uncharge(mem, ctype, page_size);
return mem;
@@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page,
enum charge_type ctype;
int ret = 0;
+ VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return 0;
@@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page,
return 0;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
css_put(&mem->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
@@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page,
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
return ret;
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage)
+ struct page *oldpage, struct page *newpage, bool migration_ok)
{
struct page *used, *unused;
struct page_cgroup *pc;
@@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
return;
/* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
- /* at migration success, oldpage->mapping is NULL. */
- if (oldpage->mapping) {
+ if (!migration_ok) {
used = oldpage;
unused = newpage;
} else {
@@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
mem->info.nodeinfo[node] = pn;
- memset(pn, 0, sizeof(*pn));
-
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
@@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kmalloc(size, GFP_KERNEL);
+ mem = kzalloc(size, GFP_KERNEL);
else
- mem = vmalloc(size);
+ mem = vzalloc(size);
if (!mem)
return NULL;
- memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!mem->stat)
goto out_free;
@@ -4461,7 +4497,8 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+ PAGE_SIZE);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
@@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
struct vm_area_struct *vma;
- /* We've already held the mmap_sem */
+ down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
+ up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
@@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
- return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
}
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
@@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void)
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
-
mc.moved_swap = 0;
}
- if (mc.mm) {
- up_read(&mc.mm->mmap_sem);
- mmput(mc.mm);
- }
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+
+ /*
+ * we must clear moving_task before waking up waiters at the end of
+ * task migration.
+ */
+ mc.moving_task = NULL;
+ __mem_cgroup_clear_mc();
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
spin_unlock(&mc.lock);
- mc.moving_task = NULL;
- mc.mm = NULL;
mem_cgroup_end_move(from);
- memcg_oom_recover(from);
- memcg_oom_recover(to);
- wake_up_all(&mc.waitq);
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
- /*
- * We do all the move charge works under one mmap_sem to
- * avoid deadlock with down_write(&mmap_sem)
- * -> try_charge() -> if (mc.moving_task) -> sleep.
- */
- down_read(&mm->mmap_sem);
-
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
- VM_BUG_ON(mc.moving_task);
- VM_BUG_ON(mc.mm);
-
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
- mc.precharge = 0;
- mc.moved_charge = 0;
- mc.moved_swap = 0;
spin_unlock(&mc.lock);
- mc.moving_task = current;
- mc.mm = mm;
+ /* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
- /* We call up_read() and mmput() in clear_mc(). */
- } else
- mmput(mm);
+ }
+ mmput(mm);
}
return ret;
}
@@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
spinlock_t *ptl;
retry:
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
@@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
struct vm_area_struct *vma;
lru_add_drain_all();
- /* We've already held the mmap_sem */
+retry:
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ /*
+ * Someone who are holding the mmap_sem might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
*/
break;
}
+ up_read(&mm->mmap_sem);
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct task_struct *p,
bool threadgroup)
{
- if (!mc.mm)
+ struct mm_struct *mm;
+
+ if (!mc.to)
/* no need to move charge */
return;
- mem_cgroup_move_charge(mc.mm);
+ mm = get_task_mm(p);
+ if (mm) {
+ mem_cgroup_move_charge(mm);
+ mmput(mm);
+ }
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46ab2c044b0e..548fbd70f026 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+ si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
/*
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
@@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct task_struct *tsk;
struct anon_vma *av;
+ if (!PageHuge(page) && unlikely(split_huge_page(page)))
+ return;
read_lock(&tasklist_lock);
av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
@@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
static void set_page_hwpoison_huge_page(struct page *hpage)
{
int i;
- int nr_pages = 1 << compound_order(hpage);
+ int nr_pages = 1 << compound_trans_order(hpage);
for (i = 0; i < nr_pages; i++)
SetPageHWPoison(hpage + i);
}
@@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
static void clear_page_hwpoison_huge_page(struct page *hpage)
{
int i;
- int nr_pages = 1 << compound_order(hpage);
+ int nr_pages = 1 << compound_trans_order(hpage);
for (i = 0; i < nr_pages; i++)
ClearPageHWPoison(hpage + i);
}
@@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
- nr_pages = 1 << compound_order(hpage);
+ nr_pages = 1 << compound_trans_order(hpage);
atomic_long_add(nr_pages, &mce_bad_pages);
/*
@@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- nr_pages = 1 << compound_order(page);
+ nr_pages = 1 << compound_trans_order(page);
if (!get_page_unless_zero(page)) {
/*
@@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
/* Keep page count to indicate a given hugepage is isolated. */
list_add(&hpage->lru, &pagelist);
- ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
+ true);
if (ret) {
- putback_lru_pages(&pagelist);
+ putback_lru_pages(&pagelist);
pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
@@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
}
done:
if (!PageHWPoison(hpage))
- atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+ atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
/* keep elevated page count for bad page */
@@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags)
LIST_HEAD(pagelist);
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ 0, true);
if (ret) {
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed13..31250faff390 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long address)
{
pgtable_t new = pte_alloc_one(mm, address);
+ int wait_split_huge_page;
if (!new)
return -ENOMEM;
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
spin_lock(&mm->page_table_lock);
- if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ wait_split_huge_page = 0;
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm->nr_ptes++;
pmd_populate(mm, pmd, new);
new = NULL;
- }
+ } else if (unlikely(pmd_trans_splitting(*pmd)))
+ wait_split_huge_page = 1;
spin_unlock(&mm->page_table_lock);
if (new)
pte_free(mm, new);
+ if (wait_split_huge_page)
+ wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&init_mm.page_table_lock);
- if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- }
+ } else
+ VM_BUG_ON(pmd_trans_splitting(*pmd));
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
return 0;
}
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
{
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*src_pmd)) {
+ int err;
+ VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+ err = copy_huge_pmd(dst_mm, src_mm,
+ dst_pmd, src_pmd, addr, vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next-addr != HPAGE_PMD_SIZE) {
+ VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ } else if (zap_huge_pmd(tlb, vma, pmd)) {
+ (*zap_work)--;
+ continue;
+ }
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd)) {
(*zap_work)--;
continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
pud = pud_offset(pgd, address);
if (pud_none(*pud))
goto no_page_table;
- if (pud_huge(*pud)) {
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
goto no_page_table;
- if (pmd_huge(*pmd)) {
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
+ if (pmd_trans_huge(*pmd)) {
+ if (flags & FOLL_SPLIT) {
+ split_huge_page_pmd(mm, pmd);
+ goto split_fallthrough;
+ }
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ page = follow_trans_huge_pmd(mm, address,
+ pmd, flags);
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+ } else
+ spin_unlock(&mm->page_table_lock);
+ /* fall through */
+ }
+split_fallthrough:
if (unlikely(pmd_bad(*pmd)))
goto no_page_table;
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
*/
mark_page_accessed(page);
}
+ if (flags & FOLL_MLOCK) {
+ /*
+ * The preliminary mapping check is mainly to avoid the
+ * pointless overhead of lock_page on the ZERO_PAGE
+ * which might bounce very badly if there is contention.
+ *
+ * If the page is already locked, we don't need to
+ * handle it now - vmscan will handle it later if and
+ * when it attempts to reclaim the page.
+ */
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain(); /* push cached pages to LRU */
+ /*
+ * Because we lock page here and migration is
+ * blocked by the pte's page reference, we need
+ * only check for file-cache page truncation.
+ */
+ if (page->mapping)
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
unlock:
pte_unmap_unlock(ptep, ptl);
out:
@@ -1341,7 +1412,8 @@ no_page_table:
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int gup_flags,
- struct page **pages, struct vm_area_struct **vmas)
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking)
{
int i;
unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pmd = pmd_offset(pud, pg);
if (pmd_none(*pmd))
return i ? : -EFAULT;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, pg);
if (pte_none(*pte)) {
pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;
+ unsigned int fault_flags = 0;
+
+ if (foll_flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
ret = handle_mm_fault(mm, vma, start,
- (foll_flags & FOLL_WRITE) ?
- FAULT_FLAG_WRITE : 0);
+ fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
else
tsk->min_flt++;
+ if (ret & VM_FAULT_RETRY) {
+ *nonblocking = 0;
+ return i;
+ }
+
/*
* The VM_FAULT_WRITE bit tells us that
* do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
struct page *page;
if (__get_user_pages(current, current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+ NULL) < 1)
return NULL;
flush_cache_page(vma, addr, page_to_pfn(page));
return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
pud_t * pud = pud_alloc(mm, pgd, addr);
if (pud) {
pmd_t * pmd = pmd_alloc(mm, pud, addr);
- if (pmd)
+ if (pmd) {
+ VM_BUG_ON(pmd_trans_huge(*pmd));
return pte_alloc_map_lock(mm, pmd, addr, ptl);
+ }
}
return NULL;
}
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
return same;
}
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
- * servicing faults for write access. In the normal case, do always want
- * pte_mkwrite. But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
- if (likely(vma->vm_flags & VM_WRITE))
- pte = pte_mkwrite(pte);
- return pte;
-}
-
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
/*
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *old_page, *new_page;
pte_t entry;
- int reuse = 0, ret = 0;
+ int ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
@@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
page_cache_release(old_page);
}
- reuse = reuse_swap_page(old_page);
- if (reuse)
+ if (reuse_swap_page(old_page)) {
/*
* The page is all ours. Move it to our anon_vma so
* the rmap code will not search our parent or siblings.
* Protected against the rmap code by the page lock.
*/
page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ goto reuse;
+ }
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
@@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
dirty_page = old_page;
get_page(dirty_page);
- reuse = 1;
- }
- if (reuse) {
reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
ret |= VM_FAULT_WRITE;
- goto unlock;
+
+ if (!dirty_page)
+ return ret;
+
+ /*
+ * Yes, Virginia, this is actually required to prevent a race
+ * with clear_page_dirty_for_io() from clearing the page dirty
+ * bit after it clear all dirty ptes, but before a racing
+ * do_wp_page installs a dirty pte.
+ *
+ * do_no_page is protected similarly.
+ */
+ if (!page_mkwrite) {
+ wait_on_page_locked(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
+ }
+ put_page(dirty_page);
+ if (page_mkwrite) {
+ struct address_space *mapping = dirty_page->mapping;
+
+ set_page_dirty(dirty_page);
+ unlock_page(dirty_page);
+ page_cache_release(dirty_page);
+ if (mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+ }
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
+ return ret;
}
/*
@@ -2337,39 +2448,6 @@ gotten:
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
- if (dirty_page) {
- /*
- * Yes, Virginia, this is actually required to prevent a race
- * with clear_page_dirty_for_io() from clearing the page dirty
- * bit after it clear all dirty ptes, but before a racing
- * do_wp_page installs a dirty pte.
- *
- * do_no_page is protected similarly.
- */
- if (!page_mkwrite) {
- wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
- }
- put_page(dirty_page);
- if (page_mkwrite) {
- struct address_space *mapping = dirty_page->mapping;
-
- set_page_dirty(dirty_page);
- unlock_page(dirty_page);
- page_cache_release(dirty_page);
- if (mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
- }
-
- /* file_update_time outside page_lock */
- if (vma->vm_file)
- file_update_time(vma->vm_file);
- }
return ret;
oom_free_new:
page_cache_release(new_page);
@@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+int handle_pte_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
@@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
- pte = pte_alloc_map(mm, pmd, address);
- if (!pte)
+ if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ if (!vma->vm_ops)
+ return do_huge_pmd_anonymous_page(mm, vma, address,
+ pmd, flags);
+ } else {
+ pmd_t orig_pmd = *pmd;
+ barrier();
+ if (pmd_trans_huge(orig_pmd)) {
+ if (flags & FAULT_FLAG_WRITE &&
+ !pmd_write(orig_pmd) &&
+ !pmd_trans_splitting(orig_pmd))
+ return do_huge_pmd_wp_page(mm, vma, address,
+ pmd, orig_pmd);
+ return 0;
+ }
+ }
+
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if an huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
+ /* if an huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
@@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
vma = find_vma(current->mm, addr);
if (!vma)
return -ENOMEM;
- write = (vma->vm_flags & VM_WRITE) != 0;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
goto out;
pmd = pmd_offset(pud, address);
+ VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
@@ -3608,3 +3723,74 @@ void might_fault(void)
}
EXPORT_SYMBOL(might_fault);
#endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
+void clear_huge_page(struct page *page,
+ unsigned long addr, unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+ unsigned long addr,
+ struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page; ) {
+ cond_resched();
+ copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+ unsigned long addr, struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ copy_user_gigantic_page(dst, src, addr, vma,
+ pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2c6523af5473..e92f04749fcb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res)
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info, struct page *page, int type)
+static void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type)
{
- atomic_set(&page->_mapcount, type);
+ page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
set_page_private(page, info);
atomic_inc(&page->_count);
@@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
* so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
- int type;
+ unsigned long type;
- type = atomic_read(&page->_mapcount);
- BUG_ON(type >= -1);
+ type = (unsigned long) page->lru.next;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (atomic_dec_return(&page->_count) == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
- reset_page_mapcount(page);
+ INIT_LIST_HEAD(&page->lru);
__free_pages_bootmem(page, 0);
}
@@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
goto out;
}
/* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
+ true, true);
if (ret)
putback_lru_pages(&source);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 11ff260fb282..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ split_huge_page_pmd(vma->vm_mm, pmd);
if (pmd_none_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
return PTR_ERR(vma);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, dest, 0);
+ err = migrate_pages(&pagelist, new_node_page, dest,
+ false, true);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma, 0);
+ (unsigned long)vma,
+ false, true);
if (nr_failed)
putback_lru_pages(&pagelist);
}
@@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
/* Find the mm_struct */
rcu_read_lock();
- read_lock(&tasklist_lock);
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- read_unlock(&tasklist_lock);
rcu_read_unlock();
err = -ESRCH;
goto out;
}
mm = get_task_mm(task);
- read_unlock(&tasklist_lock);
rcu_read_unlock();
err = -EINVAL;
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
}
/**
- * alloc_page_vma - Allocate a page for a VMA.
+ * alloc_pages_vma - Allocate a page for a VMA.
*
* @gfp:
* %GFP_USER user allocation.
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* %GFP_FS allocation should not call back into a file system.
* %GFP_ATOMIC don't sleep.
*
+ * @order:Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
*
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* Should be called with the mm_sem of the vma hold.
*/
struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
mpol_cond_put(pol);
- page = alloc_page_interleave(gfp, 0, nid);
+ page = alloc_page_interleave(gfp, order, nid);
put_mems_allowed();
return page;
}
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
/*
* slow path: ref counted shared policy
*/
- struct page *page = __alloc_pages_nodemask(gfp, 0,
+ struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
put_mems_allowed();
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
/*
* fast path: default or task policy
*/
- page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, order, zl,
+ policy_nodemask(gfp, pol));
put_mems_allowed();
return page;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 6ae8a66a7045..46fe8cc13d67 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
goto out;
pmd = pmd_offset(pud, addr);
+ if (pmd_trans_huge(*pmd))
+ goto out;
if (!pmd_present(*pmd))
goto out;
@@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- (struct page *)radix_tree_deref_slot(pslot) != page) {
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- (struct page *)radix_tree_deref_slot(pslot) != page) {
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* to the newly allocated page in newpage.
*/
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, int offlining)
+ struct page *page, int force, bool offlining, bool sync)
{
int rc = 0;
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
int remap_swapcache = 1;
- int rcu_locked = 0;
int charge = 0;
struct mem_cgroup *mem = NULL;
struct anon_vma *anon_vma = NULL;
@@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
/* page was freed from under us. So we are done. */
goto move_newpage;
}
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto move_newpage;
/* prepare cgroup just returns 0 or -ENOMEM */
rc = -EAGAIN;
@@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (!trylock_page(page)) {
if (!force)
goto move_newpage;
+
+ /*
+ * It's not safe for direct compaction to call lock_page.
+ * For example, during page readahead pages are added locked
+ * to the LRU. Later, when the IO completes the pages are
+ * marked uptodate and unlocked. However, the queueing
+ * could be merging multiple pages for one bio (e.g.
+ * mpage_readpages). If an allocation happens for the
+ * second or third page, the process can end up locking
+ * the same page twice and deadlocking. Rather than
+ * trying to be clever about what pages can be locked,
+ * avoid the use of lock_page for direct compaction
+ * altogether.
+ */
+ if (current->flags & PF_MEMALLOC)
+ goto move_newpage;
+
lock_page(page);
}
@@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
BUG_ON(charge);
if (PageWriteback(page)) {
- if (!force)
+ if (!force || !sync)
goto uncharge;
wait_on_page_writeback(page);
}
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
* we cannot notice that anon_vma is freed while we migrates a page.
- * This rcu_read_lock() delays freeing anon_vma pointer until the end
+ * This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*/
if (PageAnon(page)) {
- rcu_read_lock();
- rcu_locked = 1;
-
- /* Determine how to safely use anon_vma */
- if (!page_mapped(page)) {
- if (!PageSwapCache(page))
- goto rcu_unlock;
-
+ /*
+ * Only page_lock_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ */
+ anon_vma = page_lock_anon_vma(page);
+ if (anon_vma) {
+ /*
+ * Take a reference count on the anon_vma if the
+ * page is mapped so that it is guaranteed to
+ * exist when the page is remapped later
+ */
+ get_anon_vma(anon_vma);
+ page_unlock_anon_vma(anon_vma);
+ } else if (PageSwapCache(page)) {
/*
* We cannot be sure that the anon_vma of an unmapped
* swapcache page is safe to use because we don't
@@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
*/
remap_swapcache = 0;
} else {
- /*
- * Take a reference count on the anon_vma if the
- * page is mapped so that it is guaranteed to
- * exist when the page is remapped later
- */
- anon_vma = page_anon_vma(page);
- get_anon_vma(anon_vma);
+ goto uncharge;
}
}
@@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- if (!PageAnon(page) && page_has_private(page)) {
- /*
- * Go direct to try_to_free_buffers() here because
- * a) that's what try_to_release_page() would do anyway
- * b) we may be under rcu_read_lock() here, so we can't
- * use GFP_KERNEL which is what try_to_release_page()
- * needs to be effective.
- */
+ VM_BUG_ON(PageAnon(page));
+ if (page_has_private(page)) {
try_to_free_buffers(page);
- goto rcu_unlock;
+ goto uncharge;
}
goto skip_unmap;
}
@@ -746,17 +761,14 @@ skip_unmap:
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
-rcu_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
drop_anon_vma(anon_vma);
- if (rcu_locked)
- rcu_read_unlock();
uncharge:
if (!charge)
- mem_cgroup_end_migration(mem, page, newpage);
+ mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
@@ -810,12 +822,11 @@ move_newpage:
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
unsigned long private, struct page *hpage,
- int force, int offlining)
+ int force, bool offlining, bool sync)
{
int rc = 0;
int *result = NULL;
struct page *new_hpage = get_new_page(hpage, private, &result);
- int rcu_locked = 0;
struct anon_vma *anon_vma = NULL;
if (!new_hpage)
@@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc = -EAGAIN;
if (!trylock_page(hpage)) {
- if (!force)
+ if (!force || !sync)
goto out;
lock_page(hpage);
}
if (PageAnon(hpage)) {
- rcu_read_lock();
- rcu_locked = 1;
-
- if (page_mapped(hpage)) {
- anon_vma = page_anon_vma(hpage);
- atomic_inc(&anon_vma->external_refcount);
+ anon_vma = page_lock_anon_vma(hpage);
+ if (anon_vma) {
+ get_anon_vma(anon_vma);
+ page_unlock_anon_vma(anon_vma);
}
}
@@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (rc)
remove_migration_ptes(hpage, hpage);
- if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
- &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
-
- if (rcu_locked)
- rcu_read_unlock();
+ if (anon_vma)
+ drop_anon_vma(anon_vma);
out:
unlock_page(hpage);
@@ -892,7 +893,8 @@ out:
* Return: Number of pages not migrated or error code.
*/
int migrate_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private, int offlining)
+ new_page_t get_new_page, unsigned long private, bool offlining,
+ bool sync)
{
int retry = 1;
int nr_failed = 0;
@@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from,
cond_resched();
rc = unmap_and_move(get_new_page, private,
- page, pass > 2, offlining);
+ page, pass > 2, offlining,
+ sync);
switch(rc) {
case -ENOMEM:
@@ -941,7 +944,8 @@ out:
}
int migrate_huge_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private, int offlining)
+ new_page_t get_new_page, unsigned long private, bool offlining,
+ bool sync)
{
int retry = 1;
int nr_failed = 0;
@@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from,
cond_resched();
rc = unmap_and_move_huge_page(get_new_page,
- private, page, pass > 2, offlining);
+ private, page, pass > 2, offlining,
+ sync);
switch(rc) {
case -ENOMEM:
@@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
goto set_status;
- page = follow_page(vma, pp->addr, FOLL_GET);
+ page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1090,7 +1095,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0);
+ (unsigned long)pm, 0, true);
if (err)
putback_lru_pages(&pagelist);
}
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
+ vec += (next - addr) >> PAGE_SHIFT;
+ continue;
+ }
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd))
mincore_unmapped_range(vma, addr, next, vec);
else
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..13e81ee8be9d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
* vma->vm_mm->mmap_sem must be held for at least read.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ int *nonblocking)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start;
- struct page *pages[16]; /* 16 gives a reasonable batch */
int nr_pages = (end - start) / PAGE_SIZE;
- int ret = 0;
int gup_flags;
VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(end > vma->vm_end);
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- gup_flags = FOLL_TOUCH | FOLL_GET;
- if (vma->vm_flags & VM_WRITE)
+ gup_flags = FOLL_TOUCH;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
gup_flags |= FOLL_WRITE;
+ if (vma->vm_flags & VM_LOCKED)
+ gup_flags |= FOLL_MLOCK;
+
/* We don't try to access the guard page of a stack vma */
if (stack_guard_page(vma, start)) {
addr += PAGE_SIZE;
nr_pages--;
}
- while (nr_pages > 0) {
- int i;
-
- cond_resched();
-
- /*
- * get_user_pages makes pages present if we are
- * setting mlock. and this extra reference count will
- * disable migration of this page. However, page may
- * still be truncated out from under us.
- */
- ret = __get_user_pages(current, mm, addr,
- min_t(int, nr_pages, ARRAY_SIZE(pages)),
- gup_flags, pages, NULL);
- /*
- * This can happen for, e.g., VM_NONLINEAR regions before
- * a page has been allocated and mapped at a given offset,
- * or for addresses that map beyond end of a file.
- * We'll mlock the pages if/when they get faulted in.
- */
- if (ret < 0)
- break;
-
- lru_add_drain(); /* push cached pages to LRU */
-
- for (i = 0; i < ret; i++) {
- struct page *page = pages[i];
-
- if (page->mapping) {
- /*
- * That preliminary check is mainly to avoid
- * the pointless overhead of lock_page on the
- * ZERO_PAGE: which might bounce very badly if
- * there is contention. However, we're still
- * dirtying its cacheline with get/put_page:
- * we'll add another __get_user_pages flag to
- * avoid it if that case turns out to matter.
- */
- lock_page(page);
- /*
- * Because we lock page here and migration is
- * blocked by the elevated reference, we need
- * only check for file-cache page truncation.
- */
- if (page->mapping)
- mlock_vma_page(page);
- unlock_page(page);
- }
- put_page(page); /* ref from get_user_pages() */
- }
-
- addr += ret * PAGE_SIZE;
- nr_pages -= ret;
- ret = 0;
- }
-
- return ret; /* 0 or negative error code */
+ return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
}
/*
@@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current))) {
- __mlock_vma_pages_range(vma, start, end);
+ __mlock_vma_pages_range(vma, start, end, NULL);
/* Hide errors from mmap() and other callers */
return 0;
@@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
int ret = 0;
int lock = newflags & VM_LOCKED;
- if (newflags == vma->vm_flags ||
- (vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
goto out; /* don't set VM_LOCKED, don't count */
- if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current)) {
- if (lock)
- make_pages_present(start, end);
- goto out; /* don't set VM_LOCKED, don't count */
- }
-
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +363,10 @@ success:
* set VM_LOCKED, __mlock_vma_pages_range will bring it back.
*/
- if (lock) {
+ if (lock)
vma->vm_flags = newflags;
- ret = __mlock_vma_pages_range(vma, start, end);
- if (ret < 0)
- ret = __mlock_posix_error_return(ret);
- } else {
+ else
munlock_vma_pages_range(vma, start, end);
- }
out:
*prev = vma;
@@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
struct vm_area_struct * vma, * prev;
int error;
- len = PAGE_ALIGN(len);
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
if (end < start)
return -EINVAL;
@@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
+static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ int ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. __mlock_vma_pages_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ ret = __mlock_posix_error_return(ret);
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
@@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
+ if (!error)
+ error = do_mlock_pages(start, len, 0);
return error;
}
@@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
up_write(&current->mm->mmap_sem);
+ if (!ret && (flags & MCL_CURRENT)) {
+ /* Ignore errors */
+ do_mlock_pages(0, TASK_SIZE, 1);
+ }
out:
return ret;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 50a4aa0255a0..2ec8eb5a9cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
+#include <linux/khugepaged.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
down_write(&mm->mmap_sem);
#ifdef CONFIG_COMPAT_BRK
- min_brk = mm->end_code;
+ /*
+ * CONFIG_COMPAT_BRK can still be overridden by setting
+ * randomize_va_space to 2, which will still cause mm->start_brk
+ * to be arbitrarily shifted
+ */
+ if (mm->start_brk > PAGE_ALIGN(mm->end_data))
+ min_brk = mm->start_brk;
+ else
+ min_brk = mm->end_data;
#else
min_brk = mm->start_brk;
#endif
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ vma_adjust_trans_huge(vma, start, end, adjust_next);
+
/*
* When changing only vma->vm_end, we don't really need anon_vma
* lock. This is a fairly rare case by itself, but the anon_vma
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
end, prev->vm_pgoff, NULL);
if (err)
return NULL;
+ khugepaged_enter_vma_merge(prev);
return prev;
}
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
next->vm_pgoff - pglen, NULL);
if (err)
return NULL;
+ khugepaged_enter_vma_merge(area);
return area;
}
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma,
}
}
vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
return error;
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
return young;
}
+int __mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+ int young = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->test_young) {
+ young = mn->ops->test_young(mn, mm, address);
+ if (young)
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return young;
+}
+
void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
pte_t pte)
{
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
- unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
- /*
- * While kswapd is awake, it is considered the zone is under some
- * memory pressure. Under pressure, there is a risk that
- * per-cpu-counter-drift will allow the min watermark to be breached
- * potentially causing a live-lock. While kswapd is awake and
- * free pages are low, get a better estimate for free pages
- */
- if (nr_free_pages < zone->percpu_drift_mark &&
- !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
- return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
- return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c5133873097..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_unmap_unlock(pte - 1, ptl);
}
-static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ else if (change_huge_pmd(vma, pmd, addr, newprot))
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
+ change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
+ dirty_accountable);
} while (pmd++, addr = next, addr != end);
}
-static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
+ change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable);
} while (pud++, addr = next, addr != end);
}
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
+ change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 563fbdd6293a..9925b6391b80 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return NULL;
pmd = pmd_offset(pud, addr);
+ split_huge_page_pmd(mm, pmd);
if (pmd_none_or_clear_bad(pmd))
return NULL;
return pmd;
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
if (!pmd)
return NULL;
- if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
return NULL;
return pmd;
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
diff --git a/mm/nommu.c b/mm/nommu.c
index ef4045d010d5..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas)
+ struct page **pages, struct vm_area_struct **vmas,
+ int *retry)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
}
EXPORT_SYMBOL(get_user_pages);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b5d8a1f820a0..2cb01f6ec5d0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -410,9 +410,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
- unsigned long available_memory = determine_dirtyable_memory();
+ unsigned long uninitialized_var(available_memory);
struct task_struct *tsk;
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = determine_dirtyable_memory();
+
if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
else
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
int __set_page_dirty_no_writeback(struct page *page)
{
if (!PageDirty(page))
- SetPageDirty(page);
+ return !TestSetPageDirty(page);
return 0;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..90c1439549fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
+/* update __split_huge_page_refcount if you change this function */
static int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
- unsigned long buddy_idx = page_idx ^ (1 << order);
-
- return page + (buddy_idx - page_idx);
-}
-
static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
{
- return (page_idx & ~(1 << order));
+ return page_idx ^ (1 << order);
}
/*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
{
unsigned long page_idx;
unsigned long combined_idx;
+ unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
- buddy = __page_find_buddy(page, page_idx, order);
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
- combined_idx = __find_combined_index(page_idx, order);
+ combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
*/
if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
struct page *higher_page, *higher_buddy;
- combined_idx = __find_combined_index(page_idx, order);
- higher_page = page + combined_idx - page_idx;
- higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+ combined_idx = buddy_idx & page_idx;
+ higher_page = page + (combined_idx - page_idx);
+ buddy_idx = __find_buddy_index(combined_idx, order + 1);
+ higher_buddy = page + (buddy_idx - combined_idx);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
- for (i = 0; i < (1 << order); i++) {
- struct page *pg = page + i;
-
- if (PageAnon(pg))
- pg->mapping = NULL;
- bad += free_pages_check(pg);
- }
+ if (PageAnon(page))
+ page->mapping = NULL;
+ for (i = 0; i < (1 << order); i++)
+ bad += free_pages_check(page + i);
if (bad)
return false;
@@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) + 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
#ifdef CONFIG_NUMA
@@ -1793,15 +1805,18 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
{
struct page *page;
if (!order || compaction_deferred(preferred_zone))
return NULL;
+ current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask);
+ nodemask, sync_migration);
+ current->flags &= ~PF_MEMALLOC;
if (*did_some_progress != COMPACT_SKIPPED) {
/* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1852,8 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
{
return NULL;
}
@@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
- struct task_struct *p = current;
bool drained = false;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- p->flags |= PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ current->reclaim_state = &reclaim_state;
*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
- p->reclaim_state = NULL;
+ current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- p->flags &= ~PF_MEMALLOC;
+ current->flags &= ~PF_MEMALLOC;
cond_resched();
@@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
static inline
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
- enum zone_type high_zoneidx)
+ enum zone_type high_zoneidx,
+ enum zone_type classzone_idx)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order);
+ wakeup_kswapd(zone, order, classzone_idx);
}
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- struct task_struct *p = current;
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
const gfp_t wait = gfp_mask & __GFP_WAIT;
@@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
if (!wait) {
- alloc_flags |= ALLOC_HARDER;
+ /*
+ * Not worth trying to allocate harder for
+ * __GFP_NOMEMALLOC even if it can't schedule.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)) && !in_interrupt())
+ } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
if (!in_interrupt() &&
- ((p->flags & PF_MEMALLOC) ||
+ ((current->flags & PF_MEMALLOC) ||
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
@@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- struct task_struct *p = current;
+ bool sync_migration = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;
restart:
- wake_all_kswapd(order, zonelist, high_zoneidx);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2034,21 +2056,26 @@ rebalance:
goto nopage;
/* Avoid recursion of direct reclaim */
- if (p->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
- /* Try direct compaction */
+ /*
+ * Try direct compaction. The first pass is asynchronous. Subsequent
+ * attempts after direct reclaim are synchronous
+ */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
+ migratetype, &did_some_progress,
+ sync_migration);
if (page)
goto got_pg;
+ sync_migration = true;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,13 +2129,27 @@ rebalance:
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
+ } else {
+ /*
+ * High-order allocations do not necessarily loop after
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress,
+ sync_migration);
+ if (page)
+ goto got_pg;
}
nopage:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
printk(KERN_WARNING "%s: page allocation failure."
" order:%d, mode:0x%x\n",
- p->comm, order, gfp_mask);
+ current->comm, order, gfp_mask);
dump_stack();
show_mem();
}
@@ -2442,7 +2483,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone_nr_free_pages(zone)),
+ K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
@@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- if (s)
- return __parse_numa_zonelist_order(s);
- return 0;
+ int ret;
+
+ if (!s)
+ return 0;
+
+ ret = __parse_numa_zonelist_order(s);
+ if (ret == 0)
+ strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+ return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
@@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = {
{1UL << PG_swapcache, "swapcache" },
{1UL << PG_mappedtodisk, "mappedtodisk" },
{1UL << PG_reclaim, "reclaim" },
- {1UL << PG_buddy, "buddy" },
{1UL << PG_swapbacked, "swapbacked" },
{1UL << PG_unevictable, "unevictable" },
#ifdef CONFIG_MMU
@@ -5565,7 +5612,7 @@ void dump_page(struct page *page)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
- page, page_count(page), page_mapcount(page),
+ page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 38cc58b8b2b0..7cfa6ae02303 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ split_huge_page_pmd(walk->mm, pmd);
if (pmd_none_or_clear_bad(pmd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
return NULL;
vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
- pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL);
+ pcpu_nr_groups, pcpu_atom_size);
if (!vms) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..d030548047e2
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,123 @@
+/*
+ * mm/pgtable-generic.c
+ *
+ * Generic pgtable methods declared in asm-generic/pgtable.h
+ *
+ * Copyright (C) 2010 Linus Torvalds
+ */
+
+#include <asm/tlb.h>
+#include <asm-generic/pgtable.h>
+
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+/*
+ * Only sets the access flags (dirty, accessed, and
+ * writable). Furthermore, we know it always gets set to a "more
+ * permissive" setting, which allows most architectures to optimize
+ * this. We return whether the PTE actually changed, which in turn
+ * instructs the caller to do things like update__mmu_cache. This
+ * used to be done in the caller, but sparc needs minor faults to
+ * force that call on sun4c so we changed this macro slightly
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+ if (changed) {
+ set_pte_at(vma->vm_mm, address, ptep, entry);
+ flush_tlb_page(vma, address);
+ }
+ return changed;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int changed = !pmd_same(*pmdp, entry);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (changed) {
+ set_pmd_at(vma->vm_mm, address, pmdp, entry);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+ return changed;
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+ BUG();
+ return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+ BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte;
+ pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
+ flush_tlb_page(vma, address);
+ return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+ BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ pmd_t pmd = pmd_mksplitting(*pmdp);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+ /* tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+ BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index c95d2ba27a0b..f21f4a1d6a1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
list_add(&avc->same_vma, &vma->anon_vma_chain);
anon_vma_lock(anon_vma);
+ /*
+ * It's critical to add new vmas to the tail of the anon_vma,
+ * see comment in huge_memory.c:__split_huge_page().
+ */
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
anon_vma_unlock(anon_vma);
}
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
* Returns virtual address or -EFAULT if page's index/offset is not
* within the range mapped the @vma.
*/
-static inline unsigned long
+inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return NULL;
+ if (pmd_trans_huge(*pmd))
+ return NULL;
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
- pte_t *pte;
- spinlock_t *ptl;
int referenced = 0;
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
- goto out;
-
/*
* Don't want to elevate referenced for mlocked page that gets this far,
* in order that it progresses to try_to_unmap and is moved to the
* unevictable list.
*/
if (vma->vm_flags & VM_LOCKED) {
- *mapcount = 1; /* break early from loop */
+ *mapcount = 0; /* break early from loop */
*vm_flags |= VM_LOCKED;
- goto out_unmap;
- }
-
- if (ptep_clear_flush_young_notify(vma, address, pte)) {
- /*
- * Don't treat a reference through a sequentially read
- * mapping as such. If the page has been used in
- * another mapping, we will catch it; if this other
- * mapping is already gone, the unmap path will have
- * set PG_referenced or activated the page.
- */
- if (likely(!VM_SequentialReadHint(vma)))
- referenced++;
+ goto out;
}
/* Pretend the page is referenced if the task has the
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
rwsem_is_locked(&mm->mmap_sem))
referenced++;
-out_unmap:
+ if (unlikely(PageTransHuge(page))) {
+ pmd_t *pmd;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_FLAG);
+ if (pmd && !pmd_trans_splitting(*pmd) &&
+ pmdp_clear_flush_young_notify(vma, address, pmd))
+ referenced++;
+ spin_unlock(&mm->page_table_lock);
+ } else {
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = page_check_address(page, mm, address, &ptl, 0);
+ if (!pte)
+ goto out;
+
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ /*
+ * Don't treat a reference through a sequentially read
+ * mapping as such. If the page has been used in
+ * another mapping, we will catch it; if this other
+ * mapping is already gone, the unmap path will have
+ * set PG_referenced or activated the page.
+ */
+ if (likely(!VM_SequentialReadHint(vma)))
+ referenced++;
+ }
+ pte_unmap_unlock(pte, ptl);
+ }
+
(*mapcount)--;
- pte_unmap_unlock(pte, ptl);
if (referenced)
*vm_flags |= vma->vm_flags;
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, int exclusive)
{
int first = atomic_inc_and_test(&page->_mapcount);
- if (first)
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (first) {
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __inc_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
+ }
if (unlikely(PageKsm(page)))
return;
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__page_set_anon_rmap(page, vma, address, 1);
if (page_evictable(page, vma))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, 1);
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
}
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page)
return;
if (PageAnon(page)) {
mem_cgroup_uncharge_page(page);
- __dec_zone_page_state(page, NR_ANON_PAGES);
+ if (!PageTransHuge(page))
+ __dec_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __dec_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, -1);
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
/*
* It would be tidy to reset the PageAnon mapping here,
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
-static bool is_vma_temporary_stack(struct vm_area_struct *vma)
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
int ret;
BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
if (unlikely(PageKsm(page)))
ret = try_to_unmap_ksm(page, flags);
diff --git a/mm/slub.c b/mm/slub.c
index 008cd743a36a..c7ef0070dd86 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3636,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf + len, "%7ld ", l->count);
if (l->addr)
- len += sprint_symbol(buf + len, (unsigned long)l->addr);
+ len += sprintf(buf + len, "%pS", (void *)l->addr);
else
len += sprintf(buf + len, "<not-available>");
@@ -3946,12 +3946,9 @@ SLAB_ATTR(min_partial);
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
- if (s->ctor) {
- int n = sprint_symbol(buf, (unsigned long)s->ctor);
-
- return n + sprintf(buf + n, "\n");
- }
- return 0;
+ if (!s->ctor)
+ return 0;
+ return sprintf(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..93250207c5cf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
- int magic;
+ unsigned long magic;
for (i = 0; i < nr_pages; i++, page++) {
- magic = atomic_read(&page->_mapcount);
+ magic = (unsigned long) page->lru.next;
BUG_ON(magic == NODE_INFO);
diff --git a/mm/swap.c b/mm/swap.c
index 3f4854205b16..bbc1ce9f9460 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page)
del_page_from_lru(zone, page);
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
+}
+
+static void __put_single_page(struct page *page)
+{
+ __page_cache_release(page);
free_hot_cold_page(page, 0);
}
-static void put_compound_page(struct page *page)
+static void __put_compound_page(struct page *page)
{
- page = compound_head(page);
- if (put_page_testzero(page)) {
- compound_page_dtor *dtor;
+ compound_page_dtor *dtor;
- dtor = get_compound_page_dtor(page);
- (*dtor)(page);
+ __page_cache_release(page);
+ dtor = get_compound_page_dtor(page);
+ (*dtor)(page);
+}
+
+static void put_compound_page(struct page *page)
+{
+ if (unlikely(PageTail(page))) {
+ /* __split_huge_page_refcount can run under us */
+ struct page *page_head = page->first_page;
+ smp_rmb();
+ /*
+ * If PageTail is still set after smp_rmb() we can be sure
+ * that the page->first_page we read wasn't a dangling pointer.
+ * See __split_huge_page_refcount() smp_wmb().
+ */
+ if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+ unsigned long flags;
+ /*
+ * Verify that our page_head wasn't converted
+ * to a a regular page before we got a
+ * reference on it.
+ */
+ if (unlikely(!PageHead(page_head))) {
+ /* PageHead is cleared after PageTail */
+ smp_rmb();
+ VM_BUG_ON(PageTail(page));
+ goto out_put_head;
+ }
+ /*
+ * Only run compound_lock on a valid PageHead,
+ * after having it pinned with
+ * get_page_unless_zero() above.
+ */
+ smp_mb();
+ /* page_head wasn't a dangling pointer */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
+ VM_BUG_ON(PageHead(page_head));
+ out_put_head:
+ if (put_page_testzero(page_head))
+ __put_single_page(page_head);
+ out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
+ }
+ VM_BUG_ON(page_head != page->first_page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero now that
+ * split_huge_page_refcount is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ atomic_dec(&page->_count);
+ VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+ compound_unlock_irqrestore(page_head, flags);
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* page_head is a dangling pointer */
+ VM_BUG_ON(PageTail(page));
+ goto out_put_single;
+ }
+ } else if (put_page_testzero(page)) {
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
}
}
@@ -75,7 +155,7 @@ void put_page(struct page *page)
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))
- __page_cache_release(page);
+ __put_single_page(page);
}
EXPORT_SYMBOL(put_page);
@@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages)
}
EXPORT_SYMBOL(put_pages_list);
-/*
- * pagevec_move_tail() must be called with IRQ disabled.
- * Otherwise this may cause nasty races.
- */
-static void pagevec_move_tail(struct pagevec *pvec)
+static void pagevec_lru_move_fn(struct pagevec *pvec,
+ void (*move_fn)(struct page *page, void *arg),
+ void *arg)
{
int i;
- int pgmoved = 0;
struct zone *zone = NULL;
+ unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec)
if (pagezone != zone) {
if (zone)
- spin_unlock(&zone->lru_lock);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
zone = pagezone;
- spin_lock(&zone->lru_lock);
- }
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
- list_move_tail(&page->lru, &zone->lru[lru].list);
- pgmoved++;
+ spin_lock_irqsave(&zone->lru_lock, flags);
}
+
+ (*move_fn)(page, arg);
}
if (zone)
- spin_unlock(&zone->lru_lock);
- __count_vm_events(PGROTATED, pgmoved);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
pagevec_reinit(pvec);
}
+static void pagevec_move_tail_fn(struct page *page, void *arg)
+{
+ int *pgmoved = arg;
+ struct zone *zone = page_zone(page);
+
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ int lru = page_lru_base_type(page);
+ list_move_tail(&page->lru, &zone->lru[lru].list);
+ (*pgmoved)++;
+ }
+}
+
+/*
+ * pagevec_move_tail() must be called with IRQ disabled.
+ * Otherwise this may cause nasty races.
+ */
+static void pagevec_move_tail(struct pagevec *pvec)
+{
+ int pgmoved = 0;
+
+ pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
+ __count_vm_events(PGROTATED, pgmoved);
+}
+
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list.
*/
-void rotate_reclaimable_page(struct page *page)
+void rotate_reclaimable_page(struct page *page)
{
if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
!PageUnevictable(page) && PageLRU(page)) {
@@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
}
/*
- * FIXME: speed this up?
+ * A page will go to active list either by activate_page or putback_lru_page.
+ * In the activate_page case, the page hasn't active bit set. The page might
+ * not in LRU list because it's isolated before it gets a chance to be moved to
+ * active list. The window is small because pagevec just stores several pages.
+ * For such case, we do nothing for such page.
+ * In the putback_lru_page case, the page isn't in lru list but has active
+ * bit set
*/
-void activate_page(struct page *page)
+static void __activate_page(struct page *page, void *arg)
{
struct zone *zone = page_zone(page);
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+ bool putback = !PageLRU(page);
- spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_cache(page);
- int lru = page_lru_base_type(page);
+ /* The page is isolated before it's moved to active list */
+ if (!PageLRU(page) && !PageActive(page))
+ return;
+ if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page))
+ return;
+
+ if (!putback)
del_page_from_lru_list(zone, page, lru);
+ else
+ SetPageLRU(page);
- SetPageActive(page);
- lru += LRU_ACTIVE;
- add_page_to_lru_list(zone, page, lru);
- __count_vm_event(PGACTIVATE);
+ SetPageActive(page);
+ lru += LRU_ACTIVE;
+ add_page_to_lru_list(zone, page, lru);
- update_page_reclaim_stat(zone, page, file, 1);
+ if (putback)
+ return;
+ __count_vm_event(PGACTIVATE);
+ update_page_reclaim_stat(zone, page, file, 1);
+}
+
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+
+static void activate_page_drain(int cpu)
+{
+ struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
+
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+}
+
+void activate_page(struct page *page)
+{
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ put_cpu_var(activate_page_pvecs);
+ }
+}
+
+/* Caller should hold zone->lru_lock */
+int putback_active_lru_page(struct zone *zone, struct page *page)
+{
+ struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+
+ if (!pagevec_add(pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ spin_lock_irq(&zone->lru_lock);
}
+ put_cpu_var(activate_page_pvecs);
+ return 1;
+}
+
+#else
+static inline void activate_page_drain(int cpu)
+{
+}
+
+void activate_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page))
+ __activate_page(page, NULL);
spin_unlock_irq(&zone->lru_lock);
}
+#endif
/*
* Mark a page as having seen activity.
@@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu)
pagevec_move_tail(pvec);
local_irq_restore(flags);
}
+ activate_page_drain(cpu);
}
void lru_add_drain(void)
@@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_release);
+/* used by __split_huge_page_refcount() */
+void lru_add_page_tail(struct zone* zone,
+ struct page *page, struct page *page_tail)
+{
+ int active;
+ enum lru_list lru;
+ const int file = 0;
+ struct list_head *head;
+
+ VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON(PageCompound(page_tail));
+ VM_BUG_ON(PageLRU(page_tail));
+ VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
+
+ SetPageLRU(page_tail);
+
+ if (page_evictable(page_tail, NULL)) {
+ if (PageActive(page)) {
+ SetPageActive(page_tail);
+ active = 1;
+ lru = LRU_ACTIVE_ANON;
+ } else {
+ active = 0;
+ lru = LRU_INACTIVE_ANON;
+ }
+ update_page_reclaim_stat(zone, page_tail, file, active);
+ if (likely(PageLRU(page)))
+ head = page->lru.prev;
+ else
+ head = &zone->lru[lru].list;
+ __add_page_to_lru_list(zone, page_tail, lru, head);
+ } else {
+ SetPageUnevictable(page_tail);
+ add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
+ }
+}
+
+static void ____pagevec_lru_add_fn(struct page *page, void *arg)
+{
+ enum lru_list lru = (enum lru_list)arg;
+ struct zone *zone = page_zone(page);
+ int file = is_file_lru(lru);
+ int active = is_active_lru(lru);
+
+ VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(PageUnevictable(page));
+ VM_BUG_ON(PageLRU(page));
+
+ SetPageLRU(page);
+ if (active)
+ SetPageActive(page);
+ update_page_reclaim_stat(zone, page, file, active);
+ add_page_to_lru_list(zone, page, lru);
+}
+
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
- int i;
- struct zone *zone = NULL;
-
VM_BUG_ON(is_unevictable_lru(lru));
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);
- int file;
- int active;
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- VM_BUG_ON(PageActive(page));
- VM_BUG_ON(PageUnevictable(page));
- VM_BUG_ON(PageLRU(page));
- SetPageLRU(page);
- active = is_active_lru(lru);
- file = is_file_lru(lru);
- if (active)
- SetPageActive(page);
- update_page_reclaim_stat(zone, page, file, active);
- add_page_to_lru_list(zone, page, lru);
- }
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
- pagevec_reinit(pvec);
+ pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
}
EXPORT_SYMBOL(____pagevec_lru_add);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..5c8cfabbc9bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page)
if (!entry.val)
return 0;
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page))) {
+ swapcache_free(entry, NULL);
+ return 0;
+ }
+
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b6adcfbf6f48..07a458d72fa8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (unlikely(pmd_trans_huge(*pmd)))
+ continue;
if (pmd_none_or_clear_bad(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index eb5cc7d00c5a..cac13b415635 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask);
- if (unlikely(IS_ERR(va))) {
+ if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
}
@@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
-1, GFP_KERNEL, caller);
}
-struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
- int node, gfp_t gfp_mask)
-{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
- node, gfp_mask, __builtin_return_address(0));
-}
-
static struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
@@ -1537,25 +1530,12 @@ fail:
return NULL;
}
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
-{
- void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
- __builtin_return_address(0));
-
- /*
- * A ref_count = 3 is needed because the vm_struct and vmap_area
- * structures allocated in the __get_vm_area_node() function contain
- * references to the virtual address of the vmalloc'ed block.
- */
- kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
-
- return addr;
-}
-
/**
- * __vmalloc_node - allocate virtually contiguous memory
+ * __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
@@ -1565,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, int node, void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1577,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
return NULL;
- area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
- VMALLOC_END, node, gfp_mask, caller);
+ area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
+ gfp_mask, caller);
if (!area)
return NULL;
@@ -1595,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
return addr;
}
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or -1
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ */
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, void *caller)
+{
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp_mask, prot, node, caller);
+}
+
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -2203,17 +2204,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* @sizes: array containing size of each area
* @nr_vms: the number of areas to allocate
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
- * @gfp_mask: allocation mask
*
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
* vm_structs on success, %NULL on failure
*
* Percpu allocator wants to use congruent vm areas so that it can
* maintain the offsets among percpu areas. This function allocates
- * congruent vmalloc areas for it. These areas tend to be scattered
- * pretty far, distance between two areas easily going up to
- * gigabytes. To avoid interacting with regular vmallocs, these areas
- * are allocated from top.
+ * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes. To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
*
* Despite its complicated look, this allocator is rather simple. It
* does everything top-down and scans areas from the end looking for
@@ -2224,7 +2224,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
- size_t align, gfp_t gfp_mask)
+ size_t align)
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2234,8 +2234,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
unsigned long base, start, end, last_end;
bool purged = false;
- gfp_mask &= GFP_RECLAIM_MASK;
-
/* verify parameters and allocate data structures */
BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2268,14 +2266,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
return NULL;
}
- vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
- vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
+ vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
+ vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
if (!vas || !vms)
goto err_free;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
- vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
+ vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
}
@@ -2456,13 +2454,8 @@ static int s_show(struct seq_file *m, void *p)
seq_printf(m, "0x%p-0x%p %7ld",
v->addr, v->addr + v->size, v->size);
- if (v->caller) {
- char buff[KSYM_SYMBOL_LEN];
-
- seq_putc(m, ' ');
- sprint_symbol(buff, (unsigned long)v->caller);
- seq_puts(m, buff);
- }
+ if (v->caller)
+ seq_printf(m, " %pS", v->caller);
if (v->nr_pages)
seq_printf(m, " pages=%d", v->nr_pages);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..99999a9b2b0b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
@@ -40,6 +41,7 @@
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/compaction.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -51,11 +53,23 @@
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
-enum lumpy_mode {
- LUMPY_MODE_NONE,
- LUMPY_MODE_ASYNC,
- LUMPY_MODE_SYNC,
-};
+/*
+ * reclaim_mode determines how the inactive list is shrunk
+ * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
+ * RECLAIM_MODE_ASYNC: Do not block
+ * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
+ * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
+ * page from the LRU and reclaim all pages within a
+ * naturally aligned range
+ * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ * order-0 pages and then compact the zone
+ */
+typedef unsigned __bitwise__ reclaim_mode_t;
+#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
+#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
+#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
+#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
+#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
@@ -88,7 +102,7 @@ struct scan_control {
* Intend to reclaim enough continuous memory rather than reclaim
* enough amount of memory. i.e, mode for high order allocation.
*/
- enum lumpy_mode lumpy_reclaim_mode;
+ reclaim_mode_t reclaim_mode;
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
return ret;
}
-static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
+static void set_reclaim_mode(int priority, struct scan_control *sc,
bool sync)
{
- enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
+ reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
/*
- * Some reclaim have alredy been failed. No worth to try synchronous
- * lumpy reclaim.
+ * Initially assume we are entering either lumpy reclaim or
+ * reclaim/compaction.Depending on the order, we will either set the
+ * sync mode or just reclaim order-0 pages later.
*/
- if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
- return;
+ if (COMPACTION_BUILD)
+ sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
+ else
+ sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
/*
- * If we need a large contiguous chunk of memory, or have
- * trouble getting a small set of contiguous pages, we
- * will reclaim both active and inactive pages.
+ * Avoid using lumpy reclaim or reclaim/compaction if possible by
+ * restricting when its set to either costly allocations or when
+ * under memory pressure
*/
if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- sc->lumpy_reclaim_mode = mode;
+ sc->reclaim_mode |= syncmode;
else if (sc->order && priority < DEF_PRIORITY - 2)
- sc->lumpy_reclaim_mode = mode;
+ sc->reclaim_mode |= syncmode;
else
- sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+ sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}
-static void disable_lumpy_reclaim_mode(struct scan_control *sc)
+static void reset_reclaim_mode(struct scan_control *sc)
{
- sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+ sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}
static inline int is_page_cache_freeable(struct page *page)
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* first attempt to free a range of pages fails.
*/
if (PageWriteback(page) &&
- sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
+ (sc->reclaim_mode & RECLAIM_MODE_SYNC))
wait_on_page_writeback(page);
if (!PageWriteback(page)) {
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page,
- trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
+ trace_reclaim_flags(page, sc->reclaim_mode));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page,
referenced_page = TestClearPageReferenced(page);
/* Lumpy reclaim - ignore references */
- if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
return PAGEREF_RECLAIM;
/*
@@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* for any page for which writeback has already
* started.
*/
- if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
+ if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
may_enter_fs)
wait_on_page_writeback(page);
else {
@@ -895,7 +912,7 @@ cull_mlocked:
try_to_free_swap(page);
unlock_page(page);
putback_lru_page(page);
- disable_lumpy_reclaim_mode(sc);
+ reset_reclaim_mode(sc);
continue;
activate_locked:
@@ -908,7 +925,7 @@ activate_locked:
keep_locked:
unlock_page(page);
keep:
- disable_lumpy_reclaim_mode(sc);
+ reset_reclaim_mode(sc);
keep_lumpy:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
@@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
nr_lumpy_taken++;
if (PageDirty(cursor_page))
nr_lumpy_dirty++;
@@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
struct page *page;
list_for_each_entry(page, page_list, lru) {
+ int numpages = hpage_nr_pages(page);
lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
- nr_active++;
+ nr_active += numpages;
}
if (count)
- count[lru]++;
+ count[lru] += numpages;
}
return nr_active;
@@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
spin_lock_irq(&zone->lru_lock);
continue;
}
- SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
+ int numpages = hpage_nr_pages(page);
+ reclaim_stat->recent_rotated[file] += numpages;
+ if (putback_active_lru_page(zone, page))
+ continue;
}
+ SetPageLRU(page);
+ add_page_to_lru_list(zone, page, lru);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
@@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
return false;
/* Only stall on lumpy reclaim */
- if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
+ if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
return false;
/* If we have relaimed everything on the isolated list, no stall */
@@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
return SWAP_CLUSTER_MAX;
}
- set_lumpy_reclaim_mode(priority, sc, false);
+ set_reclaim_mode(priority, sc, false);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(nr_to_scan,
&page_list, &nr_scanned, sc->order,
- sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
- ISOLATE_INACTIVE : ISOLATE_BOTH,
+ sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
zone, 0, file);
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
@@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
} else {
nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
&page_list, &nr_scanned, sc->order,
- sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
- ISOLATE_INACTIVE : ISOLATE_BOTH,
+ sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
zone, sc->mem_cgroup,
0, file);
/*
@@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
/* Check if we should syncronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
- set_lumpy_reclaim_mode(priority, sc, true);
+ set_reclaim_mode(priority, sc, true);
nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
@@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
zone_idx(zone),
nr_scanned, nr_reclaimed,
priority,
- trace_shrink_flags(file, sc->lumpy_reclaim_mode));
+ trace_shrink_flags(file, sc->reclaim_mode));
return nr_reclaimed;
}
@@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone,
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
- pgmoved++;
+ pgmoved += hpage_nr_pages(page);
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
- nr_rotated++;
+ nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -1805,6 +1826,57 @@ out:
}
/*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely successful
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+ unsigned long nr_reclaimed,
+ unsigned long nr_scanned,
+ struct scan_control *sc)
+{
+ unsigned long pages_for_compaction;
+ unsigned long inactive_lru_pages;
+
+ /* If not in reclaim/compaction mode, stop */
+ if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+ return false;
+
+ /*
+ * If we failed to reclaim and have scanned the full list, stop.
+ * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+ * faster but obviously would be less likely to succeed
+ * allocation. If this is desirable, use GFP_REPEAT to decide
+ * if both reclaimed and scanned should be checked or just
+ * reclaimed
+ */
+ if (!nr_reclaimed && !nr_scanned)
+ return false;
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = (2UL << sc->order);
+ inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ if (sc->nr_reclaimed < pages_for_compaction &&
+ inactive_lru_pages > pages_for_compaction)
+ return true;
+
+ /* If compaction would go ahead or the allocation would succeed, stop */
+ switch (compaction_suitable(zone, sc->order)) {
+ case COMPACT_PARTIAL:
+ case COMPACT_CONTINUE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_zone(int priority, struct zone *zone,
@@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone,
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list l;
- unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long nr_reclaimed;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ unsigned long nr_scanned = sc->nr_scanned;
+restart:
+ nr_reclaimed = 0;
get_scan_count(zone, sc, nr, priority);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone,
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
-
- sc->nr_reclaimed = nr_reclaimed;
+ sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
@@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone,
if (inactive_anon_is_low(zone, sc))
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+ /* reclaim/compaction might need reclaim to continue */
+ if (should_continue_reclaim(zone, nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc))
+ goto restart;
+
throttle_vm_writeout(sc->gfp_mask);
}
@@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
}
#endif
+/*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+ * by the callers classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the zones allowed by classzone_idx
+ * for the node to be considered balanced. Forcing all zones to be balanced
+ * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * The choice of 25% is due to
+ * o a 16M DMA zone that is balanced will not balance a zone on any
+ * reasonable sized machine
+ * o On all other machines, the top zone must be at least a reasonable
+ * precentage of the middle zones. For example, on 32-bit x86, highmem
+ * would need to be at least 256M for it to be balance a whole node.
+ * Similarly, on x86-64 the Normal zone would need to be at least 1G
+ * to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
+ int classzone_idx)
+{
+ unsigned long present_pages = 0;
+ int i;
+
+ for (i = 0; i <= classzone_idx; i++)
+ present_pages += pgdat->node_zones[i].present_pages;
+
+ return balanced_pages > (present_pages >> 2);
+}
+
/* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+ int classzone_idx)
{
int i;
+ unsigned long balanced = 0;
+ bool all_zones_ok = true;
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return 1;
+ return true;
- /* If after HZ/10, a zone is below the high mark, it's premature */
+ /* Check the watermark levels */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable)
+ /*
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well if kswapd
+ * is to sleep
+ */
+ if (zone->all_unreclaimable) {
+ balanced += zone->present_pages;
continue;
+ }
- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
- 0, 0))
- return 1;
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+ classzone_idx, 0))
+ all_zones_ok = false;
+ else
+ balanced += zone->present_pages;
}
- return 0;
+ /*
+ * For high-order requests, the balanced zones must contain at least
+ * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
+ * must be balanced
+ */
+ if (order)
+ return pgdat_balanced(pgdat, balanced, classzone_idx);
+ else
+ return !all_zones_ok;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+ int *classzone_idx)
{
int all_zones_ok;
+ unsigned long balanced;
int priority;
int i;
+ int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
@@ -2199,7 +2330,6 @@ loop_again:
count_vm_event(PAGEOUTRUN);
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
@@ -2208,6 +2338,7 @@ loop_again:
disable_swap_token();
all_zones_ok = 1;
+ balanced = 0;
/*
* Scan in the highmem->dma direction for the highest
@@ -2230,9 +2361,10 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
+ *classzone_idx = i;
break;
}
}
@@ -2255,6 +2387,7 @@ loop_again:
* cause too much scanning of the lower zones.
*/
for (i = 0; i <= end_zone; i++) {
+ int compaction;
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
@@ -2276,7 +2409,7 @@ loop_again:
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2284,9 +2417,26 @@ loop_again:
lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
+
+ compaction = 0;
+ if (order &&
+ zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone),
+ end_zone, 0) &&
+ !zone_watermark_ok(zone, order,
+ high_wmark_pages(zone),
+ end_zone, 0)) {
+ compact_zone_order(zone,
+ order,
+ sc.gfp_mask, false,
+ COMPACT_MODE_KSWAPD);
+ compaction = 1;
+ }
+
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 && !zone_reclaimable(zone))
+ if (!compaction && nr_slab == 0 &&
+ !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -2297,7 +2447,7 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
@@ -2305,7 +2455,7 @@ loop_again:
* means that we have a GFP_ATOMIC allocation
* failure risk. Hurry up!
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
} else {
@@ -2317,10 +2467,12 @@ loop_again:
* spectulatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
+ if (i <= *classzone_idx)
+ balanced += zone->present_pages;
}
}
- if (all_zones_ok)
+ if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
@@ -2343,7 +2495,13 @@ loop_again:
break;
}
out:
- if (!all_zones_ok) {
+
+ /*
+ * order-0: All zones must meet high watermark for a balanced node
+ * high-order: Balanced zones must make up at least 25% of the node
+ * for the node to be balanced
+ */
+ if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
cond_resched();
try_to_freeze();
@@ -2368,7 +2526,88 @@ out:
goto loop_again;
}
- return sc.nr_reclaimed;
+ /*
+ * If kswapd was reclaiming at a higher order, it has the option of
+ * sleeping without all zones being balanced. Before it does, it must
+ * ensure that the watermarks for order-0 on *all* zones are met and
+ * that the congestion flags are cleared. The congestion flag must
+ * be cleared as kswapd is the only mechanism that clears the flag
+ * and it is potentially going to sleep here.
+ */
+ if (order) {
+ for (i = 0; i <= end_zone; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ continue;
+
+ /* Confirm the zone is balanced for order-0 */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone), 0, 0)) {
+ order = sc.order = 0;
+ goto loop_again;
+ }
+
+ /* If balanced, clear the congested flag */
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ }
+ }
+
+ /*
+ * Return the order we were reclaiming at so sleeping_prematurely()
+ * makes a decision on the order we were last reclaiming at. However,
+ * if another caller entered the allocator slow path while kswapd
+ * was awake, order will remain at the higher level
+ */
+ *classzone_idx = end_zone;
+ return order;
+}
+
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ long remaining = 0;
+ DEFINE_WAIT(wait);
+
+ if (freezing(current) || kthread_should_stop())
+ return;
+
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+ /* Try to sleep for a short interval */
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ remaining = schedule_timeout(HZ/10);
+ finish_wait(&pgdat->kswapd_wait, &wait);
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ }
+
+ /*
+ * After a short sleep, check if it was a premature sleep. If not, then
+ * go fully to sleep until explicitly woken up.
+ */
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+ /*
+ * vmstat counters are not perfectly accurate and the estimated
+ * value for counters such as NR_FREE_PAGES can deviate from the
+ * true value by nr_online_cpus * threshold. To avoid the zone
+ * watermarks being breached while under pressure, we reduce the
+ * per-cpu vmstat threshold while kswapd is awake and restore
+ * them before going back to sleep.
+ */
+ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+ schedule();
+ set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+ } else {
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+ else
+ count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+ }
+ finish_wait(&pgdat->kswapd_wait, &wait);
}
/*
@@ -2387,9 +2626,10 @@ out:
static int kswapd(void *p)
{
unsigned long order;
+ int classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
+
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
@@ -2417,49 +2657,30 @@ static int kswapd(void *p)
set_freezable();
order = 0;
+ classzone_idx = MAX_NR_ZONES - 1;
for ( ; ; ) {
unsigned long new_order;
+ int new_classzone_idx;
int ret;
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
- if (order < new_order) {
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
- * allocation
+ * allocation or has tigher zone constraints
*/
order = new_order;
+ classzone_idx = new_classzone_idx;
} else {
- if (!freezing(current) && !kthread_should_stop()) {
- long remaining = 0;
-
- /* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
- remaining = schedule_timeout(HZ/10);
- finish_wait(&pgdat->kswapd_wait, &wait);
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- }
-
- /*
- * After a short sleep, check if it was a
- * premature sleep. If not, then go fully
- * to sleep until explicitly woken up
- */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
- trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
- schedule();
- } else {
- if (remaining)
- count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
- else
- count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
- }
- }
-
+ kswapd_try_to_sleep(pgdat, order, classzone_idx);
order = pgdat->kswapd_max_order;
+ classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
}
- finish_wait(&pgdat->kswapd_wait, &wait);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -2471,7 +2692,7 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balance_pgdat(pgdat, order);
+ order = balance_pgdat(pgdat, order, &classzone_idx);
}
}
return 0;
@@ -2480,23 +2701,26 @@ static int kswapd(void *p)
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
- return;
- if (pgdat->kswapd_max_order < order)
- pgdat->kswapd_max_order = order;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
+ pgdat = zone->zone_pgdat;
+ if (pgdat->kswapd_max_order < order) {
+ pgdat->kswapd_max_order = order;
+ pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+ }
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+ return;
+
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 312d728976f1..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
-static int calculate_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
+
+int calculate_normal_threshold(struct zone *zone)
{
int threshold;
int mem; /* memory in 128 MB units */
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
for_each_populated_zone(zone) {
unsigned long max_drift, tolerate_drift;
- threshold = calculate_threshold(zone);
+ threshold = calculate_normal_threshold(zone);
for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,6 +185,26 @@ static void refresh_zone_stat_thresholds(void)
}
}
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *))
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = (*calculate_pressure)(zone);
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+}
+
/*
* For use when we know that interrupts are disabled.
*/
@@ -836,6 +880,7 @@ static const char * const vmstat_text[] = {
"numa_local",
"numa_other",
#endif
+ "nr_anon_transparent_hugepages",
"nr_dirty_threshold",
"nr_dirty_background_threshold",
@@ -911,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
- zone_nr_free_pages(zone),
+ zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7f686251f711..f29abeb6a912 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -104,8 +104,26 @@ static pfn_t fault_pfn;
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
- struct page *page = compound_head(pfn_to_page(pfn));
- return PageReserved(page);
+ int reserved;
+ struct page *tail = pfn_to_page(pfn);
+ struct page *head = compound_trans_head(tail);
+ reserved = PageReserved(head);
+ if (head != tail) {
+ /*
+ * "head" is not a dangling pointer
+ * (compound_trans_head takes care of that)
+ * but the hugepage may have been splitted
+ * from under us (and we may not hold a
+ * reference count on the head page so it can
+ * be reused before we run PageReferenced), so
+ * we've to check PageTail before returning
+ * what we just read.
+ */
+ smp_rmb();
+ if (PageTail(tail))
+ return reserved;
+ }
+ return PageReserved(tail);
}
return true;
@@ -352,6 +370,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
return young;
}
+static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int young, idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ young = kvm_test_age_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return young;
+}
+
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
@@ -368,6 +402,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+ .test_young = kvm_mmu_notifier_test_young,
.change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,
};