author     Linus Torvalds <torvalds@linux-foundation.org>   2017-02-22 19:29:24 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-02-22 19:29:24 -0800
commit     bc49a7831b1137ce1c2dda1c57e3631655f5d2ae (patch)
tree       469380ac3a17e1d927ccf06abc99b6f509deb24a
parent     be5165a51d2500ae1afa1236a8b09858831fdf7e (diff)
parent     f201ebd87652cf1519792f8662bb3f862c76aa33 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
"142 patches:
- DAX updates
- various misc bits
- OCFS2 updates
- most of MM"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (142 commits)
mm/z3fold.c: limit first_num to the actual range of possible buddy indexes
mm: fix <linux/pagemap.h> stray kernel-doc notation
zram: remove obsolete sysfs attrs
mm/memblock.c: remove unnecessary log and clean up
oom-reaper: use madvise_dontneed() logic to decide if unmap the VMA
mm: drop unused argument of zap_page_range()
mm: drop zap_details::check_swap_entries
mm: drop zap_details::ignore_dirty
mm, page_alloc: warn_alloc nodemask is NULL when cpusets are disabled
mm: help __GFP_NOFAIL allocations which do not trigger OOM killer
mm, oom: do not enforce OOM killer for __GFP_NOFAIL automatically
mm: consolidate GFP_NOFAIL checks in the allocator slowpath
lib/show_mem.c: teach show_mem to work with the given nodemask
arch, mm: remove arch specific show_mem
mm, page_alloc: warn_alloc print nodemask
mm, page_alloc: do not report all nodes in show_mem
Revert "mm: bail out in shrink_inactive_list()"
mm, vmscan: consider eligible zones in get_scan_count
mm, vmscan: cleanup lru size claculations
mm, vmscan: do not count freed pages as PGDEACTIVATE
...
124 files changed, 4558 insertions, 1723 deletions
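
The zram patches in this series complete the deprecation spelled out in the ABI documents below: the per-attribute statistics nodes are removed and mem_limit/mem_used_max become write-only. As a point of reference, here is a minimal user-space sketch of driving the two write-only nodes after this change; the device id (zram0) and the 64M limit are illustrative assumptions, not part of the patch:

    /*
     * Hedged sketch: poke the write-only zram attributes described below.
     * "zram0" and the 64M limit are assumptions chosen for illustration.
     */
    #include <stdio.h>

    static int write_attr(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f);
    }

    int main(void)
    {
            /* mem_limit: "0" disables the limit; size strings such as "64M" set one. */
            if (write_attr("/sys/block/zram0/mem_limit", "64M"))
                    perror("mem_limit");
            /* mem_used_max: only "0" is accepted; it resets the peak-usage counter. */
            if (write_attr("/sys/block/zram0/mem_used_max", "0"))
                    perror("mem_used_max");
            return 0;
    }

The current values of both are read back from zram<id>/mm_stat, as the updated documentation further down notes.
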
diff --git a/Documentation/ABI/obsolete/sysfs-block-zram b/Documentation/ABI/obsolete/sysfs-block-zram deleted file mode 100644 index 720ea92cfb2e..000000000000 --- a/Documentation/ABI/obsolete/sysfs-block-zram +++ /dev/null @@ -1,119 +0,0 @@ -What: /sys/block/zram<id>/num_reads -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The num_reads file is read-only and specifies the number of - reads (failed or successful) done on this device. - Now accessible via zram<id>/stat node. - -What: /sys/block/zram<id>/num_writes -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The num_writes file is read-only and specifies the number of - writes (failed or successful) done on this device. - Now accessible via zram<id>/stat node. - -What: /sys/block/zram<id>/invalid_io -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The invalid_io file is read-only and specifies the number of - non-page-size-aligned I/O requests issued to this device. - Now accessible via zram<id>/io_stat node. - -What: /sys/block/zram<id>/failed_reads -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The failed_reads file is read-only and specifies the number of - failed reads happened on this device. - Now accessible via zram<id>/io_stat node. - -What: /sys/block/zram<id>/failed_writes -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The failed_writes file is read-only and specifies the number of - failed writes happened on this device. - Now accessible via zram<id>/io_stat node. - -What: /sys/block/zram<id>/notify_free -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The notify_free file is read-only. Depending on device usage - scenario it may account a) the number of pages freed because - of swap slot free notifications or b) the number of pages freed - because of REQ_DISCARD requests sent by bio. The former ones - are sent to a swap block device when a swap slot is freed, which - implies that this disk is being used as a swap disk. The latter - ones are sent by filesystem mounted with discard option, - whenever some data blocks are getting discarded. - Now accessible via zram<id>/io_stat node. - -What: /sys/block/zram<id>/zero_pages -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The zero_pages file is read-only and specifies number of zero - filled pages written to this disk. No memory is allocated for - such pages. - Now accessible via zram<id>/mm_stat node. - -What: /sys/block/zram<id>/orig_data_size -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The orig_data_size file is read-only and specifies uncompressed - size of data stored in this disk. This excludes zero-filled - pages (zero_pages) since no memory is allocated for them. - Unit: bytes - Now accessible via zram<id>/mm_stat node. - -What: /sys/block/zram<id>/compr_data_size -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The compr_data_size file is read-only and specifies compressed - size of data stored in this disk. So, compression ratio can be - calculated using orig_data_size and this statistic. - Unit: bytes - Now accessible via zram<id>/mm_stat node. 
- -What: /sys/block/zram<id>/mem_used_total -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The mem_used_total file is read-only and specifies the amount - of memory, including allocator fragmentation and metadata - overhead, allocated for this disk. So, allocator space - efficiency can be calculated using compr_data_size and this - statistic. - Unit: bytes - Now accessible via zram<id>/mm_stat node. - -What: /sys/block/zram<id>/mem_used_max -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The mem_used_max file is read/write and specifies the amount - of maximum memory zram have consumed to store compressed data. - For resetting the value, you should write "0". Otherwise, - you could see -EINVAL. - Unit: bytes - Downgraded to write-only node: so it's possible to set new - value only; its current value is stored in zram<id>/mm_stat - node. - -What: /sys/block/zram<id>/mem_limit -Date: August 2015 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The mem_limit file is read/write and specifies the maximum - amount of memory ZRAM can use to store the compressed data. - The limit could be changed in run time and "0" means disable - the limit. No limit is the initial state. Unit: bytes - Downgraded to write-only node: so it's possible to set new - value only; its current value is stored in zram<id>/mm_stat - node. diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 4518d30b8c2e..451b6d882b2c 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -22,41 +22,6 @@ Description: device. The reset operation frees all the memory associated with this device. -What: /sys/block/zram<id>/num_reads -Date: August 2010 -Contact: Nitin Gupta <ngupta@vflare.org> -Description: - The num_reads file is read-only and specifies the number of - reads (failed or successful) done on this device. - -What: /sys/block/zram<id>/num_writes -Date: August 2010 -Contact: Nitin Gupta <ngupta@vflare.org> -Description: - The num_writes file is read-only and specifies the number of - writes (failed or successful) done on this device. - -What: /sys/block/zram<id>/invalid_io -Date: August 2010 -Contact: Nitin Gupta <ngupta@vflare.org> -Description: - The invalid_io file is read-only and specifies the number of - non-page-size-aligned I/O requests issued to this device. - -What: /sys/block/zram<id>/failed_reads -Date: February 2014 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The failed_reads file is read-only and specifies the number of - failed reads happened on this device. - -What: /sys/block/zram<id>/failed_writes -Date: February 2014 -Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> -Description: - The failed_writes file is read-only and specifies the number of - failed writes happened on this device. - What: /sys/block/zram<id>/max_comp_streams Date: February 2014 Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> @@ -73,74 +38,24 @@ Description: available and selected compression algorithms, change compression algorithm selection. -What: /sys/block/zram<id>/notify_free -Date: August 2010 -Contact: Nitin Gupta <ngupta@vflare.org> -Description: - The notify_free file is read-only. 
Depending on device usage - scenario it may account a) the number of pages freed because - of swap slot free notifications or b) the number of pages freed - because of REQ_DISCARD requests sent by bio. The former ones - are sent to a swap block device when a swap slot is freed, which - implies that this disk is being used as a swap disk. The latter - ones are sent by filesystem mounted with discard option, - whenever some data blocks are getting discarded. - -What: /sys/block/zram<id>/zero_pages -Date: August 2010 -Contact: Nitin Gupta <ngupta@vflare.org> -Description: - The zero_pages file is read-only and specifies number of zero - filled pages written to this disk. No memory is allocated for - such pages. - -What: /sys/block/zram<id>/orig_data_size -Date: August 2010 -Contact: Nitin Gupta <ngupta@vflare.org> -Description: - The orig_data_size file is read-only and specifies uncompressed - size of data stored in this disk. This excludes zero-filled - pages (zero_pages) since no memory is allocated for them. - Unit: bytes - -What: /sys/block/zram<id>/compr_data_size -Date: August 2010 -Contact: Nitin Gupta <ngupta@vflare.org> -Description: - The compr_data_size file is read-only and specifies compressed - size of data stored in this disk. So, compression ratio can be - calculated using orig_data_size and this statistic. - Unit: bytes - -What: /sys/block/zram<id>/mem_used_total -Date: August 2010 -Contact: Nitin Gupta <ngupta@vflare.org> -Description: - The mem_used_total file is read-only and specifies the amount - of memory, including allocator fragmentation and metadata - overhead, allocated for this disk. So, allocator space - efficiency can be calculated using compr_data_size and this - statistic. - Unit: bytes - What: /sys/block/zram<id>/mem_used_max Date: August 2014 Contact: Minchan Kim <minchan@kernel.org> Description: - The mem_used_max file is read/write and specifies the amount - of maximum memory zram have consumed to store compressed data. - For resetting the value, you should write "0". Otherwise, - you could see -EINVAL. + The mem_used_max file is write-only and is used to reset + the counter of maximum memory zram have consumed to store + compressed data. For resetting the value, you should write + "0". Otherwise, you could see -EINVAL. Unit: bytes What: /sys/block/zram<id>/mem_limit Date: August 2014 Contact: Minchan Kim <minchan@kernel.org> Description: - The mem_limit file is read/write and specifies the maximum - amount of memory ZRAM can use to store the compressed data. The - limit could be changed in run time and "0" means disable the - limit. No limit is the initial state. Unit: bytes + The mem_limit file is write-only and specifies the maximum + amount of memory ZRAM can use to store the compressed data. + The limit could be changed in run time and "0" means disable + the limit. No limit is the initial state. Unit: bytes What: /sys/block/zram<id>/compact Date: August 2015 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d05533e6120b..986e44387dad 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3694,6 +3694,14 @@ last alloc / free. For more information see Documentation/vm/slub.txt. + slub_memcg_sysfs= [MM, SLUB] + Determines whether to enable sysfs directories for + memory cgroup sub-caches. 1 to enable, 0 to disable. + The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON. 
+ Enabling this can lead to a very high number of debug + directories and files being created under + /sys/kernel/slub. + slub_max_order= [MM, SLUB] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 0535ae1f73e5..1c0c08d9206b 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -161,42 +161,14 @@ Name access description disksize RW show and set the device's disk size initstate RO shows the initialization state of the device reset WO trigger device reset -num_reads RO the number of reads -failed_reads RO the number of failed reads -num_write RO the number of writes -failed_writes RO the number of failed writes -invalid_io RO the number of non-page-size-aligned I/O requests +mem_used_max WO reset the `mem_used_max' counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can use + to store the compressed data max_comp_streams RW the number of possible concurrent compress operations comp_algorithm RW show and change the compression algorithm -notify_free RO the number of notifications to free pages (either - slot free notifications or REQ_DISCARD requests) -zero_pages RO the number of zero filled pages written to this disk -orig_data_size RO uncompressed size of data stored in this disk -compr_data_size RO compressed size of data stored in this disk -mem_used_total RO the amount of memory allocated for this disk -mem_used_max RW the maximum amount of memory zram have consumed to - store the data (to reset this counter to the actual - current value, write 1 to this attribute) -mem_limit RW the maximum amount of memory ZRAM can use to store - the compressed data -pages_compacted RO the number of pages freed during compaction - (available only via zram<id>/mm_stat node) compact WO trigger memory compaction debug_stat RO this file is used for zram debugging purposes -WARNING -======= -per-stat sysfs attributes are considered to be deprecated. -The basic strategy is: --- the existing RW nodes will be downgraded to WO nodes (in linux 4.11) --- deprecated RO sysfs nodes will eventually be removed (in linux 4.11) - -The list of deprecated attributes can be found here: -Documentation/ABI/obsolete/sysfs-block-zram - -Basically, every attribute that has its own read accessible sysfs node -(e.g. num_reads) *AND* is accessible via one of the stat files (zram<id>/stat -or zram<id>/io_stat or zram<id>/mm_stat) is considered to be deprecated. User space is advised to use the following files to read the device statistics. @@ -211,22 +183,40 @@ The stat file represents device's I/O statistics not accounted by block layer and, thus, not available in zram<id>/stat file. It consists of a single line of text and contains the following stats separated by whitespace: - failed_reads - failed_writes - invalid_io - notify_free + failed_reads the number of failed reads + failed_writes the number of failed writes + invalid_io the number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + a) the number of pages freed because of swap slot free + notifications or b) the number of pages freed because of + REQ_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. 
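
With the standalone nodes gone, the four counters above are only exposed as a single whitespace-separated line. A minimal sketch, assuming device zram0 and purely illustrative variable names, of reading them from user space:

    /* Hedged sketch: parse /sys/block/zram0/io_stat as documented above. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long failed_reads, failed_writes, invalid_io, notify_free;
            FILE *f = fopen("/sys/block/zram0/io_stat", "r");

            if (!f)
                    return 1;
            if (fscanf(f, "%llu %llu %llu %llu", &failed_reads,
                       &failed_writes, &invalid_io, &notify_free) == 4)
                    printf("failed_reads=%llu failed_writes=%llu invalid_io=%llu notify_free=%llu\n",
                           failed_reads, failed_writes, invalid_io, notify_free);
            fclose(f);
            return 0;
    }
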
File /sys/block/zram<id>/mm_stat The stat file represents device's mm statistics. It consists of a single line of text and contains the following stats separated by whitespace: - orig_data_size - compr_data_size - mem_used_total - mem_limit - mem_used_max - zero_pages - num_migrated + orig_data_size uncompressed size of data stored in this disk. + This excludes zero-filled pages (zero_pages) since no + memory is allocated for them. + Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + zero_pages the number of zero filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction 9) Deactivate: swapoff /dev/zram0 diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 8f961ef2b457..ba976805853a 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -112,8 +112,8 @@ my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)'; my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; -my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) file=([0-9]*)'; -my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; +my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -205,15 +205,15 @@ $regex_wakeup_kswapd = generate_traceevent_regex( $regex_lru_isolate = generate_traceevent_regex( "vmscan/mm_vmscan_lru_isolate", $regex_lru_isolate_default, - "isolate_mode", "order", - "nr_requested", "nr_scanned", "nr_taken", - "file"); + "isolate_mode", "classzone_idx", "order", + "nr_requested", "nr_scanned", "nr_skipped", "nr_taken", + "lru"); $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, - "nid", "zid", - "nr_scanned", "nr_reclaimed", "priority", - "flags"); + "nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback", + "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep", + "nr_unmap_fail", "priority", "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", $regex_lru_shrink_active_default, 
@@ -381,8 +381,8 @@ EVENT_PROCESS: next; } my $isolate_mode = $1; - my $nr_scanned = $4; - my $file = $6; + my $nr_scanned = $5; + my $file = $8; # To closer match vmstat scanning statistics, only count isolate_both # and isolate_inactive as scanning. isolate_active is rotation @@ -391,7 +391,7 @@ EVENT_PROCESS: # isolate_both == 3 if ($isolate_mode != 2) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; - if ($file == 1) { + if ($file =~ /_file/) { $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned; } else { $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned; @@ -406,8 +406,8 @@ EVENT_PROCESS: next; } - my $nr_reclaimed = $4; - my $flags = $6; + my $nr_reclaimed = $3; + my $flags = $12; my $file = 0; if ($flags =~ /RECLAIM_WB_FILE/) { $file = 1; diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index f2e739545e74..cd28d5ee5273 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -110,6 +110,7 @@ MADV_HUGEPAGE region. echo always >/sys/kernel/mm/transparent_hugepage/defrag echo defer >/sys/kernel/mm/transparent_hugepage/defrag +echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag echo madvise >/sys/kernel/mm/transparent_hugepage/defrag echo never >/sys/kernel/mm/transparent_hugepage/defrag @@ -120,10 +121,15 @@ that benefit heavily from THP use and are willing to delay the VM start to utilise them. "defer" means that an application will wake kswapd in the background -to reclaim pages and wake kcompact to compact memory so that THP is +to reclaim pages and wake kcompactd to compact memory so that THP is available in the near future. It's the responsibility of khugepaged to then install the THP pages later. +"defer+madvise" will enter direct reclaim and compaction like "always", but +only for regions that have used madvise(MADV_HUGEPAGE); all other regions +will wake kswapd in the background to reclaim pages and wake kcompactd to +compact memory so that THP is available in the near future. + "madvise" will enter direct reclaim like "always" but only for regions that are have used madvise(MADV_HUGEPAGE). This is the default behaviour. diff --git a/MAINTAINERS b/MAINTAINERS index ca6f5f7a4752..ad4f2ce991ca 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3926,10 +3926,13 @@ S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c DIRECT ACCESS (DAX) -M: Matthew Wilcox <willy@linux.intel.com> +M: Matthew Wilcox <mawilcox@microsoft.com> +M: Ross Zwisler <ross.zwisler@linux.intel.com> L: linux-fsdevel@vger.kernel.org S: Supported F: fs/dax.c +F: include/linux/dax.h +F: include/trace/events/fs_dax.h DIRECTORY NOTIFICATION (DNOTIFY) M: Eric Paris <eparis@parisplace.org> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index bb4610faca84..06cdaef54b2e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -684,51 +684,3 @@ int arch_remove_memory(u64 start, u64 size) } #endif #endif - -/** - * show_mem - give short summary of memory stats - * - * Shows a simple page count of reserved and used pages in the system. - * For discontig machines, it does this on a per-pgdat basis. 
- */ -void show_mem(unsigned int filter) -{ - int total_reserved = 0; - unsigned long total_present = 0; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(filter); - printk(KERN_INFO "Node memory in pages:\n"); - for_each_online_pgdat(pgdat) { - unsigned long present; - unsigned long flags; - int reserved = 0; - int nid = pgdat->node_id; - int zoneid; - - if (skip_free_areas_node(filter, nid)) - continue; - pgdat_resize_lock(pgdat, &flags); - - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - struct zone *zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; - - reserved += zone->present_pages - zone->managed_pages; - } - present = pgdat->node_present_pages; - - pgdat_resize_unlock(pgdat, &flags); - total_present += present; - total_reserved += reserved; - printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, ", - nid, present, reserved); - } - printk(KERN_INFO "%ld pages of RAM\n", total_present); - printk(KERN_INFO "%d reserved pages\n", total_reserved); - printk(KERN_INFO "Total of %ld pages in page table cache\n", - quicklist_total_size()); - printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); -} diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index 652100b64a71..8c24c5e1db66 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -1,5 +1,6 @@ generic-y += clkdev.h +generic-y += current.h generic-y += exec.h generic-y += irq_work.h generic-y += kvm_para.h diff --git a/arch/m32r/include/asm/cmpxchg.h b/arch/m32r/include/asm/cmpxchg.h index 14bf9b739dd2..23c9d0537201 100644 --- a/arch/m32r/include/asm/cmpxchg.h +++ b/arch/m32r/include/asm/cmpxchg.h @@ -64,8 +64,10 @@ __xchg(unsigned long x, volatile void *ptr, int size) return (tmp); } -#define xchg(ptr, x) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) +#define xchg(ptr, x) ({ \ + ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), \ + sizeof(*(ptr)))); \ +}) static __always_inline unsigned long __xchg_local(unsigned long x, volatile void *ptr, int size) @@ -187,9 +189,12 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) return old; } -#define cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), sizeof(*(ptr)))) +#define cmpxchg(ptr, o, n) ({ \ + ((__typeof__(*(ptr))) \ + __cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))); \ +}) #include <asm-generic/cmpxchg-local.h> diff --git a/arch/m32r/include/asm/current.h b/arch/m32r/include/asm/current.h deleted file mode 100644 index 7859d864f2c2..000000000000 --- a/arch/m32r/include/asm/current.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _ASM_M32R_CURRENT_H -#define _ASM_M32R_CURRENT_H - -#include <linux/thread_info.h> - -struct task_struct; - -static __inline__ struct task_struct *get_current(void) -{ - return current_thread_info()->task; -} - -#define current (get_current()) - -#endif /* _ASM_M32R_CURRENT_H */ diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 4e179d770d69..cc70b4116718 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += auxvec.h generic-y += barrier.h generic-y += clkdev.h +generic-y += current.h generic-y += device.h generic-y += div64.h generic-y += emergency-restart.h diff --git a/arch/parisc/include/asm/current.h b/arch/parisc/include/asm/current.h deleted file mode 100644 index 0fb9338e3bf2..000000000000 --- 
a/arch/parisc/include/asm/current.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _PARISC_CURRENT_H -#define _PARISC_CURRENT_H - -#include <linux/thread_info.h> - -struct task_struct; - -static inline struct task_struct * get_current(void) -{ - return current_thread_info()->task; -} - -#define current get_current() - -#endif /* !(_PARISC_CURRENT_H) */ diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index a055e5b6b380..66f3a6345105 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -653,55 +653,6 @@ void __init mem_init(void) unsigned long *empty_zero_page __read_mostly; EXPORT_SYMBOL(empty_zero_page); -void show_mem(unsigned int filter) -{ - int total = 0,reserved = 0; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(filter); - - for_each_online_pgdat(pgdat) { - unsigned long flags; - int zoneid; - - pgdat_resize_lock(pgdat, &flags); - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - struct zone *zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; - - total += zone->present_pages; - reserved = zone->present_pages - zone->managed_pages; - } - pgdat_resize_unlock(pgdat, &flags); - } - - printk(KERN_INFO "%d pages of RAM\n", total); - printk(KERN_INFO "%d reserved pages\n", reserved); - -#ifdef CONFIG_DISCONTIGMEM - { - struct zonelist *zl; - int i, j; - - for (i = 0; i < npmem_ranges; i++) { - zl = node_zonelist(i, 0); - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zoneref *z; - struct zone *zone; - - printk("Zone list for zone %d on node %d: ", j, i); - for_each_zone_zonelist(zone, z, zl, j) - printk("[%d/%s] ", zone_to_nid(zone), - zone->name); - printk("\n"); - } - } - } -#endif -} - /* * pagetable_init() sets up the page tables * diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 47120bf2670c..2a32483c7b6c 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -230,7 +230,9 @@ extern long long virt_phys_offset; * and needs to be executable. This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \ +#define VM_DATA_DEFAULT_FLAGS32 \ + (((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) | \ + VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \ diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 1be0499f5397..5720236d0266 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -916,7 +916,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(0); + show_mem(0, NULL); break; default: termch = cmd; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index ec1f0dedb948..59ac93714fa4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -687,7 +687,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size, NULL); + zap_page_range(vma, vmaddr, size); } up_read(&gmap->mm->mmap_sem); } diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index 51970bb6c4fe..db3e28ca3ae2 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -1,9 +1,9 @@ header-y += - generic-y += barrier.h generic-y += clkdev.h +generic-y += current.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/score/include/asm/current.h b/arch/score/include/asm/current.h deleted file mode 100644 index 16eae9cbaf1a..000000000000 --- a/arch/score/include/asm/current.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_SCORE_CURRENT_H -#define _ASM_SCORE_CURRENT_H - -#include <asm-generic/current.h> - -#endif /* _ASM_SCORE_CURRENT_H */ diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index c4e65cb3280f..6f06058c5ae7 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -82,7 +82,7 @@ static void prom_sync_me(void) "nop\n\t" : : "r" (&trapbase)); prom_printf("PROM SYNC COMMAND...\n"); - show_free_areas(0); + show_free_areas(0, NULL); if (!is_idle_task(current)) { local_irq_enable(); sys_sync(); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index eb8287155279..c6afe98de4d9 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -55,17 +55,6 @@ extern unsigned int sparc_ramdisk_size; unsigned long highstart_pfn, highend_pfn; -void show_mem(unsigned int filter) -{ - printk("Mem-info:\n"); - show_free_areas(filter); - printk("Free swap: %6ldkB\n", - get_nr_swap_pages() << (PAGE_SHIFT-10)); - printk("%ld pages of RAM\n", totalram_pages); - printk("%ld free pages\n", nr_free_pages()); -} - - unsigned long last_valid_pfn; unsigned long calc_highpages(void) diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 7cc6ee7f1a58..492a7361e58e 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -36,51 +36,6 @@ #define K(x) ((x) << (PAGE_SHIFT-10)) -/* - * The normal show_free_areas() is too verbose on Tile, with dozens - * of processors and often four NUMA zones each with high and lowmem. 
- */ -void show_mem(unsigned int filter) -{ - struct zone *zone; - - pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n", - (global_node_page_state(NR_ACTIVE_ANON) + - global_node_page_state(NR_ACTIVE_FILE)), - (global_node_page_state(NR_INACTIVE_ANON) + - global_node_page_state(NR_INACTIVE_FILE)), - global_node_page_state(NR_FILE_DIRTY), - global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_FREE_PAGES), - (global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)), - global_node_page_state(NR_FILE_MAPPED), - global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE), - global_node_page_state(NR_FILE_PAGES), - get_nr_swap_pages()); - - for_each_zone(zone) { - unsigned long flags, order, total = 0, largest_order = -1; - - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { - int nr = zone->free_area[order].nr_free; - total += nr << order; - if (nr) - largest_order = order; - } - spin_unlock_irqrestore(&zone->lock, flags); - pr_err("Node %d %7s: %lukB (largest %luKb)\n", - zone_to_nid(zone), zone->name, - K(total), largest_order ? K(1UL) << largest_order : 0); - } -} - /** * shatter_huge_page() - ensure a given address is mapped by a small page. * diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index be2bde9b07cf..f4950fbfe574 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -57,50 +57,6 @@ early_param("initrd", early_initrd); */ struct meminfo meminfo; -void show_mem(unsigned int filter) -{ - int free = 0, total = 0, reserved = 0; - int shared = 0, cached = 0, slab = 0, i; - struct meminfo *mi = &meminfo; - - printk(KERN_DEFAULT "Mem-info:\n"); - show_free_areas(filter); - - for_each_bank(i, mi) { - struct membank *bank = &mi->bank[i]; - unsigned int pfn1, pfn2; - struct page *page, *end; - - pfn1 = bank_pfn_start(bank); - pfn2 = bank_pfn_end(bank); - - page = pfn_to_page(pfn1); - end = pfn_to_page(pfn2 - 1) + 1; - - do { - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (PageSlab(page)) - slab++; - else if (!page_count(page)) - free++; - else - shared += page_count(page) - 1; - page++; - } while (page < end); - } - - printk(KERN_DEFAULT "%d pages of RAM\n", total); - printk(KERN_DEFAULT "%d free pages\n", free); - printk(KERN_DEFAULT "%d reserved pages\n", reserved); - printk(KERN_DEFAULT "%d slab pages\n", slab); - printk(KERN_DEFAULT "%d pages shared\n", shared); - printk(KERN_DEFAULT "%d pages swap cached\n", cached); -} - static void __init find_limits(unsigned long *min, unsigned long *max_low, unsigned long *max_high) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index af85b686a7b0..97346f987ef2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -679,7 +679,7 @@ static void __meminit free_pagetable(struct page *page, int order) if (PageReserved(page)) { __ClearPageReserved(page); - magic = (unsigned long)page->lru.next; + magic = (unsigned long)page->freelist; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index af59f808742f..aad4ac386f98 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -796,7 +796,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, return -EINVAL; len = 
min(vma->vm_end, end) - addr; - zap_page_range(vma, addr, len, NULL); + zap_page_range(vma, addr, len); trace_mpx_unmap_zap(addr, addr+len); vma = vma->vm_next; diff --git a/block/ioprio.c b/block/ioprio.c index 01b8116298a1..3790669232ff 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -122,14 +122,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) if (!user) break; - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (!uid_eq(task_uid(p), uid) || !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) goto free_uid; - } while_each_thread(g, p); + } free_uid: if (who) free_uid(user); @@ -222,7 +222,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) if (!user) break; - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (!uid_eq(task_uid(p), user->uid) || !task_pid_vnr(p)) continue; @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); + } if (who) free_uid(user); diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9451b762fa1c..15b263a420e8 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -657,7 +657,7 @@ free_range: page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; if (vma) zap_page_range(vma, (uintptr_t)page_addr + - proc->user_buffer_offset, PAGE_SIZE, NULL); + proc->user_buffer_offset, PAGE_SIZE); err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3cd7856156b4..c73fede582f7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -45,27 +45,6 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; -static inline void deprecated_attr_warn(const char *name) -{ - pr_warn_once("%d (%s) Attribute %s (and others) will be removed. 
%s\n", - task_pid_nr(current), - current->comm, - name, - "See zram documentation."); -} - -#define ZRAM_ATTR_RO(name) \ -static ssize_t name##_show(struct device *d, \ - struct device_attribute *attr, char *b) \ -{ \ - struct zram *zram = dev_to_zram(d); \ - \ - deprecated_attr_warn(__stringify(name)); \ - return scnprintf(b, PAGE_SIZE, "%llu\n", \ - (u64)atomic64_read(&zram->stats.name)); \ -} \ -static DEVICE_ATTR_RO(name); - static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -218,47 +197,6 @@ static ssize_t disksize_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("orig_data_size"); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_used_total"); - down_read(&zram->init_lock); - if (init_done(zram)) { - struct zram_meta *meta = zram->meta; - val = zs_get_total_pages(meta->mem_pool); - } - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - -static ssize_t mem_limit_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_limit"); - down_read(&zram->init_lock); - val = zram->limit_pages; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - static ssize_t mem_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -277,21 +215,6 @@ static ssize_t mem_limit_store(struct device *dev, return len; } -static ssize_t mem_used_max_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_used_max"); - down_read(&zram->init_lock); - if (init_done(zram)) - val = atomic_long_read(&zram->stats.max_used_pages); - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - static ssize_t mem_used_max_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -467,14 +390,6 @@ static ssize_t debug_stat_show(struct device *dev, static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -ZRAM_ATTR_RO(num_reads); -ZRAM_ATTR_RO(num_writes); -ZRAM_ATTR_RO(failed_reads); -ZRAM_ATTR_RO(failed_writes); -ZRAM_ATTR_RO(invalid_io); -ZRAM_ATTR_RO(notify_free); -ZRAM_ATTR_RO(zero_pages); -ZRAM_ATTR_RO(compr_data_size); static inline bool zram_meta_get(struct zram *zram) { @@ -1188,10 +1103,8 @@ static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR_RO(orig_data_size); -static DEVICE_ATTR_RO(mem_used_total); -static DEVICE_ATTR_RW(mem_limit); -static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_WO(mem_limit); +static DEVICE_ATTR_WO(mem_used_max); static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); @@ -1199,17 +1112,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, &dev_attr_initstate.attr, &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - 
&dev_attr_num_writes.attr, - &dev_attr_failed_reads.attr, - &dev_attr_failed_writes.attr, &dev_attr_compact.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, &dev_attr_mem_limit.attr, &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index ed758b74ddf0..18e9875f6277 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -472,18 +472,16 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return rc; } -static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, - struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, - unsigned int flags) +static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { - unsigned long pmd_addr = addr & PMD_MASK; + unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dax_dev->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; pfn_t pfn; - if (check_vma(dax_dev, vma, __func__)) + if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; dax_region = dax_dev->region; @@ -498,7 +496,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, return VM_FAULT_SIGBUS; } - pgoff = linear_page_index(vma, pmd_addr); + pgoff = linear_page_index(vmf->vma, pmd_addr); phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, @@ -508,23 +506,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, - flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, + vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int dax_dev_pmd_fault(struct vm_fault *vmf) { int rc; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct dax_dev *dax_dev = filp->private_data; dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, - current->comm, (flags & FAULT_FLAG_WRITE) - ? "write" : "read", vma->vm_start, vma->vm_end); + current->comm, (vmf->flags & FAULT_FLAG_WRITE) + ? 
"write" : "read", + vmf->vma->vm_start, vmf->vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); + rc = __dax_dev_pmd_fault(dax_dev, vmf); rcu_read_unlock(); return rc; diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index d390b9663dc3..57e6cef81ebe 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -914,7 +914,7 @@ static void ioc3_alloc_rings(struct net_device *dev) skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC); if (!skb) { - show_free_areas(0); + show_free_areas(0, NULL); continue; } diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 937c2d5d7ec3..969600779e44 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -865,8 +865,7 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer, list_for_each_entry(vma_list, &buffer->vmas, list) { struct vm_area_struct *vma = vma_list->vma; - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, - NULL); + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } mutex_unlock(&buffer->lock); } diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 701c085bb19b..71136742e606 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -317,7 +317,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { static void sysrq_handle_showmem(int key) { - show_mem(0); + show_mem(0, NULL); } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 3dd6a491cdba..397e1509fe51 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -572,7 +572,7 @@ static void fn_scroll_back(struct vc_data *vc) static void fn_show_mem(struct vc_data *vc) { - show_mem(0); + show_mem(0, NULL); } static void fn_show_state(struct vc_data *vc) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index b3c2cc79c20d..082d227fa56b 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -277,6 +277,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, case ACL_TYPE_ACCESS: if (acl) { struct iattr iattr; + struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); if (retval) @@ -287,6 +288,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, * by the mode bits. So don't * update ACL. */ + posix_acl_release(old_acl); value = NULL; size = 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e7bf01373bc4..443a6f537d56 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) -static int set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end, int prot) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - int error = vm_brk(start, end - start); + /* + * Map the last of the bss segment. + * If the header is requesting these pages to be + * executable, honour that (ppc32 needs this). + */ + int error = vm_brk_flags(start, end - start, + prot & PROT_EXEC ? 
VM_EXEC : 0); if (error) return error; } @@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; + int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * elf_bss and last_bss is the bss section. */ k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; - if (k > last_bss) + if (k > last_bss) { last_bss = k; + bss_prot = elf_prot; + } } } @@ -623,13 +632,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* * Next, align both the file and mem bss up to the page size, * since this is where elf_bss was just zeroed up to, and where - * last_bss will end after the vm_brk() below. + * last_bss will end after the vm_brk_flags() below. */ elf_bss = ELF_PAGEALIGN(elf_bss); last_bss = ELF_PAGEALIGN(last_bss); /* Finally, if there is still more bss to allocate, do it. */ if (last_bss > elf_bss) { - error = vm_brk(elf_bss, last_bss - elf_bss); + error = vm_brk_flags(elf_bss, last_bss - elf_bss, + bss_prot & PROT_EXEC ? VM_EXEC : 0); if (error) goto out; } @@ -674,6 +684,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; unsigned long elf_bss, elf_brk; + int bss_prot = 0; int retval, i; unsigned long elf_entry; unsigned long interp_load_addr = 0; @@ -882,7 +893,8 @@ static int load_elf_binary(struct linux_binprm *bprm) before this one. Map anonymous pages, if needed, and clear the area. */ retval = set_brk(elf_bss + load_bias, - elf_brk + load_bias); + elf_brk + load_bias, + bss_prot); if (retval) goto out_free_dentry; nbyte = ELF_PAGEOFFSET(elf_bss); @@ -976,8 +988,10 @@ static int load_elf_binary(struct linux_binprm *bprm) if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; - if (k > elf_brk) + if (k > elf_brk) { + bss_prot = elf_prot; elf_brk = k; + } } loc->elf_ex.e_entry += load_bias; @@ -993,7 +1007,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * mapping in the interpreter, to make sure it doesn't wind * up getting placed where the bss needs to go. 
*/ - retval = set_brk(elf_bss, elf_brk); + retval = set_brk(elf_bss, elf_brk, bss_prot); if (retval) goto out_free_dentry; if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { @@ -35,6 +35,9 @@ #include <linux/iomap.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/fs_dax.h> + /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -1253,21 +1256,21 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) -static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, - struct vm_fault *vmf, unsigned long address, - struct iomap *iomap, loff_t pos, bool write, void **entryp) +static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos, void **entryp) { - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct block_device *bdev = iomap->bdev; + struct inode *inode = mapping->host; struct blk_dax_ctl dax = { .sector = dax_iomap_sector(iomap, pos), .size = PMD_SIZE, }; long length = dax_map_atomic(bdev, &dax); - void *ret; + void *ret = NULL; if (length < 0) /* dax_map_atomic() failed */ - return VM_FAULT_FALLBACK; + goto fallback; if (length < PMD_SIZE) goto unmap_fallback; if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) @@ -1280,67 +1283,86 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, RADIX_DAX_PMD); if (IS_ERR(ret)) - return VM_FAULT_FALLBACK; + goto fallback; *entryp = ret; - return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + dax.pfn, vmf->flags & FAULT_FLAG_WRITE); unmap_fallback: dax_unmap_atomic(bdev, &dax); +fallback: + trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, + dax.pfn, ret); return VM_FAULT_FALLBACK; } -static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, - struct vm_fault *vmf, unsigned long address, - struct iomap *iomap, void **entryp) +static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, + void **entryp) { - struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pmd_addr = vmf->address & PMD_MASK; + struct inode *inode = mapping->host; struct page *zero_page; + void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; - void *ret; - zero_page = mm_get_huge_zero_page(vma->vm_mm); + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); if (unlikely(!zero_page)) - return VM_FAULT_FALLBACK; + goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, RADIX_DAX_PMD | RADIX_DAX_HZP); if (IS_ERR(ret)) - return VM_FAULT_FALLBACK; + goto fallback; *entryp = ret; - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_none(*pmd)) { + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); - return VM_FAULT_FALLBACK; + goto fallback; } - pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); - set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); + trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); 
return VM_FAULT_NOPAGE; + +fallback: + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); + return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops) +int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops) { + struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; + unsigned long pmd_addr = vmf->address & PMD_MASK; + bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; pgoff_t max_pgoff, pgoff; - struct vm_fault vmf; void *entry; loff_t pos; int error; + /* + * Check whether offset isn't beyond end of file now. Caller is + * supposed to hold locks serializing us with truncate / punch hole so + * this is a reliable test. + */ + pgoff = linear_page_index(vma, pmd_addr); + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); + /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) goto fallback; @@ -1351,16 +1373,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - /* - * Check whether offset isn't beyond end of file now. Caller is - * supposed to hold locks serializing us with truncate / punch hole so - * this is a reliable test. - */ - pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - - if (pgoff > max_pgoff) - return VM_FAULT_SIGBUS; + if (pgoff > max_pgoff) { + result = VM_FAULT_SIGBUS; + goto out; + } /* If the PMD would extend beyond the file size */ if ((pgoff | PG_PMD_COLOUR) > max_pgoff) @@ -1389,21 +1405,15 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (IS_ERR(entry)) goto finish_iomap; - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; - switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, - &iomap, pos, write, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) goto unlock_entry; - result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, - &entry); + result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1429,9 +1439,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, } fallback: if (result == VM_FAULT_FALLBACK) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } +out: + trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 87e11dfe3cde..13021a054fc0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -273,21 +273,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return result; } -static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int +ext4_dax_pmd_fault(struct vm_fault *vmf) { int result; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); 
struct super_block *sb = inode->i_sb; - bool write = flags & FAULT_FLAG_WRITE; + bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_pmd_fault(vma, addr, pmd, flags, - &ext4_iomap_ops); + result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b00d53d13d47..006068526542 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -728,8 +728,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); - smp_mb__after_atomic(); - wake_up_page(head->wb_page, PG_private); clear_bit(PG_MAPPED, &head->wb_flags); } nfsi->nrequests--; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index bed1fcb63088..dc22ba8c710f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle, int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; - int status = 0; + int status, had_lock; + struct ocfs2_lock_holder oh; - status = ocfs2_inode_lock(inode, &bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } + had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); + if (had_lock < 0) + return had_lock; status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; } @@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; - int ret; + int had_lock; + struct ocfs2_lock_holder oh; osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - if (ret != -ENOENT) - mlog_errno(ret); - return ERR_PTR(ret); - } + + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) + return ERR_PTR(had_lock); acl = ocfs2_get_acl_nolock(inode, type, di_bh); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); return acl; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 77d1632e905d..8dce4099a6ca 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) init_waitqueue_head(&res->l_event); INIT_LIST_HEAD(&res->l_blocked_list); INIT_LIST_HEAD(&res->l_mask_waiters); + INIT_LIST_HEAD(&res->l_holders); } void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, @@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) res->l_flags = 0UL; } +/* + * Keep a list of processes who have interest in a lockres. + * Note: this is now only uesed for check recursive cluster locking. 
+ */ +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + INIT_LIST_HEAD(&oh->oh_list); + oh->oh_owner_pid = get_pid(task_pid(current)); + + spin_lock(&lockres->l_lock); + list_add_tail(&oh->oh_list, &lockres->l_holders); + spin_unlock(&lockres->l_lock); +} + +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + spin_lock(&lockres->l_lock); + list_del(&oh->oh_list); + spin_unlock(&lockres->l_lock); + + put_pid(oh->oh_owner_pid); +} + +static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_lock_holder *oh; + struct pid *pid; + + /* look in the list of holders for one with the current task as owner */ + spin_lock(&lockres->l_lock); + pid = task_pid(current); + list_for_each_entry(oh, &lockres->l_holders, oh_list) { + if (oh->oh_owner_pid == pid) { + spin_unlock(&lockres->l_lock); + return 1; + } + } + spin_unlock(&lockres->l_lock); + + return 0; +} + static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { @@ -2333,8 +2378,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, goto getbh; } - if (ocfs2_mount_local(osb)) - goto local; + if ((arg_flags & OCFS2_META_LOCK_GETBH) || + ocfs2_mount_local(osb)) + goto update; if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); @@ -2363,7 +2409,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); -local: +update: /* * We only see this flag if we're being called from * ocfs2_read_locked_inode(). It means we're locking an inode @@ -2497,6 +2543,59 @@ void ocfs2_inode_unlock(struct inode *inode, ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); } +/* + * This _tracker variantes are introduced to deal with the recursive cluster + * locking issue. The idea is to keep track of a lock holder on the stack of + * the current process. If there's a lock holder on the stack, we know the + * task context is already protected by cluster locking. Currently, they're + * used in some VFS entry routines. + * + * return < 0 on error, return == 0 if there's no lock holder on the stack + * before this call, return == 1 if this call would be a recursive locking. 
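An aside on how these pieces fit together: a converted VFS entry point takes the cluster lock through the _tracker wrappers (defined just below), and the returned had_lock tells it whether a caller higher up this task's stack already holds the lock, in which case it neither locks nor unlocks here. The following is a schematic sketch with a made-up entry point name; the real conversions in this series are the ACL, setattr and permission paths.

/*
 * Schematic usage of the tracker API (hypothetical entry point).
 * had_lock == 1 means the cluster lock is already held on this stack.
 */
static int ocfs2_some_vfs_entry(struct inode *inode)
{
        struct ocfs2_lock_holder oh;
        struct buffer_head *bh = NULL;
        int had_lock, status;

        had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
        if (had_lock < 0)
                return had_lock;        /* real locking error */

        status = 0;                     /* ... do the actual work ... */

        ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
        brelse(bh);
        return status;
}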
+ */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh) +{ + int status; + int arg_flags = 0, has_locked; + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + has_locked = ocfs2_is_locked_by_me(lockres); + /* Just get buffer head if the cluster lock has been taken */ + if (has_locked) + arg_flags = OCFS2_META_LOCK_GETBH; + + if (likely(!has_locked || ret_bh)) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + } + if (!has_locked) + ocfs2_add_holder(lockres, oh); + + return has_locked; +} + +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock) +{ + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + if (!had_lock) { + ocfs2_remove_holder(lockres, oh); + ocfs2_inode_unlock(inode, ex); + } +} + int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) { struct ocfs2_lock_res *lockres; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index d293a22c32c5..a7fc18ba0dc1 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -70,6 +70,11 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +struct ocfs2_lock_holder { + struct list_head oh_list; + struct pid *oh_owner_pid; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -77,6 +82,8 @@ struct ocfs2_orphan_scan_lvb { #define OCFS2_META_LOCK_NOQUEUE (0x02) /* don't block waiting for the downconvert thread, instead return -EAGAIN */ #define OCFS2_LOCK_NONBLOCK (0x04) +/* just get back disk inode bh if we've got cluster lock. */ +#define OCFS2_META_LOCK_GETBH (0x08) /* Locking subclasses of inode cluster lock */ enum { @@ -170,4 +177,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); /* To set the locking protocol on module initialization */ void ocfs2_set_locking_protocol(void); + +/* The _tracker pair is used to avoid cluster recursive locking */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh); +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock); + #endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c4889655d32b..7b6a146327d7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) handle_t *handle = NULL; struct dquot *transfer_to[MAXQUOTAS] = { }; int qtype; + int had_lock; + struct ocfs2_lock_holder oh; trace_ocfs2_setattr(inode, dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - status = ocfs2_inode_lock(inode, &bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); + had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); + if (had_lock < 0) { + status = had_lock; goto bail_unlock_rw; + } else if (had_lock) { + /* + * As far as we know, ocfs2_setattr() could only be the first + * VFS entry point in the call chain of recursive cluster + * locking issue. 
+ * + * For instance: + * chmod_common() + * notify_change() + * ocfs2_setattr() + * posix_acl_chmod() + * ocfs2_iop_get_acl() + * + * But, we're not 100% sure if it's always true, because the + * ordering of the VFS entry points in the call chain is out + * of our control. So, we'd better dump the stack here to + * catch the other cases of recursive locking. + */ + mlog(ML_ERROR, "Another case of recursive locking:\n"); + dump_stack(); } inode_locked = 1; @@ -1260,8 +1281,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - if (status) { - ocfs2_inode_unlock(inode, 1); + if (status && inode_locked) { + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); inode_locked = 0; } bail_unlock_rw: @@ -1279,7 +1300,7 @@ bail: mlog_errno(status); } if (inode_locked) - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; @@ -1320,21 +1341,32 @@ bail: int ocfs2_permission(struct inode *inode, int mask) { - int ret; + int ret, had_lock; + struct ocfs2_lock_holder oh; if (mask & MAY_NOT_BLOCK) return -ECHILD; - ret = ocfs2_inode_lock(inode, NULL, 0); - if (ret) { - if (ret != -ENOENT) - mlog_errno(ret); + had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh); + if (had_lock < 0) { + ret = had_lock; goto out; + } else if (had_lock) { + /* See comments in ocfs2_setattr() for details. + * The call chain of this case could be: + * do_sys_open() + * may_open() + * inode_permission() + * ocfs2_permission() + * ocfs2_iop_get_acl() + */ + mlog(ML_ERROR, "Another case of recursive locking:\n"); + dump_stack(); } ret = generic_permission(inode, mask); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: return ret; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7e5958b0be6b..0c39d71c67a1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -172,6 +172,7 @@ struct ocfs2_lock_res { struct list_head l_blocked_list; struct list_head l_mask_waiters; + struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 43953e03c356..18406158e13f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -12,6 +12,7 @@ * mm/ksm.c (mm hashing). 
*/ +#include <linux/list.h> #include <linux/hashtable.h> #include <linux/sched.h> #include <linux/mm.h> @@ -26,6 +27,7 @@ #include <linux/mempolicy.h> #include <linux/ioctl.h> #include <linux/security.h> +#include <linux/hugetlb.h> static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -45,12 +47,16 @@ struct userfaultfd_ctx { wait_queue_head_t fault_wqh; /* waitqueue head for the pseudo fd to wakeup poll/read */ wait_queue_head_t fd_wqh; + /* waitqueue head for events */ + wait_queue_head_t event_wqh; /* a refile sequence protected by fault_pending_wqh lock */ struct seqcount refile_seq; /* pseudo fd refcounting */ atomic_t refcount; /* userfaultfd syscall flags */ unsigned int flags; + /* features requested from the userspace */ + unsigned int features; /* state machine */ enum userfaultfd_state state; /* released */ @@ -59,6 +65,12 @@ struct userfaultfd_ctx { struct mm_struct *mm; }; +struct userfaultfd_fork_ctx { + struct userfaultfd_ctx *orig; + struct userfaultfd_ctx *new; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; @@ -142,6 +154,8 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->event_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); @@ -169,7 +183,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg.arg.pagefault.address = address; if (flags & FAULT_FLAG_WRITE) /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE * was not set in a UFFD_EVENT_PAGEFAULT, it means it * was a read fault, otherwise if set it means it's @@ -188,6 +202,49 @@ static inline struct uffd_msg userfault_msg(unsigned long address, return msg; } +#ifdef CONFIG_HUGETLB_PAGE +/* + * Same functionality as userfaultfd_must_wait below with modifications for + * hugepmd ranges. + */ +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + pte_t *pte; + bool ret = true; + + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + pte = huge_pte_offset(mm, address); + if (!pte) + goto out; + + ret = false; + + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. 
+ */ + if (huge_pte_none(*pte)) + ret = true; + if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP)) + ret = true; +out: + return ret; +} +#else +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + return false; /* should never get here */ +} +#endif /* CONFIG_HUGETLB_PAGE */ + /* * Verify the pagetables are still not ok after having reigstered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any @@ -364,8 +421,12 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, - reason); + if (!is_vm_hugetlb_page(vmf->vma)) + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); + else + must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + vmf->flags, reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && @@ -458,6 +519,196 @@ out: return ret; } +static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + int ret = 0; + + ewq->ctx = ctx; + init_waitqueue_entry(&ewq->wq, current); + + spin_lock(&ctx->event_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->event_wqh, &ewq->wq); + for (;;) { + set_current_state(TASK_KILLABLE); + if (ewq->msg.event == 0) + break; + if (ACCESS_ONCE(ctx->released) || + fatal_signal_pending(current)) { + ret = -1; + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); + break; + } + + spin_unlock(&ctx->event_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, POLLIN); + schedule(); + + spin_lock(&ctx->event_wqh.lock); + } + __set_current_state(TASK_RUNNING); + spin_unlock(&ctx->event_wqh.lock); + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. 
+ */ + + userfaultfd_ctx_put(ctx); + return ret; +} + +static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + ewq->msg.event = 0; + wake_up_locked(&ctx->event_wqh); + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); +} + +int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) +{ + struct userfaultfd_ctx *ctx = NULL, *octx; + struct userfaultfd_fork_ctx *fctx; + + octx = vma->vm_userfaultfd_ctx.ctx; + if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + return 0; + } + + list_for_each_entry(fctx, fcs, list) + if (fctx->orig == octx) { + ctx = fctx->new; + break; + } + + if (!ctx) { + fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); + if (!fctx) + return -ENOMEM; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) { + kfree(fctx); + return -ENOMEM; + } + + atomic_set(&ctx->refcount, 1); + ctx->flags = octx->flags; + ctx->state = UFFD_STATE_RUNNING; + ctx->features = octx->features; + ctx->released = false; + ctx->mm = vma->vm_mm; + atomic_inc(&ctx->mm->mm_count); + + userfaultfd_ctx_get(octx); + fctx->orig = octx; + fctx->new = ctx; + list_add_tail(&fctx->list, fcs); + } + + vma->vm_userfaultfd_ctx.ctx = ctx; + return 0; +} + +static int dup_fctx(struct userfaultfd_fork_ctx *fctx) +{ + struct userfaultfd_ctx *ctx = fctx->orig; + struct userfaultfd_wait_queue ewq; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_FORK; + ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; + + return userfaultfd_event_wait_completion(ctx, &ewq); +} + +void dup_userfaultfd_complete(struct list_head *fcs) +{ + int ret = 0; + struct userfaultfd_fork_ctx *fctx, *n; + + list_for_each_entry_safe(fctx, n, fcs, list) { + if (!ret) + ret = dup_fctx(fctx); + list_del(&fctx->list); + kfree(fctx); + } +} + +void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + vm_ctx->ctx = ctx; + userfaultfd_ctx_get(ctx); + } +} + +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, + unsigned long from, unsigned long to, + unsigned long len) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + struct userfaultfd_wait_queue ewq; + + if (!ctx) + return; + + if (to & ~PAGE_MASK) { + userfaultfd_ctx_put(ctx); + return; + } + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMAP; + ewq.msg.arg.remap.from = from; + ewq.msg.arg.remap.to = to; + ewq.msg.arg.remap.len = len; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + +void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue ewq; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED)) + return; + + userfaultfd_ctx_get(ctx); + up_read(&mm->mmap_sem); + + *prev = NULL; /* We wait for ACK w/o the mmap semaphore */ + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_MADVDONTNEED; + ewq.msg.arg.madv_dn.start = start; + ewq.msg.arg.madv_dn.end = end; + + userfaultfd_event_wait_completion(ctx, &ewq); + + down_read(&mm->mmap_sem); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; @@ -522,25 
+773,36 @@ wakeup: } /* fault_pending_wqh.lock must be hold by the caller */ -static inline struct userfaultfd_wait_queue *find_userfault( - struct userfaultfd_ctx *ctx) +static inline struct userfaultfd_wait_queue *find_userfault_in( + wait_queue_head_t *wqh) { wait_queue_t *wq; struct userfaultfd_wait_queue *uwq; - VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_BUG_ON(!spin_is_locked(&wqh->lock)); uwq = NULL; - if (!waitqueue_active(&ctx->fault_pending_wqh)) + if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&ctx->fault_pending_wqh.task_list, - typeof(*wq), task_list); + wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; } +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->fault_pending_wqh); +} + +static inline struct userfaultfd_wait_queue *find_userfault_evt( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->event_wqh); +} + static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) { struct userfaultfd_ctx *ctx = file->private_data; @@ -572,10 +834,42 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) smp_mb(); if (waitqueue_active(&ctx->fault_pending_wqh)) ret = POLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = POLLIN; + return ret; default: - BUG(); + WARN_ON_ONCE(1); + return POLLERR; + } +} + +static const struct file_operations userfaultfd_fops; + +static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, + struct userfaultfd_ctx *new, + struct uffd_msg *msg) +{ + int fd; + struct file *file; + unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; + + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | flags); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); } + + fd_install(fd, file); + msg->arg.reserved.reserved1 = 0; + msg->arg.fork.ufd = fd; + + return 0; } static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, @@ -584,6 +878,15 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, ssize_t ret; DECLARE_WAITQUEUE(wait, current); struct userfaultfd_wait_queue *uwq; + /* + * Handling fork event requires sleeping operations, so + * we drop the event_wqh lock, then do these ops, then + * lock it back and wake up the waiter. While the lock is + * dropped the ewq may go away so we keep track of it + * carefully. 
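From userspace, the practical effect of the new event queue is that a monitor reading the descriptor now sees more than page-fault messages. Below is a hedged sketch of such a read loop; it assumes a uffd whose UFFDIO_API handshake requested the relevant UFFD_FEATURE_EVENT_* bits, and it trims all error handling.

/* Userspace sketch: draining fault and event messages from a uffd.
 * Assumes 'uffd' came from the userfaultfd() syscall and the wanted
 * UFFD_FEATURE_EVENT_* bits were negotiated via UFFDIO_API. */
#include <linux/userfaultfd.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

static void drain_uffd_messages(int uffd)
{
        struct pollfd pfd = { .fd = uffd, .events = POLLIN };
        struct uffd_msg msg;

        while (poll(&pfd, 1, -1) > 0) {
                if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
                        continue;
                switch (msg.event) {
                case UFFD_EVENT_PAGEFAULT:
                        /* resolve with UFFDIO_COPY/UFFDIO_ZEROPAGE as before */
                        printf("fault at %#llx\n",
                               (unsigned long long)msg.arg.pagefault.address);
                        break;
                case UFFD_EVENT_FORK:
                        /* fd for the child's context, installed by the kernel
                         * while this message was being read */
                        printf("child uffd %u\n", msg.arg.fork.ufd);
                        break;
                case UFFD_EVENT_REMAP:
                        printf("remap %#llx -> %#llx len %#llx\n",
                               (unsigned long long)msg.arg.remap.from,
                               (unsigned long long)msg.arg.remap.to,
                               (unsigned long long)msg.arg.remap.len);
                        break;
                case UFFD_EVENT_MADVDONTNEED:
                        printf("dontneed %#llx-%#llx\n",
                               (unsigned long long)msg.arg.madv_dn.start,
                               (unsigned long long)msg.arg.madv_dn.end);
                        break;
                }
        }
}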
+ */ + LIST_HEAD(fork_event); + struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock(&ctx->fd_wqh.lock); @@ -635,6 +938,29 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, break; } spin_unlock(&ctx->fault_pending_wqh.lock); + + spin_lock(&ctx->event_wqh.lock); + uwq = find_userfault_evt(ctx); + if (uwq) { + *msg = uwq->msg; + + if (uwq->msg.event == UFFD_EVENT_FORK) { + fork_nctx = (struct userfaultfd_ctx *) + (unsigned long) + uwq->msg.arg.reserved.reserved1; + list_move(&uwq->wq.task_list, &fork_event); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + + userfaultfd_event_complete(ctx, uwq); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->event_wqh.lock); + if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -651,6 +977,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->fd_wqh.lock); + if (!ret && msg->event == UFFD_EVENT_FORK) { + ret = resolve_userfault_fork(ctx, fork_nctx, msg); + + if (!ret) { + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.task_list); + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + userfaultfd_event_complete(ctx, uwq); + } + spin_unlock(&ctx->event_wqh.lock); + } + } + return ret; } @@ -753,6 +1096,12 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } +static inline bool vma_can_userfault(struct vm_area_struct *vma) +{ + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); +} + static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -763,6 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; + bool non_anon_pages; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -814,13 +1164,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out_unlock; /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + + /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). */ found = false; + non_anon_pages = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -829,8 +1187,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (cur->vm_ops) + if (!vma_can_userfault(cur)) goto out_unlock; + /* + * If this vma contains ending address, and huge pages + * check alignment. 
+ */ + if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && + end > cur->vm_start) { + unsigned long vma_hpagesize = vma_kernel_pagesize(cur); + + ret = -EINVAL; + + if (end & (vma_hpagesize - 1)) + goto out_unlock; + } /* * Check that this vma isn't already owned by a @@ -843,6 +1214,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, cur->vm_userfaultfd_ctx.ctx != ctx) goto out_unlock; + /* + * Note vmas containing huge pages + */ + if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) + non_anon_pages = true; + found = true; } BUG_ON(!found); @@ -854,7 +1231,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(vma->vm_ops); + BUG_ON(!vma_can_userfault(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); @@ -912,7 +1289,8 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(UFFD_API_RANGE_IOCTLS, + if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : + UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; } @@ -959,11 +1337,18 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out_unlock; /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + + /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). */ found = false; ret = -EINVAL; @@ -980,7 +1365,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (cur->vm_ops) + if (!vma_can_userfault(cur)) goto out_unlock; found = true; @@ -994,7 +1379,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(vma->vm_ops); + BUG_ON(!vma_can_userfault(vma)); /* * Nothing to do: this vma is already registered into this @@ -1007,6 +1392,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end); + if (userfaultfd_missing(vma)) { + /* + * Wake any concurrent pending userfault while + * we unregister, so they will not hang + * permanently and it avoids userland to call + * UFFDIO_WAKE explicitly. 
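For completeness, here is what registration looks like from userspace once hugetlbfs (and shmem) VMAs are accepted: the range must be huge-page aligned at both ends, and the uffdio_register.ioctls answer tells the monitor it only gets the reduced "basic" ioctl set for such memory. A hedged sketch; the 2MB page size and the mmap flags are assumptions for illustration only.

/* Userspace sketch: registering a hugetlb-backed region for
 * missing-page tracking. Both start and length must be multiples of
 * the huge page size, mirroring the alignment checks added above. */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdio.h>

#define HPAGE_SIZE      (2UL * 1024 * 1024)     /* assumed 2M huge pages */

static int register_hugetlb_range(int uffd)
{
        size_t len = 4 * HPAGE_SIZE;
        struct uffdio_register reg;
        void *area;

        area = mmap(NULL, len, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (area == MAP_FAILED)
                return -1;

        reg.range.start = (unsigned long)area;  /* huge-page aligned */
        reg.range.len = len;
        reg.mode = UFFDIO_REGISTER_MODE_MISSING;
        if (ioctl(uffd, UFFDIO_REGISTER, &reg))
                return -1;

        /* non-anonymous memory reports only the "basic" range ioctls */
        printf("range ioctls: %#llx\n", (unsigned long long)reg.ioctls);
        return 0;
}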
+ */ + struct userfaultfd_wake_range range; + range.start = start; + range.len = vma_end - start; + wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); + } + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, @@ -1178,6 +1576,14 @@ out: return ret; } +static inline unsigned int uffd_ctx_features(__u64 user_features) +{ + /* + * For the current set of features the bits just coincide + */ + return (unsigned int)user_features; +} + /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API @@ -1189,6 +1595,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; int ret; + __u64 features; ret = -EINVAL; if (ctx->state != UFFD_STATE_WAIT_API) @@ -1196,19 +1603,23 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - if (uffdio_api.api != UFFD_API || uffdio_api.features) { + features = uffdio_api.features; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) { memset(&uffdio_api, 0, sizeof(uffdio_api)); if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ret = -EINVAL; goto out; } + /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ctx->state = UFFD_STATE_RUNNING; + /* only enable the requested features for this uffd context */ + ctx->features = uffd_ctx_features(features); ret = 0; out: return ret; @@ -1295,6 +1706,7 @@ static void init_once_userfaultfd_ctx(void *mem) init_waitqueue_head(&ctx->fault_pending_wqh); init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->event_wqh); init_waitqueue_head(&ctx->fd_wqh); seqcount_init(&ctx->refile_seq); } @@ -1335,6 +1747,7 @@ static struct file *userfaultfd_file_create(int flags) atomic_set(&ctx->refcount, 1); ctx->flags = flags; + ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; ctx->mm = current->mm; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 086440e79b86..022014016d80 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1437,12 +1437,9 @@ xfs_filemap_fault( */ STATIC int xfs_filemap_pmd_fault( - struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags) + struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret; @@ -1451,16 +1448,16 @@ xfs_filemap_pmd_fault( trace_xfs_filemap_pmd_fault(ip); - if (flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); + ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (flags & FAULT_FLAG_WRITE) + if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 2983e52efd07..1e77ff5818f1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -71,15 +71,13 @@ static inline unsigned int dax_radix_order(void *entry) return 
PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops); +int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { return 0; } -static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags, +static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 97e478d6b690..f0029e786205 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -33,6 +33,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 48c76d612d40..503099d8aada 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -65,7 +65,8 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, - unsigned long *, unsigned long *, long, unsigned int); + unsigned long *, unsigned long *, long, unsigned int, + int *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *); void __unmap_hugepage_range_final(struct mmu_gather *tlb, @@ -81,6 +82,11 @@ void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep); int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); @@ -131,7 +137,7 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } -#define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; }) +#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; }) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) static inline void hugetlb_report_meminfo(struct seq_file *m) @@ -149,6 +155,8 @@ static inline void hugetlb_show_meminfo(void) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) +#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ + src_addr, pagep) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 static inline int dequeue_hwpoisoned_huge_page(struct page *page) { diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5b759c9acf97..38bcf00cbed3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -203,6 +203,7 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long 
*out_end_pfn, int *out_nid); +unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 254698856b8f..5af377303880 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -253,6 +253,7 @@ struct mem_cgroup { /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; + struct list_head kmem_caches; #endif int last_scanned_node; @@ -829,6 +830,7 @@ void memcg_kmem_uncharge(struct page *page, int order); #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) extern struct static_key_false memcg_kmem_enabled_key; +extern struct workqueue_struct *memcg_kmem_cache_wq; extern int memcg_nr_cache_ids; void memcg_get_cache_ids(void); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6ff66d6fe8e2..574bc157a27c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,6 +285,17 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_TRACE \ + { FAULT_FLAG_WRITE, "WRITE" }, \ + { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ + { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ + { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ + { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ + { FAULT_FLAG_TRIED, "TRIED" }, \ + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" } + /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask @@ -340,8 +351,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); - int (*pmd_fault)(struct vm_area_struct *, unsigned long address, - pmd_t *, unsigned int flags); + int (*pmd_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); @@ -1111,6 +1121,20 @@ static inline void clear_page_pfmemalloc(struct page *page) VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ VM_FAULT_FALLBACK) +#define VM_FAULT_RESULT_TRACE \ + { VM_FAULT_OOM, "OOM" }, \ + { VM_FAULT_SIGBUS, "SIGBUS" }, \ + { VM_FAULT_MAJOR, "MAJOR" }, \ + { VM_FAULT_WRITE, "WRITE" }, \ + { VM_FAULT_HWPOISON, "HWPOISON" }, \ + { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ + { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ + { VM_FAULT_NOPAGE, "NOPAGE" }, \ + { VM_FAULT_LOCKED, "LOCKED" }, \ + { VM_FAULT_RETRY, "RETRY" }, \ + { VM_FAULT_FALLBACK, "FALLBACK" }, \ + { VM_FAULT_DONE_COW, "DONE_COW" } + /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) @@ -1128,8 +1152,7 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags); -extern bool skip_free_areas_node(unsigned int flags, int nid); +extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); int shmem_zero_setup(struct vm_area_struct *); #ifdef CONFIG_SHMEM @@ -1152,8 +1175,6 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to 
unmap */ - bool ignore_dirty; /* Ignore dirty pages */ - bool check_swap_entries; /* Check also swap entries */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, @@ -1164,7 +1185,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *); + unsigned long size); void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); @@ -1359,6 +1380,16 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) return !vma->vm_ops; } +#ifdef CONFIG_SHMEM +/* + * The vma_is_shmem is not inline because it is used only by slow + * paths in userfault. + */ +bool vma_is_shmem(struct vm_area_struct *vma); +#else +static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +#endif + static inline int stack_guard_page_start(struct vm_area_struct *vma, unsigned long addr) { @@ -1900,7 +1931,7 @@ extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags); +extern void show_mem(unsigned int flags, nodemask_t *nodemask); extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); @@ -1908,8 +1939,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid); extern unsigned long arch_reserved_kernel_pages(void); #endif -extern __printf(2, 3) -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...); +extern __printf(3, 4) +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); @@ -2049,6 +2080,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} /* These take the mm semaphore themselves */ extern int __must_check vm_brk(unsigned long, unsigned long); +extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, @@ -2400,6 +2432,10 @@ extern void clear_huge_page(struct page *page, extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, unsigned int pages_per_huge_page); +extern long copy_huge_page_from_user(struct page *dst_page, + const void __user *usr_src, + unsigned int pages_per_huge_page, + bool allow_pagefault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f4aac87adcc3..82fc632fd11d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -779,7 +779,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru); +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #ifdef CONFIG_HAVE_MEMORY_PRESENT void memory_present(int nid, unsigned long start, unsigned long end); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 324c8dbad1e1..84943e8057ef 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -266,7 +266,6 @@ static inline 
struct page *find_get_page_flags(struct address_space *mapping, /** * find_lock_page - locate, pin and lock a pagecache page - * pagecache_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * @@ -482,19 +481,11 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, } /* - * This is exported only for wait_on_page_locked/wait_on_page_writeback, - * and for filesystems which need to wait on PG_private. + * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc., + * and should not be used directly. */ extern void wait_on_page_bit(struct page *page, int bit_nr); extern int wait_on_page_bit_killable(struct page *page, int bit_nr); -extern void wake_up_page_bit(struct page *page, int bit_nr); - -static inline void wake_up_page(struct page *page, int bit) -{ - if (!PageWaiters(page)) - return; - wake_up_page_bit(page, bit); -} /* * Wait for a page to be unlocked. diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a3d90b9da18d..033fc7bbcefa 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -15,6 +15,12 @@ #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) #define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4)) +#define PFN_FLAGS_TRACE \ + { PFN_SG_CHAIN, "SG_CHAIN" }, \ + { PFN_SG_LAST, "SG_LAST" }, \ + { PFN_DEV, "DEV" }, \ + { PFN_MAP, "MAP" } + static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags) { pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff078e7043b6..fdaac9d4d46d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -124,4 +124,15 @@ static inline bool shmem_huge_enabled(struct vm_area_struct *vma) } #endif +#ifdef CONFIG_SHMEM +extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep); +#else +#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ + src_addr, pagep) ({ BUG(); 0; }) +#endif + #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 4c5363566815..3c37a8c51921 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -545,22 +545,49 @@ struct memcg_cache_array { * array to be accessed without taking any locks, on relocation we free the old * version only after a grace period. * - * Child caches will hold extra metadata needed for its operation. Fields are: + * Root and child caches hold different metadata. * - * @memcg: pointer to the memcg this cache belongs to - * @root_cache: pointer to the global, root cache, this cache was derived from + * @root_cache: Common to root and child caches. NULL for root, pointer to + * the root cache for children. * - * Both root and child caches of the same kind are linked into a list chained - * through @list. + * The following fields are specific to root caches. + * + * @memcg_caches: kmemcg ID indexed table of child caches. This table is + * used to index child cachces during allocation and cleared + * early during shutdown. + * + * @root_caches_node: List node for slab_root_caches list. + * + * @children: List of all child caches. While the child caches are also + * reachable through @memcg_caches, a child cache remains on + * this list until it is actually destroyed. + * + * The following fields are specific to child caches. + * + * @memcg: Pointer to the memcg this cache belongs to. 
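A small inference from the reorganized comment (continued just below): with the is_root_cache flag gone and @root_cache documented as NULL for roots and set for children, telling the two flavours apart reduces to a pointer test. A hedged sketch; the helper name is mine, not necessarily what mm/slab.h uses.

/* Sketch inferred from the documented layout above: a cache is a root
 * cache iff it carries no pointer back to a root. */
static inline bool memcg_params_is_root(const struct memcg_cache_params *p)
{
        return !p->root_cache;
}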
+ * + * @children_node: List node for @root_cache->children list. + * + * @kmem_caches_node: List node for @memcg->kmem_caches list. */ struct memcg_cache_params { - bool is_root_cache; - struct list_head list; + struct kmem_cache *root_cache; union { - struct memcg_cache_array __rcu *memcg_caches; + struct { + struct memcg_cache_array __rcu *memcg_caches; + struct list_head __root_caches_node; + struct list_head children; + }; struct { struct mem_cgroup *memcg; - struct kmem_cache *root_cache; + struct list_head children_node; + struct list_head kmem_caches_node; + + void (*deact_fn)(struct kmem_cache *); + union { + struct rcu_head deact_rcu_head; + struct work_struct deact_work; + }; }; }; }; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 75f56c2ef2d4..07ef550c6627 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -113,9 +113,9 @@ struct kmem_cache { #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS -void sysfs_slab_remove(struct kmem_cache *); +void sysfs_slab_release(struct kmem_cache *); #else -static inline void sysfs_slab_remove(struct kmem_cache *s) +static inline void sysfs_slab_release(struct kmem_cache *s) { } #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 7f47b7098b1b..45e91dd6716d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -27,6 +27,7 @@ struct bio; #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) +#define SWAP_BATCH 64 static inline int current_is_kswapd(void) { @@ -176,6 +177,12 @@ enum { * protected by swap_info_struct.lock. */ struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields + * and swap_info_struct->swap_map + * elements correspond to the swap + * cluster + */ unsigned int data:24; unsigned int flags:8; }; @@ -337,8 +344,13 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); /* linux/mm/swap_state.c */ -extern struct address_space swapper_spaces[]; -#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) +/* One swap address space for each 64M swap space */ +#define SWAP_ADDRESS_SPACE_SHIFT 14 +#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) +extern struct address_space *swapper_spaces[]; +#define swap_address_space(entry) \ + (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ + >> SWAP_ADDRESS_SPACE_SHIFT]) extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, struct list_head *list); @@ -360,6 +372,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; +extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. 
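To make the new swap cache sharding concrete: a SWAP_ADDRESS_SPACE_SHIFT of 14 means each address_space covers 16384 swap slots, i.e. 64MB with 4KB pages, and swap_address_space() selects the space from the high bits of the swap offset. A throwaway userspace illustration of just that arithmetic:

/* Worked example of the swap cache sharding arithmetic. */
#include <stdio.h>

int main(void)
{
        unsigned long shift = 14;       /* SWAP_ADDRESS_SPACE_SHIFT */
        unsigned long offset = 40000;   /* example swap offset */

        printf("slots per address_space: %lu\n", 1UL << shift); /* 16384 */
        printf("offset %lu lives in space index %lu\n",
               offset, offset >> shift);                        /* 2 */
        return 0;
}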
*/ static inline bool vm_swap_full(void) @@ -375,23 +388,31 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); +extern int get_swap_pages(int n, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); extern void swapcache_free(swp_entry_t); +extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; +extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); +extern void exit_swap_address_space(unsigned int type); + +extern int get_swap_slots(int n, swp_entry_t *slots); +extern void swapcache_free_batch(swp_entry_t *entries, int n); #else /* CONFIG_SWAP */ @@ -479,6 +500,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int __swp_swapcount(swp_entry_t entry) +{ + return 0; +} + static inline int swp_swapcount(swp_entry_t entry) { return 0; diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h new file mode 100644 index 000000000000..6ef92d17633d --- /dev/null +++ b/include/linux/swap_slots.h @@ -0,0 +1,30 @@ +#ifndef _LINUX_SWAP_SLOTS_H +#define _LINUX_SWAP_SLOTS_H + +#include <linux/swap.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> + +#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH +#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE) +#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE) + +struct swap_slots_cache { + bool lock_initialized; + struct mutex alloc_lock; /* protects slots, nr, cur */ + swp_entry_t *slots; + int nr; + int cur; + spinlock_t free_lock; /* protects slots_ret, n_ret */ + swp_entry_t *slots_ret; + int n_ret; +}; + +void disable_swap_slots_cache_lock(void); +void reenable_swap_slots_cache_unlock(void); +int enable_swap_slots_cache(void); +int free_swap_slot(swp_entry_t entry); + +extern bool swap_slot_cache_enabled; + +#endif /* _LINUX_SWAP_SLOTS_H */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0f165507495c..0af63c4381b9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -23,6 +23,10 @@ const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 +const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array); + const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 11b92b047a1e..f431861f22f1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ 
-52,6 +52,20 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); } +extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); +extern void dup_userfaultfd_complete(struct list_head *); + +extern void mremap_userfaultfd_prep(struct vm_area_struct *, + struct vm_userfaultfd_ctx *); +extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, + unsigned long from, unsigned long to, + unsigned long len); + +extern void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -76,6 +90,34 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return false; } +static inline int dup_userfaultfd(struct vm_area_struct *vma, + struct list_head *l) +{ + return 0; +} + +static inline void dup_userfaultfd_complete(struct list_head *l) +{ +} + +static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *ctx) +{ +} + +static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, + unsigned long from, + unsigned long to, + unsigned long len) +{ +} + +static inline void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 4d6ec58a8d45..6aa1b6cb5828 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -56,6 +56,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, KCOMPACTD_WAKE, + KCOMPACTD_MIGRATE_SCANNED, KCOMPACTD_FREE_SCANNED, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index cbdb90b6b308..0a18ab6483ff 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -9,62 +9,6 @@ #include <linux/tracepoint.h> #include <trace/events/mmflags.h> -#define COMPACTION_STATUS \ - EM( COMPACT_SKIPPED, "skipped") \ - EM( COMPACT_DEFERRED, "deferred") \ - EM( COMPACT_CONTINUE, "continue") \ - EM( COMPACT_SUCCESS, "success") \ - EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ - EM( COMPACT_COMPLETE, "complete") \ - EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ - EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ - EMe(COMPACT_CONTENDED, "contended") - -#ifdef CONFIG_ZONE_DMA -#define IFDEF_ZONE_DMA(X) X -#else -#define IFDEF_ZONE_DMA(X) -#endif - -#ifdef CONFIG_ZONE_DMA32 -#define IFDEF_ZONE_DMA32(X) X -#else -#define IFDEF_ZONE_DMA32(X) -#endif - -#ifdef CONFIG_HIGHMEM -#define IFDEF_ZONE_HIGHMEM(X) X -#else -#define IFDEF_ZONE_HIGHMEM(X) -#endif - -#define ZONE_TYPE \ - IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ - IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ - EM (ZONE_NORMAL, "Normal") \ - IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ - EMe(ZONE_MOVABLE,"Movable") - -/* - * First define the enums in the above macros to be exported to userspace - * via TRACE_DEFINE_ENUM(). - */ -#undef EM -#undef EMe -#define EM(a, b) TRACE_DEFINE_ENUM(a); -#define EMe(a, b) TRACE_DEFINE_ENUM(a); - -COMPACTION_STATUS -ZONE_TYPE - -/* - * Now redefine the EM() and EMe() macros to map the enums to the strings - * that will be printed in the output. 
- */ -#undef EM -#undef EMe -#define EM(a, b) {a, b}, -#define EMe(a, b) {a, b} DECLARE_EVENT_CLASS(mm_compaction_isolate_template, @@ -187,6 +131,7 @@ TRACE_EVENT(mm_compaction_begin, __entry->sync ? "sync" : "async") ); +#ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_end, TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, unsigned long free_pfn, unsigned long zone_end, bool sync, @@ -220,6 +165,7 @@ TRACE_EVENT(mm_compaction_end, __entry->sync ? "sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); +#endif TRACE_EVENT(mm_compaction_try_to_compact_pages, @@ -248,6 +194,7 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, __entry->prio) ); +#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, @@ -295,7 +242,6 @@ DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_ARGS(zone, order, ret) ); -#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_PROTO(struct zone *zone, int order), diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h new file mode 100644 index 000000000000..c566ddc87f73 --- /dev/null +++ b/include/trace/events/fs_dax.h @@ -0,0 +1,156 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs_dax + +#if !defined(_TRACE_FS_DAX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_DAX_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(dax_pmd_fault_class, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + pgoff_t max_pgoff, int result), + TP_ARGS(inode, vmf, max_pgoff, result), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(pgoff_t, pgoff) + __field(pgoff_t, max_pgoff) + __field(dev_t, dev) + __field(unsigned int, flags) + __field(int, result) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_start = vmf->vma->vm_start; + __entry->vm_end = vmf->vma->vm_end; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; + __entry->max_pgoff = max_pgoff; + __entry->result = result; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx vm_start " + "%#lx vm_end %#lx pgoff %#lx max_pgoff %#lx %s", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? 
"shared" : "private", + __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), + __entry->address, + __entry->vm_start, + __entry->vm_end, + __entry->pgoff, + __entry->max_pgoff, + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE) + ) +) + +#define DEFINE_PMD_FAULT_EVENT(name) \ +DEFINE_EVENT(dax_pmd_fault_class, name, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, \ + pgoff_t max_pgoff, int result), \ + TP_ARGS(inode, vmf, max_pgoff, result)) + +DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); +DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); + +DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + struct page *zero_page, + void *radix_entry), + TP_ARGS(inode, vmf, zero_page, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(struct page *, zero_page) + __field(void *, radix_entry) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->zero_page = zero_page; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s address %#lx zero_page %p " + "radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->address, + __entry->zero_page, + (unsigned long)__entry->radix_entry + ) +) + +#define DEFINE_PMD_LOAD_HOLE_EVENT(name) \ +DEFINE_EVENT(dax_pmd_load_hole_class, name, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, \ + struct page *zero_page, void *radix_entry), \ + TP_ARGS(inode, vmf, zero_page, radix_entry)) + +DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole); +DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback); + +DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + long length, pfn_t pfn, void *radix_entry), + TP_ARGS(inode, vmf, length, pfn, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(long, length) + __field(u64, pfn_val) + __field(void *, radix_entry) + __field(dev_t, dev) + __field(int, write) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->write = vmf->flags & FAULT_FLAG_WRITE; + __entry->length = length; + __entry->pfn_val = pfn.val; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx length %#lx " + "pfn %#llx %s radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->write ? 
"write" : "read", + __entry->address, + __entry->length, + __entry->pfn_val & ~PFN_FLAGS_MASK, + __print_flags_u64(__entry->pfn_val & PFN_FLAGS_MASK, "|", + PFN_FLAGS_TRACE), + (unsigned long)__entry->radix_entry + ) +) + +#define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \ +DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, \ + long length, pfn_t pfn, void *radix_entry), \ + TP_ARGS(inode, vmf, length, pfn, radix_entry)) + +DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); +DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); + +#endif /* _TRACE_FS_DAX_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 15bf875d0e4a..304ff94363b2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -1,3 +1,6 @@ +#include <linux/node.h> +#include <linux/mmzone.h> +#include <linux/compaction.h> /* * The order of these masks is important. Matching masks will be seen * first and the left over flags will end up showing by themselves. @@ -171,3 +174,98 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ (flags) ? __print_flags(flags, "|", \ __def_vmaflag_names \ ) : "none" + +#ifdef CONFIG_COMPACTION +#define COMPACTION_STATUS \ + EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_DEFERRED, "deferred") \ + EM( COMPACT_CONTINUE, "continue") \ + EM( COMPACT_SUCCESS, "success") \ + EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ + EM( COMPACT_COMPLETE, "complete") \ + EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ + EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ + EMe(COMPACT_CONTENDED, "contended") + +/* High-level compaction status feedback */ +#define COMPACTION_FAILED 1 +#define COMPACTION_WITHDRAWN 2 +#define COMPACTION_PROGRESS 3 + +#define compact_result_to_feedback(result) \ +({ \ + enum compact_result __result = result; \ + (compaction_failed(__result)) ? COMPACTION_FAILED : \ + (compaction_withdrawn(__result)) ? COMPACTION_WITHDRAWN : COMPACTION_PROGRESS; \ +}) + +#define COMPACTION_FEEDBACK \ + EM(COMPACTION_FAILED, "failed") \ + EM(COMPACTION_WITHDRAWN, "withdrawn") \ + EMe(COMPACTION_PROGRESS, "progress") + +#define COMPACTION_PRIORITY \ + EM(COMPACT_PRIO_SYNC_FULL, "COMPACT_PRIO_SYNC_FULL") \ + EM(COMPACT_PRIO_SYNC_LIGHT, "COMPACT_PRIO_SYNC_LIGHT") \ + EMe(COMPACT_PRIO_ASYNC, "COMPACT_PRIO_ASYNC") +#else +#define COMPACTION_STATUS +#define COMPACTION_PRIORITY +#define COMPACTION_FEEDBACK +#endif + +#ifdef CONFIG_ZONE_DMA +#define IFDEF_ZONE_DMA(X) X +#else +#define IFDEF_ZONE_DMA(X) +#endif + +#ifdef CONFIG_ZONE_DMA32 +#define IFDEF_ZONE_DMA32(X) X +#else +#define IFDEF_ZONE_DMA32(X) +#endif + +#ifdef CONFIG_HIGHMEM +#define IFDEF_ZONE_HIGHMEM(X) X +#else +#define IFDEF_ZONE_HIGHMEM(X) +#endif + +#define ZONE_TYPE \ + IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ + IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ + EM (ZONE_NORMAL, "Normal") \ + IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ + EMe(ZONE_MOVABLE,"Movable") + +#define LRU_NAMES \ + EM (LRU_INACTIVE_ANON, "inactive_anon") \ + EM (LRU_ACTIVE_ANON, "active_anon") \ + EM (LRU_INACTIVE_FILE, "inactive_file") \ + EM (LRU_ACTIVE_FILE, "active_file") \ + EMe(LRU_UNEVICTABLE, "unevictable") + +/* + * First define the enums in the above macros to be exported to userspace + * via TRACE_DEFINE_ENUM(). 
+ */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +COMPACTION_STATUS +COMPACTION_PRIORITY +COMPACTION_FEEDBACK +ZONE_TYPE +LRU_NAMES + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. + */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 1e974983757e..38baeb27221a 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -4,6 +4,7 @@ #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_OOM_H #include <linux/tracepoint.h> +#include <trace/events/mmflags.h> TRACE_EVENT(oom_score_adj_update, @@ -27,6 +28,86 @@ TRACE_EVENT(oom_score_adj_update, __entry->pid, __entry->comm, __entry->oom_score_adj) ); +TRACE_EVENT(reclaim_retry_zone, + + TP_PROTO(struct zoneref *zoneref, + int order, + unsigned long reclaimable, + unsigned long available, + unsigned long min_wmark, + int no_progress_loops, + bool wmark_check), + + TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check), + + TP_STRUCT__entry( + __field( int, node) + __field( int, zone_idx) + __field( int, order) + __field( unsigned long, reclaimable) + __field( unsigned long, available) + __field( unsigned long, min_wmark) + __field( int, no_progress_loops) + __field( bool, wmark_check) + ), + + TP_fast_assign( + __entry->node = zone_to_nid(zoneref->zone); + __entry->zone_idx = zoneref->zone_idx; + __entry->order = order; + __entry->reclaimable = reclaimable; + __entry->available = available; + __entry->min_wmark = min_wmark; + __entry->no_progress_loops = no_progress_loops; + __entry->wmark_check = wmark_check; + ), + + TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d", + __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE), + __entry->order, + __entry->reclaimable, __entry->available, __entry->min_wmark, + __entry->no_progress_loops, + __entry->wmark_check) +); + +#ifdef CONFIG_COMPACTION +TRACE_EVENT(compact_retry, + + TP_PROTO(int order, + enum compact_priority priority, + enum compact_result result, + int retries, + int max_retries, + bool ret), + + TP_ARGS(order, priority, result, retries, max_retries, ret), + + TP_STRUCT__entry( + __field( int, order) + __field( int, priority) + __field( int, result) + __field( int, retries) + __field( int, max_retries) + __field( bool, ret) + ), + + TP_fast_assign( + __entry->order = order; + __entry->priority = priority; + __entry->result = compact_result_to_feedback(result); + __entry->retries = retries; + __entry->max_retries = max_retries; + __entry->ret = ret; + ), + + TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d", + __entry->order, + __print_symbolic(__entry->priority, COMPACTION_PRIORITY), + __print_symbolic(__entry->result, COMPACTION_FEEDBACK), + __entry->retries, __entry->max_retries, + __entry->ret) +); +#endif /* CONFIG_COMPACTION */ #endif /* This part must be outside protection */ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c88fd0934e7e..27e8a5c77579 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -15,6 +15,7 @@ #define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u /* Unused, all reclaim async */ #define RECLAIM_WB_ASYNC 0x0008u +#define RECLAIM_WB_LRU 
(RECLAIM_WB_ANON|RECLAIM_WB_FILE) #define show_reclaim_flags(flags) \ (flags) ? __print_flags(flags, "|", \ @@ -269,26 +270,27 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->retval) ); -DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, - +TRACE_EVENT(mm_vmscan_lru_isolate, TP_PROTO(int classzone_idx, int order, unsigned long nr_requested, unsigned long nr_scanned, + unsigned long nr_skipped, unsigned long nr_taken, isolate_mode_t isolate_mode, - int file), + int lru), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), + TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), TP_STRUCT__entry( __field(int, classzone_idx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) + __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(isolate_mode_t, isolate_mode) - __field(int, file) + __field(int, lru) ), TP_fast_assign( @@ -296,47 +298,21 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; + __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->isolate_mode = isolate_mode; - __entry->file = file; + __entry->lru = lru; ), - TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", + TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->isolate_mode, __entry->classzone_idx, __entry->order, __entry->nr_requested, __entry->nr_scanned, + __entry->nr_skipped, __entry->nr_taken, - __entry->file) -); - -DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, - - TP_PROTO(int classzone_idx, - int order, - unsigned long nr_requested, - unsigned long nr_scanned, - unsigned long nr_taken, - isolate_mode_t isolate_mode, - int file), - - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) - -); - -DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, - - TP_PROTO(int classzone_idx, - int order, - unsigned long nr_requested, - unsigned long nr_scanned, - unsigned long nr_taken, - isolate_mode_t isolate_mode, - int file), - - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) - + __print_symbolic(__entry->lru, LRU_NAMES)) ); TRACE_EVENT(mm_vmscan_writepage, @@ -365,14 +341,27 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, + unsigned long nr_dirty, unsigned long nr_writeback, + unsigned long nr_congested, unsigned long nr_immediate, + unsigned long nr_activate, unsigned long nr_ref_keep, + unsigned long nr_unmap_fail, int priority, int file), - TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file), + TP_ARGS(nid, nr_scanned, nr_reclaimed, nr_dirty, nr_writeback, + nr_congested, nr_immediate, nr_activate, nr_ref_keep, + nr_unmap_fail, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_congested) + __field(unsigned long, nr_immediate) + __field(unsigned long, nr_activate) + __field(unsigned long, nr_ref_keep) + __field(unsigned long, nr_unmap_fail) __field(int, priority) __field(int, reclaim_flags) ), @@ -381,17 +370,102 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nid = nid; __entry->nr_scanned 
= nr_scanned; __entry->nr_reclaimed = nr_reclaimed; + __entry->nr_dirty = nr_dirty; + __entry->nr_writeback = nr_writeback; + __entry->nr_congested = nr_congested; + __entry->nr_immediate = nr_immediate; + __entry->nr_activate = nr_activate; + __entry->nr_ref_keep = nr_ref_keep; + __entry->nr_unmap_fail = nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), - TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, + __entry->nr_dirty, __entry->nr_writeback, + __entry->nr_congested, __entry->nr_immediate, + __entry->nr_activate, __entry->nr_ref_keep, + __entry->nr_unmap_fail, __entry->priority, + show_reclaim_flags(__entry->reclaim_flags)) +); + +TRACE_EVENT(mm_vmscan_lru_shrink_active, + + TP_PROTO(int nid, unsigned long nr_taken, + unsigned long nr_active, unsigned long nr_deactivated, + unsigned long nr_referenced, int priority, int file), + + TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_taken) + __field(unsigned long, nr_active) + __field(unsigned long, nr_deactivated) + __field(unsigned long, nr_referenced) + __field(int, priority) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_taken = nr_taken; + __entry->nr_active = nr_active; + __entry->nr_deactivated = nr_deactivated; + __entry->nr_referenced = nr_referenced; + __entry->priority = priority; + __entry->reclaim_flags = trace_shrink_flags(file); + ), + + TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", + __entry->nid, + __entry->nr_taken, + __entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_inactive_list_is_low, + + TP_PROTO(int nid, int reclaim_idx, + unsigned long total_inactive, unsigned long inactive, + unsigned long total_active, unsigned long active, + unsigned long ratio, int file), + + TP_ARGS(nid, reclaim_idx, total_inactive, inactive, total_active, active, ratio, file), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, reclaim_idx) + __field(unsigned long, total_inactive) + __field(unsigned long, inactive) + __field(unsigned long, total_active) + __field(unsigned long, active) + __field(unsigned long, ratio) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->reclaim_idx = reclaim_idx; + __entry->total_inactive = total_inactive; + __entry->inactive = inactive; + __entry->total_active = total_active; + __entry->active = active; + __entry->ratio = ratio; + __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU; + ), + + TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s", + __entry->nid, + __entry->reclaim_idx, + __entry->total_inactive, __entry->inactive, + __entry->total_active, __entry->active, + __entry->ratio, + show_reclaim_flags(__entry->reclaim_flags)) +); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 5c06f4af8323..00f643164ca2 100644 --- a/include/trace/trace_events.h +++ 
b/include/trace/trace_events.h @@ -283,8 +283,16 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_symbols_seq(p, value, symbols); \ }) +#undef __print_flags_u64 #undef __print_symbolic_u64 #if BITS_PER_LONG == 32 +#define __print_flags_u64(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags_u64 __flags[] = \ + { flag_array, { -1, NULL } }; \ + trace_print_flags_seq_u64(p, delim, flag, __flags); \ + }) + #define __print_symbolic_u64(value, symbol_array...) \ ({ \ static const struct trace_print_flags_u64 symbols[] = \ @@ -292,6 +300,9 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_symbols_seq_u64(p, value, symbols); \ }) #else +#define __print_flags_u64(flag, delim, flag_array...) \ + __print_flags(flag, delim, flag_array) + #define __print_symbolic_u64(value, symbol_array...) \ __print_symbolic(value, symbol_array) #endif diff --git a/include/uapi/asm-generic/ioctl.h b/include/uapi/asm-generic/ioctl.h index 7e7c11b52143..749b32fe5623 100644 --- a/include/uapi/asm-generic/ioctl.h +++ b/include/uapi/asm-generic/ioctl.h @@ -48,6 +48,9 @@ /* * Direction bits, which any architecture can choose to override * before including this file. + * + * NOTE: _IOC_WRITE means userland is writing and kernel is + * reading. _IOC_READ means userland is reading and kernel is writing. */ #ifndef _IOC_NONE @@ -72,7 +75,12 @@ #define _IOC_TYPECHECK(t) (sizeof(t)) #endif -/* used to create numbers */ +/* + * Used to create numbers. + * + * NOTE: _IOW means userland is writing and kernel is reading. _IOR + * means userland is reading and kernel is writing. + */ #define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) #define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size))) #define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9057d7af3ae1..9ac4b68c54d1 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,13 +11,18 @@ #include <linux/types.h> -#define UFFD_API ((__u64)0xAA) /* - * After implementing the respective features it will become: - * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ - * UFFD_FEATURE_EVENT_FORK) + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). */ -#define UFFD_API_FEATURES (0) +#define UFFD_API ((__u64)0xAA) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -26,6 +31,9 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) +#define UFFD_API_RANGE_IOCTLS_BASIC \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY) /* * Valid ioctl command number range with this API is from 0x00 to @@ -72,6 +80,21 @@ struct uffd_msg { } pagefault; struct { + __u32 ufd; + } fork; + + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + + struct { + __u64 start; + __u64 end; + } madv_dn; + + struct { /* unused reserved fields */ __u64 reserved1; __u64 reserved2; @@ -84,9 +107,9 @@ struct uffd_msg { * Start at 0x12 and not at 0 to be more strict against bugs. 
*/ #define UFFD_EVENT_PAGEFAULT 0x12 -#if 0 /* not available yet */ #define UFFD_EVENT_FORK 0x13 -#endif +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_MADVDONTNEED 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -104,11 +127,37 @@ struct uffdio_api { * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). */ -#if 0 /* not available yet */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) -#endif +#define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; __u64 ioctls; diff --git a/init/Kconfig b/init/Kconfig index 483ad679aa37..8c39615165b7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1781,6 +1781,20 @@ config SLUB_DEBUG SLUB sysfs support. /sys/slab will not exist and there will be no support for cache validation etc. +config SLUB_MEMCG_SYSFS_ON + default n + bool "Enable memcg SLUB sysfs support by default" if EXPERT + depends on SLUB && SYSFS && MEMCG + help + SLUB creates a directory under /sys/kernel/slab for each + allocation cache to host info and debug files. If memory + cgroup is enabled, each cache can have per memory cgroup + caches. SLUB can create the same sysfs directories for these + caches under /sys/kernel/slab/CACHE/cgroup but it can lead + to a very high number of debug files being created. This is + controlled by slub_memcg_sysfs boot parameter and this + config option determines the parameter's default value. 
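[Editor's note on the userfaultfd.h hunk above: the comment added there describes two ways to probe optional features at UFFDIO_API time — either demand a feature up front and treat an error as "unsupported", or request nothing and check which bits the kernel reports back in uffdio_api.features. The userspace sketch below illustrates the second method. It is not part of this patch; it assumes a libc that exposes __NR_userfaultfd, a kernel/header with the new feature flags, and keeps error handling minimal.]

/* Illustrative userspace probe, not part of the patch. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
    struct uffdio_api api;
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

    if (uffd < 0) {
        perror("userfaultfd");
        return 1;
    }

    memset(&api, 0, sizeof(api));
    api.api = UFFD_API;
    api.features = 0;   /* request nothing, just let the kernel report support */
    if (ioctl(uffd, UFFDIO_API, &api)) {
        perror("UFFDIO_API");
        return 1;
    }

    printf("hugetlbfs missing faults: %s\n",
           (api.features & UFFD_FEATURE_MISSING_HUGETLBFS) ? "yes" : "no");
    printf("shmem missing faults:     %s\n",
           (api.features & UFFD_FEATURE_MISSING_SHMEM) ? "yes" : "no");
    printf("fork/remap/madv events:   %s\n",
           (api.features & (UFFD_FEATURE_EVENT_FORK |
                            UFFD_FEATURE_EVENT_REMAP |
                            UFFD_FEATURE_EVENT_MADVDONTNEED)) ? "some" : "none");
    close(uffd);
    return 0;
}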
+ config COMPAT_BRK bool "Disable heap randomization" default y diff --git a/kernel/fork.c b/kernel/fork.c index ff82e24573b6..d12fcc4db8a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include <linux/rmap.h> #include <linux/ksm.h> #include <linux/acct.h> +#include <linux/userfaultfd_k.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> @@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; + LIST_HEAD(uf); uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= - ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; - tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -678,6 +681,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index aea6a1218c7d..070866c32eb9 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -124,6 +124,44 @@ EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 const char * +trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array) +{ + unsigned long long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%llx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq_u64); + +const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 12b8dd640786..b5de262a9eb9 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -137,12 +137,14 @@ static void watchdog_overflow_callback(struct perf_event *event, * Reduce the watchdog noise by only printing messages * that are different from what cpu0 displayed. 
*/ -static unsigned long cpu0_err; +static unsigned long firstcpu_err; +static atomic_t watchdog_cpus; int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + int firstcpu = 0; /* nothing to do if the hard lockup detector is disabled */ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) @@ -156,19 +158,22 @@ int watchdog_nmi_enable(unsigned int cpu) if (event != NULL) goto out_enable; + if (atomic_inc_return(&watchdog_cpus) == 1) + firstcpu = 1; + wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); + /* save the first cpu's error for future comparision */ + if (firstcpu && IS_ERR(event)) + firstcpu_err = PTR_ERR(event); if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) + /* only print for the first cpu initialized */ + if (firstcpu || firstcpu_err) pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } @@ -186,7 +191,7 @@ int watchdog_nmi_enable(unsigned int cpu) smp_mb__after_atomic(); /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) return PTR_ERR(event); /* vary the KERN level based on the returned errno */ @@ -222,9 +227,9 @@ void watchdog_nmi_disable(unsigned int cpu) /* should be in cleanup, but blocks oprofile */ perf_event_release_kernel(event); - } - if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; + if (atomic_dec_and_test(&watchdog_cpus)) + firstcpu_err = 0; } } diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 8971370bfb16..60c57ec936db 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -1155,6 +1155,11 @@ static void check_unmap(struct dma_debug_entry *ref) dir2name[ref->direction]); } + /* + * Drivers should use dma_mapping_error() to check the returned + * addresses of dma_map_single() and dma_map_page(). + * If not, print this warning message. See Documentation/DMA-API.txt. 
+ */ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { err_printk(ref->dev, entry, "DMA-API: device driver failed to check map error" diff --git a/lib/show_mem.c b/lib/show_mem.c index 1feed6a2b12a..0beaa1d899aa 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -9,13 +9,13 @@ #include <linux/quicklist.h> #include <linux/cma.h> -void show_mem(unsigned int filter) +void show_mem(unsigned int filter, nodemask_t *nodemask) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); - show_free_areas(filter); + show_free_areas(filter, nodemask); for_each_online_pgdat(pgdat) { unsigned long flags; diff --git a/mm/Makefile b/mm/Makefile index 295bd7a9f76b..433eaf9a876e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -35,7 +35,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 39ce616a9d71..6d861d090e9f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -411,8 +411,8 @@ retry: while (*node != NULL) { parent = *node; - congested = container_of(parent, struct bdi_writeback_congested, - rb_node); + congested = rb_entry(parent, struct bdi_writeback_congested, + rb_node); if (congested->blkcg_id < blkcg_id) node = &parent->rb_left; else if (congested->blkcg_id > blkcg_id) diff --git a/mm/bootmem.c b/mm/bootmem.c index e8a55a3c9feb..9fedb27c6451 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -53,7 +53,7 @@ early_param("bootmem_debug", bootmem_debug_setup); static unsigned long __init bootmap_bytes(unsigned long pages) { - unsigned long bytes = DIV_ROUND_UP(pages, 8); + unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE); return ALIGN(bytes, sizeof(long)); } diff --git a/mm/compaction.c b/mm/compaction.c index 949198d01260..0aa2757399ee 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -548,7 +548,7 @@ isolate_fail: if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); - count_compact_events(COMPACTFREE_SCANNED, nr_scanned); + cc->total_free_scanned += nr_scanned; if (total_isolated) count_compact_events(COMPACTISOLATED, total_isolated); return total_isolated; @@ -931,7 +931,7 @@ isolate_fail: trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, nr_scanned, nr_isolated); - count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); + cc->total_migrate_scanned += nr_scanned; if (nr_isolated) count_compact_events(COMPACTISOLATED, nr_isolated); @@ -1631,6 +1631,9 @@ out: zone->compact_cached_free_pfn = free_pfn; } + count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); + count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned); + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); @@ -1645,6 +1648,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .order = order, .gfp_mask = gfp_mask, .zone = zone, @@ -1757,6 +1762,8 @@ static void compact_node(int nid) struct zone *zone; struct compact_control cc = { .order = -1, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, @@ -1883,6 +1890,8 @@ static void 
kcompactd_do_work(pg_data_t *pgdat) struct zone *zone; struct compact_control cc = { .order = pgdat->kcompactd_max_order, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = true, @@ -1891,7 +1900,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); - count_vm_event(KCOMPACTD_WAKE); + count_compact_event(KCOMPACTD_WAKE); for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { int status; @@ -1909,6 +1918,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.nr_freepages = 0; cc.nr_migratepages = 0; + cc.total_migrate_scanned = 0; + cc.total_free_scanned = 0; cc.zone = zone; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1927,6 +1938,11 @@ static void kcompactd_do_work(pg_data_t *pgdat) defer_compaction(zone, cc.order); } + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); + VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); } @@ -1950,6 +1966,13 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; + /* + * Pairs with implicit barrier in wait_event_freezable() + * such that wakeups are not missed in the lockless + * waitqueue_active() call. + */ + smp_acquire__after_ctrl_dep(); + if (pgdat->kcompactd_classzone_idx > classzone_idx) pgdat->kcompactd_classzone_idx = classzone_idx; diff --git a/mm/filemap.c b/mm/filemap.c index 3f9afded581b..416d563468a3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -788,7 +788,7 @@ static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void return autoremove_wake_function(wait, mode, sync, key); } -void wake_up_page_bit(struct page *page, int bit_nr) +static void wake_up_page_bit(struct page *page, int bit_nr) { wait_queue_head_t *q = page_waitqueue(page); struct wait_page_key key; @@ -821,7 +821,13 @@ void wake_up_page_bit(struct page *page, int bit_nr) } spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(wake_up_page_bit); + +static void wake_up_page(struct page *page, int bit) +{ + if (!PageWaiters(page)) + return; + wake_up_page_bit(page, bit); +} static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) @@ -1013,7 +1019,7 @@ EXPORT_SYMBOL_GPL(page_endio); /** * __lock_page - get a lock on the page, assuming we need to sleep to get it - * @page: the page to lock + * @__page: the page to lock */ void __lock_page(struct page *__page) { @@ -572,7 +572,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags); + gup_flags, nonblocking); continue; } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5f3ad65c85de..f9ecc2aeadfc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -142,42 +142,6 @@ static struct shrinker huge_zero_page_shrinker = { }; #ifdef CONFIG_SYSFS - -static ssize_t triple_flag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count, - enum transparent_hugepage_flag enabled, - enum transparent_hugepage_flag deferred, - enum transparent_hugepage_flag req_madv) -{ - if (!memcmp("defer", buf, - min(sizeof("defer")-1, count))) { - if (enabled == deferred) 
- return -EINVAL; - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(deferred, &transparent_hugepage_flags); - } else if (!memcmp("always", buf, - min(sizeof("always")-1, count))) { - clear_bit(deferred, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(enabled, &transparent_hugepage_flags); - } else if (!memcmp("madvise", buf, - min(sizeof("madvise")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - set_bit(req_madv, &transparent_hugepage_flags); - } else if (!memcmp("never", buf, - min(sizeof("never")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - } else - return -EINVAL; - - return count; -} - static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -193,19 +157,28 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - ssize_t ret; + ssize_t ret = count; - ret = triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + ret = -EINVAL; if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } - return ret; } static struct kobj_attribute enabled_attr = @@ -241,32 +214,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, return count; } -/* - * Currently defrag only disables __GFP_NOWAIT for allocation. A blind - * __GFP_REPEAT is too aggressive, it's never worth swapping tons of - * memory just to allocate one more hugepage. 
- */ static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] defer madvise never\n"); + return sprintf(buf, "[always] defer defer+madvise madvise never\n"); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [defer] madvise never\n"); - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer [madvise] never\n"); - else - return sprintf(buf, "always defer madvise [never]\n"); - + return sprintf(buf, "always [defer] defer+madvise madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer [defer+madvise] madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer defer+madvise [madvise] never\n"); + return sprintf(buf, "always defer defer+madvise madvise [never]\n"); } + static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer", buf, + min(sizeof("defer")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer+madvise", buf, + min(sizeof("defer+madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; } 
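[Editor's note on the defrag_show()/defrag_store() rework above: it adds a "defer+madvise" policy to /sys/kernel/mm/transparent_hugepage/defrag, under which ordinary faults get only background reclaim/compaction while MADV_HUGEPAGE regions may stall for direct reclaim and compaction. The sketch below shows how userspace might opt a single mapping into the stalling behaviour; it is not part of this patch, the sysfs write requires root, and the 16 MB mapping size is an arbitrary illustration.]

/* Illustrative userspace sketch, not part of the patch. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    const char *mode = "defer+madvise";
    size_t len = 16UL << 20;    /* arbitrary 16 MB region */
    void *buf;
    int fd;

    /* Select the new policy (needs root). */
    fd = open("/sys/kernel/mm/transparent_hugepage/defrag", O_WRONLY);
    if (fd < 0 || write(fd, mode, strlen(mode)) < 0)
        perror("defrag sysfs");
    if (fd >= 0)
        close(fd);

    buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Only this madvised range may stall for compaction under defer+madvise. */
    if (madvise(buf, len, MADV_HUGEPAGE))
        perror("madvise");

    memset(buf, 0, len);    /* touching the range can now fault in huge pages */
    munmap(buf, len);
    return 0;
}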
static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); @@ -612,25 +611,28 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } /* - * If THP defrag is set to always then directly reclaim/compact as necessary - * If set to defer then do only background reclaim/compact and defer to khugepaged - * If set to madvise and the VMA is flagged then directly reclaim/compact - * When direct reclaim/compact is allowed, don't retry except for flagged VMA's + * always: directly stall for all thp allocations + * defer: wake kswapd and fail if not immediately available + * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise + * fail if not immediately available + * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately + * available + * never: never stall for any thp allocation */ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { - bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, - &transparent_hugepage_flags) && vma_madvised) - return GFP_TRANSHUGE; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); - + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + 0); return GFP_TRANSHUGE_LIGHT; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c7025c132670..30e7709a5121 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -32,6 +32,7 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> +#include <linux/userfaultfd_k.h> #include "internal.h" int hugepages_treat_as_movable; @@ -3680,6 +3681,38 @@ retry: size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; + + /* + * Check for page in userfault range + */ + if (userfaultfd_missing(vma)) { + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = address, + .flags = flags, + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex must be dropped before + * handling userfault. Reacquire after handling + * fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, + idx, address); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + ret = handle_userfault(&vmf, VM_UFFD_MISSING); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + goto out; + } + page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); @@ -3948,10 +3981,113 @@ out_mutex: return ret; } +/* + * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with + * modifications for huge pages. 
+ */ +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + int vm_shared = dst_vma->vm_flags & VM_SHARED; + struct hstate *h = hstate_vma(dst_vma); + pte_t _dst_pte; + spinlock_t *ptl; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) + goto out; + + ret = copy_huge_page_from_user(page, + (const void __user *) src_addr, + pages_per_huge_page(h), false); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + set_page_huge_active(page); + + /* + * If shared, add to page cache + */ + if (vm_shared) { + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + ret = huge_add_to_page_cache(page, mapping, idx); + if (ret) + goto out_release_nounlock; + } + + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); + spin_lock(ptl); + + ret = -EEXIST; + if (!huge_pte_none(huge_ptep_get(dst_pte))) + goto out_release_unlock; + + if (vm_shared) { + page_dup_rmap(page, true); + } else { + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + } + + _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = huge_pte_mkdirty(_dst_pte); + _dst_pte = pte_mkyoung(_dst_pte); + + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, + dst_vma->vm_flags & VM_WRITE); + hugetlb_count_add(pages_per_huge_page(h), dst_mm); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + spin_unlock(ptl); + if (vm_shared) + unlock_page(page); + ret = 0; +out: + return ret; +out_release_unlock: + spin_unlock(ptl); +out_release_nounlock: + if (vm_shared) + unlock_page(page); + put_page(page); + goto out; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags) + long i, unsigned int flags, int *nonblocking) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -4014,16 +4150,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, ((flags & FOLL_WRITE) && !huge_pte_write(huge_ptep_get(pte)))) { int ret; + unsigned int fault_flags = 0; if (pte) spin_unlock(ptl); - ret = hugetlb_fault(mm, vma, vaddr, - (flags & FOLL_WRITE) ? 
FAULT_FLAG_WRITE : 0); - if (!(ret & VM_FAULT_ERROR)) - continue; - - remainder = 0; - break; + if (flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT; + if (flags & FOLL_TRIED) { + VM_WARN_ON_ONCE(fault_flags & + FAULT_FLAG_ALLOW_RETRY); + fault_flags |= FAULT_FLAG_TRIED; + } + ret = hugetlb_fault(mm, vma, vaddr, fault_flags); + if (ret & VM_FAULT_ERROR) { + remainder = 0; + break; + } + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + *nr_pages = 0; + /* + * VM_FAULT_RETRY must not return an + * error, it will return zero + * instead. + * + * No need to update "position" as the + * caller will not check it after + * *nr_pages is set to 0. + */ + return i; + } + continue; } pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; @@ -4052,6 +4215,11 @@ same_page: spin_unlock(ptl); } *nr_pages = remainder; + /* + * setting position is actually required only if remainder is + * not zero but it's faster not to add a "if (remainder)" + * branch. + */ *position = vaddr; return i ? i : -EFAULT; diff --git a/mm/internal.h b/mm/internal.h index 7aa2ea0a8623..8ab72f4374e0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -43,6 +43,11 @@ int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); +} + void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, @@ -133,9 +138,9 @@ struct alloc_context { * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) +__find_buddy_pfn(unsigned long page_pfn, unsigned int order) { - return page_idx ^ (1 << order); + return page_pfn ^ (1 << order); } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, @@ -175,6 +180,8 @@ struct compact_control { struct list_head migratepages; /* List of pages being migrated */ unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long total_migrate_scanned; + unsigned long total_free_scanned; unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ diff --git a/mm/madvise.c b/mm/madvise.c index 0e3828eae9f8..b530a4986035 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -10,6 +10,7 @@ #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> +#include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/sched.h> @@ -24,6 +25,8 @@ #include <asm/tlb.h> +#include "internal.h" + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_sem for writing. 
Others, which simply traverse vmas, need @@ -473,10 +476,11 @@ static long madvise_dontneed(struct vm_area_struct *vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + if (!can_madv_dontneed_vma(vma)) return -EINVAL; - zap_page_range(vma, start, end - start, NULL); + madvise_userfault_dontneed(vma, prev, start, end); + zap_page_range(vma, start, end - start); return 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 7608bc305936..c004f52be419 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -611,10 +611,10 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - 0UL, (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_add: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } @@ -718,10 +718,10 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { - memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg(" memblock_free: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); return memblock_remove_range(&memblock.reserved, base, size); @@ -729,10 +729,10 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - 0UL, (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); } @@ -1105,6 +1105,31 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } +unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, + unsigned long max_pfn) +{ + struct memblock_type *type = &memblock.memory; + unsigned int right = type->cnt; + unsigned int mid, left = 0; + phys_addr_t addr = PFN_PHYS(pfn + 1); + + do { + mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else { + /* addr is within the region, so pfn + 1 is valid */ + return min(pfn + 1, max_pfn); + } + } while (left < right); + + return min(PHYS_PFN(type->regions[right].base), max_pfn); +} + /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for @@ -1202,8 +1227,8 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys alloc = __memblock_alloc_base(size, align, max_addr); if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); + panic("ERROR: Failed to allocate %pa bytes below %pa.\n", + &size, &max_addr); return alloc; } @@ -1274,18 +1299,17 @@ static void * __init memblock_virt_alloc_internal( if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; - again: 
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, nid, flags); - if (alloc) + if (alloc && !memblock_reserve(alloc, size)) goto done; if (nid != NUMA_NO_NODE) { alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, NUMA_NO_NODE, flags); - if (alloc) + if (alloc && !memblock_reserve(alloc, size)) goto done; } @@ -1303,7 +1327,6 @@ again: return NULL; done: - memblock_reserve(alloc, size); ptr = phys_to_virt(alloc); memset(ptr, 0, size); @@ -1615,8 +1638,7 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size if (idx == -1) return 0; - return memblock.memory.regions[idx].base <= base && - (memblock.memory.regions[idx].base + + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size) >= end; } @@ -1673,7 +1695,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { - unsigned long long base, size; + phys_addr_t base, end, size; unsigned long flags; int idx; struct memblock_region *rgn; @@ -1685,23 +1707,24 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name base = rgn->base; size = rgn->size; + end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", - name, idx, base, base + size - 1, size, nid_buf, flags); + pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", + name, idx, &base, &end, &size, nid_buf, flags); } } void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); - pr_info(" memory size = %#llx reserved size = %#llx\n", - (unsigned long long)memblock.memory.total_size, - (unsigned long long)memblock.reserved.total_size); + pr_info(" memory size = %pa reserved size = %pa\n", + &memblock.memory.total_size, + &memblock.reserved.total_size); memblock_dump(&memblock.memory, "memory"); memblock_dump(&memblock.reserved, "reserved"); @@ -1727,19 +1750,14 @@ static int memblock_debug_show(struct seq_file *m, void *private) struct memblock_type *type = m->private; struct memblock_region *reg; int i; + phys_addr_t end; for (i = 0; i < type->cnt; i++) { reg = &type->regions[i]; - seq_printf(m, "%4d: ", i); - if (sizeof(phys_addr_t) == 4) - seq_printf(m, "0x%08lx..0x%08lx\n", - (unsigned long)reg->base, - (unsigned long)(reg->base + reg->size - 1)); - else - seq_printf(m, "0x%016llx..0x%016llx\n", - (unsigned long long)reg->base, - (unsigned long long)(reg->base + reg->size - 1)); + end = reg->base + reg->size - 1; + seq_printf(m, "%4d: ", i); + seq_printf(m, "%pa..%pa\n", ®->base, &end); } return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b822e158b319..1fd6affcdde7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -317,6 +317,8 @@ void memcg_put_cache_ids(void) DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); +struct workqueue_struct *memcg_kmem_cache_wq; + #endif /* !CONFIG_SLOB */ /** @@ -2143,8 +2145,6 @@ struct memcg_kmem_cache_create_work { struct work_struct work; }; -static struct workqueue_struct *memcg_kmem_cache_create_wq; - static void memcg_kmem_cache_create_func(struct work_struct *w) { struct memcg_kmem_cache_create_work *cw = @@ -2176,7 +2176,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, cw->cachep = cachep; 
INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - queue_work(memcg_kmem_cache_create_wq, &cw->work); + queue_work(memcg_kmem_cache_wq, &cw->work); } static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, @@ -2837,6 +2837,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; + INIT_LIST_HEAD(&memcg->kmem_caches); return 0; } @@ -4002,9 +4003,9 @@ static struct cftype mem_cgroup_legacy_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .seq_start = slab_start, - .seq_next = slab_next, - .seq_stop = slab_stop, + .seq_start = memcg_slab_start, + .seq_next = memcg_slab_next, + .seq_stop = memcg_slab_stop, .seq_show = memcg_slab_show, }, #endif @@ -5777,12 +5778,12 @@ static int __init mem_cgroup_init(void) #ifndef CONFIG_SLOB /* * Kmem cache creation is mostly done with the slab_mutex held, - * so use a special workqueue to avoid stalling all worker - * threads in case lots of cgroups are created simultaneously. + * so use a workqueue with limited concurrency to avoid stalling + * all worker threads in case lots of cgroups are created and + * destroyed simultaneously. */ - memcg_kmem_cache_create_wq = - alloc_ordered_workqueue("memcg_kmem_cache_create", 0); - BUG_ON(!memcg_kmem_cache_create_wq); + memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); + BUG_ON(!memcg_kmem_cache_wq); #endif cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, diff --git a/mm/memory.c b/mm/memory.c index 6bf2b471e30c..7663068a33c6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1155,12 +1155,6 @@ again: if (!PageAnon(page)) { if (pte_dirty(ptent)) { - /* - * oom_reaper cannot tear down dirty - * pages - */ - if (unlikely(details && details->ignore_dirty)) - continue; force_flush = 1; set_page_dirty(page); } @@ -1179,8 +1173,8 @@ again: } continue; } - /* only check swap_entries if explicitly asked for in details */ - if (unlikely(details && !details->check_swap_entries)) + /* If details->check_mapping, we leave swap entries. 
*/ + if (unlikely(details)) continue; entry = pte_to_swp_entry(ptent); @@ -1376,12 +1370,11 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap - * @details: details of shared cache invalidation * * Caller must protect the VMA list */ void zap_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long size, struct zap_details *details) + unsigned long size) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; @@ -1392,7 +1385,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) - unmap_single_vma(&tlb, vma, start, end, details); + unmap_single_vma(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } @@ -3471,12 +3464,10 @@ out: static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; - if (vma_is_anonymous(vma)) + if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); - if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, - vmf->flags); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf); return VM_FAULT_FALLBACK; } @@ -3485,8 +3476,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, - vmf->pmd, vmf->flags); + return vmf->vma->vm_ops->pmd_fault(vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); @@ -4155,6 +4145,38 @@ void copy_user_huge_page(struct page *dst, struct page *src, copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } + +long copy_huge_page_from_user(struct page *dst_page, + const void __user *usr_src, + unsigned int pages_per_huge_page, + bool allow_pagefault) +{ + void *src = (void *)usr_src; + void *page_kaddr; + unsigned long i, rc = 0; + unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; + + for (i = 0; i < pages_per_huge_page; i++) { + if (allow_pagefault) + page_kaddr = kmap(dst_page + i); + else + page_kaddr = kmap_atomic(dst_page + i); + rc = copy_from_user(page_kaddr, + (const void __user *)(src + i * PAGE_SIZE), + PAGE_SIZE); + if (allow_pagefault) + kunmap(dst_page + i); + else + kunmap_atomic(page_kaddr); + + ret_val -= (PAGE_SIZE - rc); + if (rc) + break; + + cond_resched(); + } + return ret_val; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b8c11e063ff0..d67787d10ff0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -179,7 +179,7 @@ static void release_memory_resource(struct resource *res) void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) { - page->lru.next = (struct list_head *) type; + page->freelist = (void *)type; SetPagePrivate(page); set_page_private(page, info); page_ref_inc(page); @@ -189,11 +189,12 @@ void put_page_bootmem(struct page *page) { unsigned long type; - type = (unsigned long) page->lru.next; + type = (unsigned long) page->freelist; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { + 
page->freelist = NULL; ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); diff --git a/mm/mmap.c b/mm/mmap.c index dc4291dcc99b..b729084eea90 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2806,11 +2806,11 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk(unsigned long addr, unsigned long request) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - unsigned long flags, len; + unsigned long len; struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; @@ -2821,7 +2821,10 @@ static int do_brk(unsigned long addr, unsigned long request) if (!len) return 0; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (offset_in_page(error)) @@ -2889,7 +2892,12 @@ out: return 0; } -int vm_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len) +{ + return do_brk_flags(addr, len, 0); +} + +int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; int ret; @@ -2898,13 +2906,19 @@ int vm_brk(unsigned long addr, unsigned long len) if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk(addr, len); + ret = do_brk_flags(addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); if (populate && !ret) mm_populate(addr, len); return ret; } +EXPORT_SYMBOL(vm_brk_flags); + +int vm_brk(unsigned long addr, unsigned long len) +{ + return vm_brk_flags(addr, len, 0); +} EXPORT_SYMBOL(vm_brk); /* Release all mmaps. */ diff --git a/mm/mmzone.c b/mm/mmzone.c index 5652be858e5e..a51c0a67ea3d 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -60,7 +60,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, * Find the next suitable zone to use for the allocation. * Only filter based on nodemask if it's set */ - if (likely(nodes == NULL)) + if (unlikely(nodes == NULL)) while (zonelist_zone_idx(z) > highest_zoneidx) z++; else diff --git a/mm/mprotect.c b/mm/mprotect.c index f9c07f54dd62..a45b4dc6a7f5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -33,34 +33,6 @@ #include "internal.h" -/* - * For a prot_numa update we only hold mmap_sem for read so there is a - * potential race with faulting where a pmd was temporarily none. This - * function checks for a transhuge pmd under the appropriate lock. It - * returns a pte if it was successfully locked or NULL if it raced with - * a transhuge insertion. 
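The mm/mmap.c hunks above introduce do_brk_flags()/vm_brk_flags() so an in-kernel loader can request an executable anonymous mapping directly; any flag other than VM_EXEC is refused with -EINVAL. A hypothetical caller (sketch only, not taken from the patch) might look like:

#include <linux/mm.h>

/* Map an anonymous region for a loader, optionally executable.
 * vm_brk_flags() accepts only VM_EXEC beyond the default flags. */
static int example_map_loader_region(unsigned long addr, unsigned long len,
                                     bool executable)
{
        return vm_brk_flags(addr, len, executable ? VM_EXEC : 0);
}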
- */ -static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, int prot_numa, spinlock_t **ptl) -{ - pte_t *pte; - spinlock_t *pmdl; - - /* !prot_numa is protected by mmap_sem held for write */ - if (!prot_numa) - return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); - - pmdl = pmd_lock(vma->vm_mm, pmd); - if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { - spin_unlock(pmdl); - return NULL; - } - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); - spin_unlock(pmdl); - return pte; -} - static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -71,7 +43,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long pages = 0; int target_node = NUMA_NO_NODE; - pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + /* + * Can be called with only the mmap_sem for reading by + * prot_numa so we must check the pmd isn't constantly + * changing from under us from pmd_none to pmd_trans_huge + * and/or the other way around. + */ + if (pmd_trans_unstable(pmd)) + return 0; + + /* + * The pmd points to a regular pte so the pmd can't change + * from under us even if the mmap_sem is only hold for + * reading. + */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) return 0; @@ -177,8 +163,6 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); - if (pmd_trans_unstable(pmd)) - continue; } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa); diff --git a/mm/mremap.c b/mm/mremap.c index 30d7d2482eea..8779928d6a70 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/mm-arch-hooks.h> +#include <linux/userfaultfd_k.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -250,7 +251,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr, bool *locked) + unsigned long new_len, unsigned long new_addr, + bool *locked, struct vm_userfaultfd_ctx *uf) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -309,6 +311,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_addr = new_addr; new_addr = err; } else { + mremap_userfaultfd_prep(new_vma, uf); arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } @@ -413,7 +416,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, - unsigned long new_addr, unsigned long new_len, bool *locked) + unsigned long new_addr, unsigned long new_len, bool *locked, + struct vm_userfaultfd_ctx *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -458,7 +462,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); if (!(offset_in_page(ret))) goto out; out1: @@ -497,6 +501,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 
0; bool locked = false; + struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -523,7 +528,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + &locked, &uf); goto out; } @@ -592,7 +597,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, + &locked, &uf); } out: if (offset_in_page(ret)) { @@ -602,5 +608,6 @@ out: up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); + mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 24f9f5f39145..bc964c26be8c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1191,7 +1191,7 @@ error_free: enomem: pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } @@ -1412,13 +1412,13 @@ error_getting_vma: kmem_cache_free(vm_region_jar, region); pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; error_getting_region: pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ec9f11d4f094..8256788ac119 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -417,7 +417,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (oc->memcg) mem_cgroup_print_oom_info(oc->memcg, p); else - show_mem(SHOW_MEM_FILTER_NODES); + show_mem(SHOW_MEM_FILTER_NODES, nm); if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } @@ -465,8 +465,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct zap_details details = {.check_swap_entries = true, - .ignore_dirty = true}; bool ret = true; /* @@ -510,14 +508,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (is_vm_hugetlb_page(vma)) - continue; - - /* - * mlocked VMAs require explicit munlocking before unmap. - * Let's keep it simple here and skip such VMAs. - */ - if (vma->vm_flags & VM_LOCKED) + if (!can_madv_dontneed_vma(vma)) continue; /* @@ -532,7 +523,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - &details); + NULL); } tlb_finish_mmu(&tlb, 0, -1); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", @@ -1013,7 +1004,7 @@ bool out_of_memory(struct oom_control *oc) * make sure exclude 0 mask - all other users should have at least * ___GFP_DIRECT_RECLAIM to get here. 
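The oom_kill.c hunk above replaces the open-coded hugetlb and mlocked-VMA checks with a single can_madv_dontneed_vma() test; as the name suggests, the reaper now skips exactly those VMAs that MADV_DONTNEED could not handle either. The helper itself is not shown in this hunk; a plausible sketch consistent with the checks it replaces (hugetlb and mlocked mappings, plus PFN mappings) would be:

#include <linux/mm.h>

/* Assumed shape of the helper (an mm-internal predicate in this series):
 * a VMA the oom reaper may unmap is one MADV_DONTNEED would accept. */
static inline bool example_can_madv_dontneed_vma(struct vm_area_struct *vma)
{
        return !(vma->vm_flags & (VM_LOCKED | VM_HUGETLB | VM_PFNMAP));
}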
*/ - if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) return true; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3e0c69a97b7..c21b33668133 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,6 +55,7 @@ #include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> +#include <trace/events/oom.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> @@ -714,7 +715,7 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is not in a hole && + * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. @@ -729,9 +730,6 @@ static inline void rmv_page_order(struct page *page) static inline int page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { - if (!pfn_valid_within(page_to_pfn(buddy))) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { if (page_zone_id(page) != page_zone_id(buddy)) return 0; @@ -787,9 +785,8 @@ static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order, int migratetype) { - unsigned long page_idx; - unsigned long combined_idx; - unsigned long uninitialized_var(buddy_idx); + unsigned long combined_pfn; + unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; @@ -802,15 +799,16 @@ static inline void __free_one_page(struct page *page, if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - page_idx = pfn & ((1 << MAX_ORDER) - 1); - - VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: while (order < max_order - 1) { - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); + + if (!pfn_valid_within(buddy_pfn)) + goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -824,9 +822,9 @@ continue_merging: zone->free_area[order].nr_free--; rmv_page_order(buddy); } - combined_idx = buddy_idx & page_idx; - page = page + (combined_idx - page_idx); - page_idx = combined_idx; + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; order++; } if (max_order < MAX_ORDER) { @@ -841,8 +839,8 @@ continue_merging: if (unlikely(has_isolate_pageblock(zone))) { int buddy_mt; - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); buddy_mt = get_pageblock_migratetype(buddy); if (migratetype != buddy_mt @@ -865,12 +863,12 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { struct page *higher_page, *higher_buddy; - combined_idx = buddy_idx & page_idx; - higher_page = page + (combined_idx - page_idx); - buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = higher_page + (buddy_idx - combined_idx); + combined_pfn = buddy_pfn & pfn; + 
higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -3007,18 +3005,12 @@ static inline bool should_suppress_show_mem(void) return ret; } -static DEFINE_RATELIMIT_STATE(nopage_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; - struct va_format vaf; - va_list args; + static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) return; /* @@ -3033,6 +3025,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; + show_mem(filter, nodemask); +} + +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) + return; + pr_warn("%s: ", current->comm); va_start(args, fmt); @@ -3041,11 +3047,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); + pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); + if (nodemask) + pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); + else + pr_cont("(null)\n"); + + cpuset_print_current_mems_allowed(); dump_stack(); - if (!should_suppress_show_mem()) - show_mem(filter); + warn_alloc_show_mem(gfp_mask, nodemask); +} + +static inline struct page * +__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, + const struct alloc_context *ac) +{ + struct page *page; + + page = get_page_from_freelist(gfp_mask, order, + alloc_flags|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + alloc_flags, ac); + + return page; } static inline struct page * @@ -3083,47 +3114,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - if (!(gfp_mask & __GFP_NOFAIL)) { - /* Coredumps can quickly deplete all memory reserves */ - if (current->flags & PF_DUMPCORE) - goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto out; - /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) - goto out; - if (pm_suspended_storage()) - goto out; - /* - * XXX: GFP_NOFS allocations should rather fail than rely on - * other request to make a forward progress. - * We are in an unfortunate situation where out_of_memory cannot - * do much for this context but let's try it to at least get - * access to memory reserved if the current task is killed (see - * out_of_memory). Once filesystems are ready to handle allocation - * failures more gracefully we should just bail out here. 
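The __free_one_page() hunks above stop translating pfns into indexes within a MAX_ORDER block and work on page frame numbers directly. The underlying arithmetic is a single bit flip at the buddy order, with the merged pair starting at the lower pfn; an illustrative sketch (the real __find_buddy_pfn() helper is outside this hunk):

/* Buddy of @pfn at @order: flip bit @order (assumed shape of
 * __find_buddy_pfn(), which replaces __find_buddy_index() here). */
static inline unsigned long example_find_buddy_pfn(unsigned long pfn,
                                                   unsigned int order)
{
        return pfn ^ (1UL << order);
}

/* First pfn of the merged order+1 block. */
static inline unsigned long example_combined_pfn(unsigned long pfn,
                                                 unsigned long buddy_pfn)
{
        return buddy_pfn & pfn;
}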
- */ + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + if (pm_suspended_storage()) + goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + + /* The OOM killer may not free memory on a specific node */ + if (gfp_mask & __GFP_THISNODE) + goto out; - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; - } /* Exhausted what can be done so it's blamo time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; - if (gfp_mask & __GFP_NOFAIL) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); - /* - * fallback to ignore cpuset restriction if our nodes - * are depleted - */ - if (!page) - page = get_page_from_freelist(gfp_mask, order, + /* + * Help non-failing allocations by giving them access to memory + * reserves + */ + if (gfp_mask & __GFP_NOFAIL) + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); - } } out: mutex_unlock(&oom_lock); @@ -3192,6 +3218,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, { int max_retries = MAX_COMPACT_RETRIES; int min_priority; + bool ret = false; + int retries = *compaction_retries; + enum compact_priority priority = *compact_priority; if (!order) return false; @@ -3213,8 +3242,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * But do not retry if the given zonelist is not suitable for * compaction. */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); + if (compaction_withdrawn(compact_result)) { + ret = compaction_zonelist_suitable(ac, order, alloc_flags); + goto out; + } /* * !costly requests are much more important than __GFP_REPEAT @@ -3226,8 +3257,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (*compaction_retries <= max_retries) - return true; + if (*compaction_retries <= max_retries) { + ret = true; + goto out; + } /* * Make sure there are attempts at the highest priority if we exhausted @@ -3236,12 +3269,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; - return true; + ret = true; } - return false; +out: + trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + return ret; } #else static inline struct page * @@ -3464,6 +3500,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, ac->nodemask) { unsigned long available; unsigned long reclaimable; + unsigned long min_wmark = min_wmark_pages(zone); + bool wmark; available = reclaimable = zone_reclaimable_pages(zone); available -= DIV_ROUND_UP((*no_progress_loops) * available, @@ -3474,8 +3512,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Would the allocation succeed if we reclaimed the whole * available? */ - if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags, available)) { + wmark = __zone_watermark_ok(zone, order, min_wmark, + ac_classzone_idx(ac), alloc_flags, available); + trace_reclaim_retry_zone(z, order, reclaimable, + available, min_wmark, *no_progress_loops, wmark); + if (wmark) { /* * If we didn't make any progress and have a lot of * dirty + writeback pages then we should wait for @@ -3555,6 +3596,14 @@ retry_cpuset: no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* * We need to recalculate the starting point for the zonelist iterator * because we might have used different nodemask in the fast path, or @@ -3566,14 +3615,6 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; - - /* - * The fast path uses conservative alloc_flags to succeed only until - * kswapd needs to be woken up, and to avoid the cost of setting up - * alloc_flags precisely. So we do that now. - */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); - if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3650,35 +3691,21 @@ retry: goto got_pg; /* Caller is not willing to reclaim, we can't balance anything */ - if (!can_direct_reclaim) { - /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually allow this type of allocation - * to fail. - */ - WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); + if (!can_direct_reclaim) goto nopage; - } - /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) { - /* - * __GFP_NOFAIL request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us. 
- */ - if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { - cond_resched(); - goto retry; - } - goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, ac->nodemask, + "page allocation stalls for %ums, order:%u", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; } - /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + /* Avoid recursion of direct reclaim */ + if (current->flags & PF_MEMALLOC) goto nopage; - /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); @@ -3702,14 +3729,6 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; @@ -3738,6 +3757,10 @@ retry: if (page) goto got_pg; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE)) + goto nopage; + /* Retry as long as the OOM killer is making progress */ if (did_some_progress) { no_progress_loops = 0; @@ -3755,7 +3778,48 @@ nopage: if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry_cpuset; - warn_alloc(gfp_mask, + /* + * Make sure that __GFP_NOFAIL request doesn't leak out and make sure + * we always retry + */ + if (gfp_mask & __GFP_NOFAIL) { + /* + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually require GFP_NOWAIT + */ + if (WARN_ON_ONCE(!can_direct_reclaim)) + goto fail; + + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + + /* + * non failing costly orders are a hard requirement which we + * are not prepared for much so let's warn about these users + * so that we can identify them and convert them to something + * else. + */ + WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + + /* + * Help non-failing allocations by giving them access to memory + * reserves but do not use ALLOC_NO_WATERMARKS because this + * could deplete whole memory reserves which would just make + * the situation worse + */ + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + if (page) + goto got_pg; + + cond_resched(); + goto retry; + } +fail: + warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; @@ -4252,20 +4316,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 
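With the slowpath restructured in the hunks above, __GFP_NOFAIL handling is consolidated on the nopage path: the request keeps retrying and, as a last resort, gets ALLOC_HARDER access to reserves (deliberately not ALLOC_NO_WATERMARKS). From a caller's point of view nothing changes; a small, blockable __GFP_NOFAIL allocation still never returns NULL, as in this illustrative snippet (not part of the patch):

#include <linux/slab.h>

struct example_ctl {
        int state;
};

/* No NULL check needed: the allocator's nopage path above loops for
 * __GFP_NOFAIL. Only small orders from sleepable context should do
 * this; costly orders now trigger a one-time warning. */
static struct example_ctl *example_alloc_nofail(void)
{
        return kmalloc(sizeof(struct example_ctl), GFP_KERNEL | __GFP_NOFAIL);
}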
*/ -bool skip_free_areas_node(unsigned int flags, int nid) +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { - bool ret = false; - unsigned int cpuset_mems_cookie; - if (!(flags & SHOW_MEM_FILTER_NODES)) - goto out; + return false; - do { - cpuset_mems_cookie = read_mems_allowed_begin(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (read_mems_allowed_retry(cpuset_mems_cookie)); -out: - return ret; + /* + * no node mask - aka implicit memory numa policy. Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -4306,7 +4370,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter) +void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; int cpu; @@ -4314,7 +4378,7 @@ void show_free_areas(unsigned int filter) pg_data_t *pgdat; for_each_populated_zone(zone) { - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) @@ -4348,6 +4412,9 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) + continue; + printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" @@ -4397,7 +4464,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { int i; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; @@ -4462,7 +4529,7 @@ void show_free_areas(unsigned int filter) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); @@ -5083,8 +5150,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * Skip to the pfn preceding the next valid one (or + * end_pfn), such that we hit a valid pfn (or end_pfn) + * on our next iteration of the loop. 
+ */ + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; +#endif continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a5594bfcc5ed..f4e17a57926a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) unsigned long flags, nr_pages; bool isolated_page = false; unsigned int order; - unsigned long page_idx, buddy_idx; + unsigned long pfn, buddy_pfn; struct page *buddy; zone = page_zone(page); @@ -102,11 +102,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (PageBuddy(page)) { order = page_order(page); if (order >= pageblock_order) { - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + pfn = page_to_pfn(page); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(page_to_pfn(buddy)) && + if (pfn_valid_within(buddy_pfn) && !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; diff --git a/mm/shmem.c b/mm/shmem.c index 3a7587a0314d..9c6d22ff44e2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -34,6 +34,8 @@ #include <linux/uio.h> #include <linux/khugepaged.h> +#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ + static struct vfsmount *shm_mnt; #ifdef CONFIG_SHMEM @@ -70,6 +72,8 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> +#include <linux/userfaultfd_k.h> +#include <linux/rmap.h> #include <linux/uaccess.h> #include <asm/pgtable.h> @@ -115,13 +119,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); + gfp_t gfp, struct vm_area_struct *vma, + struct vm_fault *vmf, int *fault_type); int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL); + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -190,6 +195,11 @@ static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_vm_ops; +} + static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); @@ -1570,7 +1580,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct mm_struct *fault_mm, int *fault_type) + struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1624,7 +1634,7 @@ repeat: * bring it back from swap or allocate. */ sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = fault_mm ? : current->mm; + charge_mm = vma ? vma->vm_mm : current->mm; if (swap.val) { /* Look it up and read it in.. 
*/ @@ -1634,7 +1644,8 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(charge_mm, + PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); @@ -1703,6 +1714,11 @@ repeat: swap_free(swap); } else { + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } + /* shmem_symlink() */ if (mapping->a_ops != &shmem_aops) goto alloc_nohuge; @@ -1965,7 +1981,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) sgp = SGP_NOHUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, - gfp, vma->vm_mm, &ret); + gfp, vma, vmf, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); return ret; @@ -2175,10 +2191,123 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode bool shmem_mapping(struct address_space *mapping) { - if (!mapping->host) - return false; + return mapping->a_ops == &shmem_aops; +} - return mapping->host->i_sb->s_op == &shmem_ops; +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct inode *inode = file_inode(dst_vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + gfp_t gfp = mapping_gfp_mask(mapping); + pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct mem_cgroup *memcg; + spinlock_t *ptl; + void *page_kaddr; + struct page *page; + pte_t _dst_pte, *dst_pte; + int ret; + + ret = -ENOMEM; + if (shmem_acct_block(info->flags, 1)) + goto out; + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0) + goto out_unacct_blocks; + percpu_counter_inc(&sbinfo->used_blocks); + } + + if (!*pagep) { + page = shmem_alloc_page(gfp, info, pgoff); + if (!page) + goto out_dec_used_blocks; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, (const void __user *)src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + *pagep = page; + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); + shmem_unacct_blocks(info->flags, 1); + /* don't free the page */ + return -EFAULT; + } + } else { + page = *pagep; + *pagep = NULL; + } + + VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); + __SetPageLocked(page); + __SetPageSwapBacked(page); + __SetPageUptodate(page); + + ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); + if (ret) + goto out_release; + + ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); + if (!ret) { + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); + radix_tree_preload_end(); + } + if (ret) + goto out_release_uncharge; + + mem_cgroup_commit_charge(page, memcg, false, false); + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + lru_cache_add_anon(page); + + spin_lock(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + 
inc_mm_counter(dst_mm, mm_counter_file(page)); + page_add_file_rmap(page, false); + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + unlock_page(page); + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); +out_release_uncharge: + mem_cgroup_cancel_charge(page, memcg, false); +out_release: + unlock_page(page); + put_page(page); +out_dec_used_blocks: + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); +out_unacct_blocks: + shmem_unacct_blocks(info->flags, 1); + goto out; } #ifdef CONFIG_TMPFS @@ -4140,7 +4269,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, BUG_ON(mapping->a_ops != &shmem_aops); error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, - gfp, NULL, NULL); + gfp, NULL, NULL, NULL); if (error) page = ERR_PTR(error); else diff --git a/mm/slab.c b/mm/slab.c index 4f2ec6bb46eb..bd63450a9b16 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1288,7 +1288,8 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", + kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; setup_kmalloc_cache_index_table(); @@ -2332,6 +2333,13 @@ int __kmem_cache_shrink(struct kmem_cache *cachep) return (ret ? 1 : 0); } +#ifdef CONFIG_MEMCG +void __kmemcg_cache_deactivate(struct kmem_cache *cachep) +{ + __kmem_cache_shrink(cachep); +} +#endif + int __kmem_cache_shutdown(struct kmem_cache *cachep) { return __kmem_cache_shrink(cachep); diff --git a/mm/slab.h b/mm/slab.h index de6579dc362c..65e7c3fcac72 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -71,6 +71,12 @@ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name; + unsigned long size; +} kmalloc_info[]; + unsigned long calculate_alignment(unsigned long flags, unsigned long align, unsigned long size); @@ -162,6 +168,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); +void __kmemcg_cache_deactivate(struct kmem_cache *s); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; @@ -195,17 +202,22 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +/* List of all root caches. */ +extern struct list_head slab_root_caches; +#define root_caches_node memcg_params.__root_caches_node + /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. 
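The new shmem_mcopy_atomic_pte() above provides the kernel side of resolving a missing-page fault on a shmem area via userfaultfd. A userspace-side sketch of the corresponding UFFDIO_COPY call (assumes the mapping was already registered with UFFDIO_REGISTER_MODE_MISSING; error handling trimmed):

#include <linux/userfaultfd.h>
#include <stddef.h>
#include <sys/ioctl.h>

/* Copy a prepared page into the faulting address range; the kernel
 * fills in copy.copy with the number of bytes copied on success. */
static int example_uffd_copy(int uffd, void *dst_page, const void *src_page,
                             size_t page_size)
{
        struct uffdio_copy copy = {
                .dst = (unsigned long)dst_page,
                .src = (unsigned long)src_page,
                .len = page_size,
                .mode = 0,
        };

        return ioctl(uffd, UFFDIO_COPY, &copy);
}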
*/ #define for_each_memcg_cache(iter, root) \ - list_for_each_entry(iter, &(root)->memcg_params.list, \ - memcg_params.list) + list_for_each_entry(iter, &(root)->memcg_params.children, \ + memcg_params.children_node) static inline bool is_root_cache(struct kmem_cache *s) { - return s->memcg_params.is_root_cache; + return !s->memcg_params.root_cache; } static inline bool slab_equal_or_root(struct kmem_cache *s, @@ -294,9 +306,16 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, } extern void slab_init_memcg_params(struct kmem_cache *); +extern void memcg_link_cache(struct kmem_cache *s); +extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)); #else /* CONFIG_MEMCG && !CONFIG_SLOB */ +/* If !memcg, all caches are root. */ +#define slab_root_caches slab_caches +#define root_caches_node list + #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -341,6 +360,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } + +static inline void memcg_link_cache(struct kmem_cache *s) +{ +} + #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) @@ -488,6 +512,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) void *slab_start(struct seq_file *m, loff_t *pos); void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); +void *memcg_slab_start(struct seq_file *m, loff_t *pos); +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); +void memcg_slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); diff --git a/mm/slab_common.c b/mm/slab_common.c index ae323841adb1..23ff74e61838 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -30,6 +30,11 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; +static LIST_HEAD(slab_caches_to_rcu_destroy); +static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); +static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + slab_caches_to_rcu_destroy_workfn); + /* * Set of flags that will prevent slab merging */ @@ -133,11 +138,14 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +LIST_HEAD(slab_root_caches); + void slab_init_memcg_params(struct kmem_cache *s) { - s->memcg_params.is_root_cache = true; - INIT_LIST_HEAD(&s->memcg_params.list); + s->memcg_params.root_cache = NULL; RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); + INIT_LIST_HEAD(&s->memcg_params.children); } static int init_memcg_params(struct kmem_cache *s, @@ -145,10 +153,11 @@ static int init_memcg_params(struct kmem_cache *s, { struct memcg_cache_array *arr; - if (memcg) { - s->memcg_params.is_root_cache = false; - s->memcg_params.memcg = memcg; + if (root_cache) { s->memcg_params.root_cache = root_cache; + s->memcg_params.memcg = memcg; + INIT_LIST_HEAD(&s->memcg_params.children_node); + INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); return 0; } @@ -177,9 +186,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size) { struct memcg_cache_array *old, *new; - if (!is_root_cache(s)) - return 0; - new = kzalloc(sizeof(struct memcg_cache_array) + new_array_size * sizeof(void *), GFP_KERNEL); if 
(!new) @@ -203,7 +209,7 @@ int memcg_update_all_caches(int num_memcgs) int ret = 0; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { + list_for_each_entry(s, &slab_root_caches, root_caches_node) { ret = update_memcg_params(s, num_memcgs); /* * Instead of freeing the memory, we'll just leave the caches @@ -215,6 +221,28 @@ int memcg_update_all_caches(int num_memcgs) mutex_unlock(&slab_mutex); return ret; } + +void memcg_link_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) { + list_add(&s->root_caches_node, &slab_root_caches); + } else { + list_add(&s->memcg_params.children_node, + &s->memcg_params.root_cache->memcg_params.children); + list_add(&s->memcg_params.kmem_caches_node, + &s->memcg_params.memcg->kmem_caches); + } +} + +static void memcg_unlink_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) { + list_del(&s->root_caches_node); + } else { + list_del(&s->memcg_params.children_node); + list_del(&s->memcg_params.kmem_caches_node); + } +} #else static inline int init_memcg_params(struct kmem_cache *s, struct mem_cgroup *memcg, struct kmem_cache *root_cache) @@ -225,6 +253,10 @@ static inline int init_memcg_params(struct kmem_cache *s, static inline void destroy_memcg_params(struct kmem_cache *s) { } + +static inline void memcg_unlink_cache(struct kmem_cache *s) +{ +} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* @@ -255,7 +287,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, { struct kmem_cache *s; - if (slab_nomerge || (flags & SLAB_NEVER_MERGE)) + if (slab_nomerge) return NULL; if (ctor) @@ -266,7 +298,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, size = ALIGN(size, align); flags = kmem_cache_flags(size, flags, name, NULL); - list_for_each_entry_reverse(s, &slab_caches, list) { + if (flags & SLAB_NEVER_MERGE) + return NULL; + + list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) { if (slab_unmergeable(s)) continue; @@ -350,6 +385,7 @@ static struct kmem_cache *create_cache(const char *name, s->refcount = 1; list_add(&s->list, &slab_caches); + memcg_link_cache(s); out: if (err) return ERR_PTR(err); @@ -458,33 +494,58 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); -static int shutdown_cache(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) { - if (__kmem_cache_shutdown(s) != 0) - return -EBUSY; + LIST_HEAD(to_destroy); + struct kmem_cache *s, *s2; + + /* + * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the + * @slab_caches_to_rcu_destroy list. The slab pages are freed + * through RCU and and the associated kmem_cache are dereferenced + * while freeing the pages, so the kmem_caches should be freed only + * after the pending RCU operations are finished. As rcu_barrier() + * is a pretty slow operation, we batch all pending destructions + * asynchronously. 
+ */ + mutex_lock(&slab_mutex); + list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy); + mutex_unlock(&slab_mutex); + + if (list_empty(&to_destroy)) + return; - if (s->flags & SLAB_DESTROY_BY_RCU) - *need_rcu_barrier = true; + rcu_barrier(); - list_move(&s->list, release); - return 0; + list_for_each_entry_safe(s, s2, &to_destroy, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_release(s); +#else + slab_kmem_cache_release(s); +#endif + } } -static void release_caches(struct list_head *release, bool need_rcu_barrier) +static int shutdown_cache(struct kmem_cache *s) { - struct kmem_cache *s, *s2; + if (__kmem_cache_shutdown(s) != 0) + return -EBUSY; - if (need_rcu_barrier) - rcu_barrier(); + memcg_unlink_cache(s); + list_del(&s->list); - list_for_each_entry_safe(s, s2, release, list) { + if (s->flags & SLAB_DESTROY_BY_RCU) { + list_add_tail(&s->list, &slab_caches_to_rcu_destroy); + schedule_work(&slab_caches_to_rcu_destroy_work); + } else { #ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); + sysfs_slab_release(s); #else slab_kmem_cache_release(s); #endif } + + return 0; } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) @@ -551,8 +612,6 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } - list_add(&s->memcg_params.list, &root_cache->memcg_params.list); - /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially @@ -568,6 +627,66 @@ out_unlock: put_online_cpus(); } +static void kmemcg_deactivate_workfn(struct work_struct *work) +{ + struct kmem_cache *s = container_of(work, struct kmem_cache, + memcg_params.deact_work); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + + s->memcg_params.deact_fn(s); + + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */ + css_put(&s->memcg_params.memcg->css); +} + +static void kmemcg_deactivate_rcufn(struct rcu_head *head) +{ + struct kmem_cache *s = container_of(head, struct kmem_cache, + memcg_params.deact_rcu_head); + + /* + * We need to grab blocking locks. Bounce to ->deact_work. The + * work item shares the space with the RCU head and can't be + * initialized eariler. + */ + INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn); + queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work); +} + +/** + * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a + * sched RCU grace period + * @s: target kmem_cache + * @deact_fn: deactivation function to call + * + * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex + * held after a sched RCU grace period. The slab is guaranteed to stay + * alive until @deact_fn is finished. This is to be used from + * __kmemcg_cache_deactivate(). 
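The kerneldoc above describes slab_deactivate_memcg_cache_rcu_sched(), whose body follows just below. An allocator-side hook would use it roughly as in this sketch (illustrative only; the real SLUB hook, __kmemcg_cache_deactivate(), appears further down in this patch):

#include "slab.h"        /* mm-internal header declaring the helpers used below */

/* Deferred part: runs after a sched RCU grace period, with cpu/mem
 * hotplug references and slab_mutex held by the work item. */
static void example_deact_after_rcu(struct kmem_cache *s)
{
        __kmem_cache_shrink(s);
}

/* Immediate part: called from the memcg offline path for a child
 * cache; only per-memcg (non-root) caches may be passed in. */
static void example_kmemcg_cache_deactivate(struct kmem_cache *s)
{
        slab_deactivate_memcg_cache_rcu_sched(s, example_deact_after_rcu);
}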
+ */ +void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)) +{ + if (WARN_ON_ONCE(is_root_cache(s)) || + WARN_ON_ONCE(s->memcg_params.deact_fn)) + return; + + /* pin memcg so that @s doesn't get destroyed in the middle */ + css_get(&s->memcg_params.memcg->css); + + s->memcg_params.deact_fn = deact_fn; + call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); +} + void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) { int idx; @@ -579,41 +698,15 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_cpus(); get_online_mems(); -#ifdef CONFIG_SLUB - /* - * In case of SLUB, we need to disable empty slab caching to - * avoid pinning the offline memory cgroup by freeable kmem - * pages charged to it. SLAB doesn't need this, as it - * periodically purges unused slabs. - */ - mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL; - if (c) { - c->cpu_partial = 0; - c->min_partial = 0; - } - } - mutex_unlock(&slab_mutex); - /* - * kmem_cache->cpu_partial is checked locklessly (see - * put_cpu_partial()). Make sure the change is visible. - */ - synchronize_sched(); -#endif - mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - if (!is_root_cache(s)) - continue; - + list_for_each_entry(s, &slab_root_caches, root_caches_node) { arr = rcu_dereference_protected(s->memcg_params.memcg_caches, lockdep_is_held(&slab_mutex)); c = arr->entries[idx]; if (!c) continue; - __kmem_cache_shrink(c); + __kmemcg_cache_deactivate(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -622,47 +715,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } -static int __shutdown_memcg_cache(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) -{ - BUG_ON(is_root_cache(s)); - - if (shutdown_cache(s, release, need_rcu_barrier)) - return -EBUSY; - - list_del(&s->memcg_params.list); - return 0; -} - void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - LIST_HEAD(release); - bool need_rcu_barrier = false; struct kmem_cache *s, *s2; get_online_cpus(); get_online_mems(); mutex_lock(&slab_mutex); - list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params.memcg != memcg) - continue; + list_for_each_entry_safe(s, s2, &memcg->kmem_caches, + memcg_params.kmem_caches_node) { /* * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier)); + BUG_ON(shutdown_cache(s)); } mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - release_caches(&release, need_rcu_barrier); } -static int shutdown_memcg_caches(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static int shutdown_memcg_caches(struct kmem_cache *s) { struct memcg_cache_array *arr; struct kmem_cache *c, *c2; @@ -681,13 +756,13 @@ static int shutdown_memcg_caches(struct kmem_cache *s, c = arr->entries[i]; if (!c) continue; - if (__shutdown_memcg_cache(c, release, need_rcu_barrier)) + if (shutdown_cache(c)) /* * The cache still has objects. Move it to a temporary * list so as not to try to destroy it for a second * time while iterating over inactive caches below. */ - list_move(&c->memcg_params.list, &busy); + list_move(&c->memcg_params.children_node, &busy); else /* * The cache is empty and will be destroyed soon. 
Clear @@ -702,23 +777,22 @@ static int shutdown_memcg_caches(struct kmem_cache *s, * Second, shutdown all caches left from memory cgroups that are now * offline. */ - list_for_each_entry_safe(c, c2, &s->memcg_params.list, - memcg_params.list) - __shutdown_memcg_cache(c, release, need_rcu_barrier); + list_for_each_entry_safe(c, c2, &s->memcg_params.children, + memcg_params.children_node) + shutdown_cache(c); - list_splice(&busy, &s->memcg_params.list); + list_splice(&busy, &s->memcg_params.children); /* * A cache being destroyed must be empty. In particular, this means * that all per memcg caches attached to it must be empty too. */ - if (!list_empty(&s->memcg_params.list)) + if (!list_empty(&s->memcg_params.children)) return -EBUSY; return 0; } #else -static inline int shutdown_memcg_caches(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static inline int shutdown_memcg_caches(struct kmem_cache *s) { return 0; } @@ -734,8 +808,6 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - LIST_HEAD(release); - bool need_rcu_barrier = false; int err; if (unlikely(!s)) @@ -751,9 +823,9 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); + err = shutdown_memcg_caches(s); if (!err) - err = shutdown_cache(s, &release, &need_rcu_barrier); + err = shutdown_cache(s); if (err) { pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", @@ -765,8 +837,6 @@ out_unlock: put_online_mems(); put_online_cpus(); - - release_caches(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -828,6 +898,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, create_boot_cache(s, name, size, flags); list_add(&s->list, &slab_caches); + memcg_link_cache(s); s->refcount = 1; return s; } @@ -912,10 +983,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is * kmalloc-67108864. 
*/ -static struct { - const char *name; - unsigned long size; -} const kmalloc_info[] __initconst = { +const struct kmalloc_info_struct kmalloc_info[] __initconst = { {NULL, 0}, {"kmalloc-96", 96}, {"kmalloc-192", 192}, {"kmalloc-8", 8}, {"kmalloc-16", 16}, {"kmalloc-32", 32}, @@ -1138,12 +1206,12 @@ static void print_slabinfo_header(struct seq_file *m) void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); - return seq_list_start(&slab_caches, *pos); + return seq_list_start(&slab_root_caches, *pos); } void *slab_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &slab_caches, pos); + return seq_list_next(p, &slab_root_caches, pos); } void slab_stop(struct seq_file *m, void *p) @@ -1195,25 +1263,44 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) static int slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node); - if (p == slab_caches.next) + if (p == slab_root_caches.next) print_slabinfo_header(m); - if (is_root_cache(s)) - cache_show(s, m); + cache_show(s, m); return 0; } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +void *memcg_slab_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + mutex_lock(&slab_mutex); + return seq_list_start(&memcg->kmem_caches, *pos); +} + +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + return seq_list_next(p, &memcg->kmem_caches, pos); +} + +void memcg_slab_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + int memcg_slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, + memcg_params.kmem_caches_node); struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - if (p == slab_caches.next) + if (p == memcg->kmem_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params.memcg == memcg) - cache_show(s, m); + cache_show(s, m); return 0; } #endif diff --git a/mm/slub.c b/mm/slub.c index 7ec0a965c6a3..7f4bc7027ed5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -214,11 +214,13 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); +static void sysfs_slab_remove(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } +static inline void sysfs_slab_remove(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -1630,6 +1632,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= ~GFP_SLAB_BUG_MASK; pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). 
Fix your code!\n", invalid_mask, &invalid_mask, flags, &flags); + dump_stack(); } return allocate_slab(s, @@ -3686,6 +3689,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + sysfs_slab_remove(s); return 0; } @@ -3952,6 +3956,42 @@ int __kmem_cache_shrink(struct kmem_cache *s) return ret; } +#ifdef CONFIG_MEMCG +static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s) +{ + /* + * Called with all the locks held after a sched RCU grace period. + * Even if @s becomes empty after shrinking, we can't know that @s + * doesn't have allocations already in-flight and thus can't + * destroy @s until the associated memcg is released. + * + * However, let's remove the sysfs files for empty caches here. + * Each cache has a lot of interface files which aren't + * particularly useful for empty draining caches; otherwise, we can + * easily end up with millions of unnecessary sysfs files on + * systems which have a lot of memory and transient cgroups. + */ + if (!__kmem_cache_shrink(s)) + sysfs_slab_remove(s); +} + +void __kmemcg_cache_deactivate(struct kmem_cache *s) +{ + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; + + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), so + * we have to make sure the change is visible before shrinking. + */ + slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); +} +#endif + static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; @@ -4108,6 +4148,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) } slab_init_memcg_params(s); list_add(&s->list, &slab_caches); + memcg_link_cache(s); return s; } @@ -4667,6 +4708,22 @@ enum slab_stat_type { #define SO_OBJECTS (1 << SL_OBJECTS) #define SO_TOTAL (1 << SL_TOTAL) +#ifdef CONFIG_MEMCG +static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + +static int __init setup_slub_memcg_sysfs(char *str) +{ + int v; + + if (get_option(&str, &v) > 0) + memcg_sysfs_enabled = v; + + return 1; +} + +__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs); +#endif + static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { @@ -5570,8 +5627,14 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; + struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); + if (!kset) { + kobject_init(&s->kobj, &slab_ktype); + return 0; + } + if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. 
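The new slub_memcg_sysfs= boot parameter above follows the usual __setup()/get_option() idiom, and judging from the hunk it lets an administrator override the CONFIG_SLUB_MEMCG_SYSFS_ON default at boot (the per-memcg "cgroup" kset under each root cache's sysfs directory is only created when the flag ends up true). A minimal sketch of the same idiom with a hypothetical flag name, not taken from the patch:

#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical boolean tunable, used only to illustrate the pattern. */
static bool example_feature_enabled;

static int __init setup_example_feature(char *str)
{
        int v;

        /* get_option() parses the integer after '='; > 0 means a value was found. */
        if (get_option(&str, &v) > 0)
                example_feature_enabled = v;

        /* Returning 1 tells the early-param code the option was consumed. */
        return 1;
}
__setup("example_feature=", setup_example_feature);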
@@ -5588,7 +5651,7 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = cache_kset(s); + s->kobj.kset = kset; err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) goto out; @@ -5598,7 +5661,7 @@ static int sysfs_slab_add(struct kmem_cache *s) goto out_del_kobj; #ifdef CONFIG_MEMCG - if (is_root_cache(s)) { + if (is_root_cache(s) && memcg_sysfs_enabled) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { err = -ENOMEM; @@ -5621,7 +5684,7 @@ out_del_kobj: goto out; } -void sysfs_slab_remove(struct kmem_cache *s) +static void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* @@ -5630,12 +5693,26 @@ void sysfs_slab_remove(struct kmem_cache *s) */ return; + if (!s->kobj.state_in_sysfs) + /* + * For a memcg cache, this may be called during + * deactivation and again on shutdown. Remove only once. + * A cache is never shut down before deactivation is + * complete, so no need to worry about synchronization. + */ + return; + #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); - kobject_put(&s->kobj); +} + +void sysfs_slab_release(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_put(&s->kobj); } /* diff --git a/mm/sparse.c b/mm/sparse.c index 1e168bf2779a..db6bf3c97ea2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -662,12 +662,12 @@ static void free_map_bootmem(struct page *memmap) >> PAGE_SHIFT; for (i = 0; i < nr_pages; i++, page++) { - magic = (unsigned long) page->lru.next; + magic = (unsigned long) page->freelist; BUG_ON(magic == NODE_INFO); maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); - removing_section_nr = page->private; + removing_section_nr = page_private(page); /* * When this function is called, the removing section is diff --git a/mm/swap.c b/mm/swap.c index 844baedd2429..aabf2e90fe32 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -971,12 +971,6 @@ EXPORT_SYMBOL(pagevec_lookup_tag); void __init swap_setup(void) { unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); -#ifdef CONFIG_SWAP - int i; - - for (i = 0; i < MAX_SWAPFILES; i++) - spin_lock_init(&swapper_spaces[i].tree_lock); -#endif /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/swap_slots.c b/mm/swap_slots.c new file mode 100644 index 000000000000..9b5bc86f96ad --- /dev/null +++ b/mm/swap_slots.c @@ -0,0 +1,342 @@ +/* + * Manage cache of swap slots to be used for and returned from + * swap. + * + * Copyright(c) 2016 Intel Corporation. + * + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * We allocate the swap slots from the global pool and put + * it into local per cpu caches. This has the advantage + * of no needing to acquire the swap_info lock every time + * we need a new slot. + * + * There is also opportunity to simply return the slot + * to local caches without needing to acquire swap_info + * lock. We do not reuse the returned slots directly but + * move them back to the global pool in a batch. This + * allows the slots to coaellesce and reduce fragmentation. + * + * The swap entry allocated is marked with SWAP_HAS_CACHE + * flag in map_count that prevents it from being allocated + * again from the global pool. + * + * The swap slots cache is protected by a mutex instead of + * a spin lock as when we search for slots with scan_swap_map, + * we can possibly sleep. 
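Reduced to its core, the design this header comment describes is a per-CPU array of swap entries that is refilled in one batch from the global allocator and consumed locally under a mutex. The sketch below illustrates only that consume/refill shape; the names (example_slot_cache, example_get_slot, EXAMPLE_CACHE_SIZE) are hypothetical, and the real code additionally handles activation thresholds, the separate return path, draining and CPU hotplug.

#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/swap.h>

#define EXAMPLE_CACHE_SIZE 64                   /* illustrative batch size */

struct example_slot_cache {
        struct mutex    alloc_lock;             /* mutex_init()ed when the cache is set up */
        swp_entry_t     slots[EXAMPLE_CACHE_SIZE];
        int             cur;                    /* next slot to hand out */
        int             nr;                     /* slots still available */
};
static DEFINE_PER_CPU(struct example_slot_cache, example_cache);

static swp_entry_t example_get_slot(void)
{
        struct example_slot_cache *cache = raw_cpu_ptr(&example_cache);
        swp_entry_t entry;

        entry.val = 0;
        /* Preemption stays enabled; the mutex, not the CPU binding, protects the cache. */
        mutex_lock(&cache->alloc_lock);
        if (!cache->nr) {
                /* One trip to the global pool pays for many future allocations. */
                cache->cur = 0;
                cache->nr = get_swap_pages(EXAMPLE_CACHE_SIZE, cache->slots);
        }
        if (cache->nr) {
                entry = cache->slots[cache->cur++];
                cache->nr--;
        }
        mutex_unlock(&cache->alloc_lock);
        return entry;
}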
+ */ + +#include <linux/swap_slots.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/vmalloc.h> +#include <linux/mutex.h> + +#ifdef CONFIG_SWAP + +static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); +static bool swap_slot_cache_active; +bool swap_slot_cache_enabled; +static bool swap_slot_cache_initialized; +DEFINE_MUTEX(swap_slots_cache_mutex); +/* Serialize swap slots cache enable/disable operations */ +DEFINE_MUTEX(swap_slots_cache_enable_mutex); + +static void __drain_swap_slots_cache(unsigned int type); +static void deactivate_swap_slots_cache(void); +static void reactivate_swap_slots_cache(void); + +#define use_swap_slot_cache (swap_slot_cache_active && \ + swap_slot_cache_enabled && swap_slot_cache_initialized) +#define SLOTS_CACHE 0x1 +#define SLOTS_CACHE_RET 0x2 + +static void deactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = false; + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + mutex_unlock(&swap_slots_cache_mutex); +} + +static void reactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = true; + mutex_unlock(&swap_slots_cache_mutex); +} + +/* Must not be called with cpu hot plug lock */ +void disable_swap_slots_cache_lock(void) +{ + mutex_lock(&swap_slots_cache_enable_mutex); + swap_slot_cache_enabled = false; + if (swap_slot_cache_initialized) { + /* serialize with cpu hotplug operations */ + get_online_cpus(); + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + put_online_cpus(); + } +} + +static void __reenable_swap_slots_cache(void) +{ + swap_slot_cache_enabled = has_usable_swap(); +} + +void reenable_swap_slots_cache_unlock(void) +{ + __reenable_swap_slots_cache(); + mutex_unlock(&swap_slots_cache_enable_mutex); +} + +static bool check_cache_active(void) +{ + long pages; + + if (!swap_slot_cache_enabled || !swap_slot_cache_initialized) + return false; + + pages = get_nr_swap_pages(); + if (!swap_slot_cache_active) { + if (pages > num_online_cpus() * + THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE) + reactivate_swap_slots_cache(); + goto out; + } + + /* if global pool of slot caches too low, deactivate cache */ + if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE) + deactivate_swap_slots_cache(); +out: + return swap_slot_cache_active; +} + +static int alloc_swap_slot_cache(unsigned int cpu) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots, *slots_ret; + + /* + * Do allocation outside swap_slots_cache_mutex + * as vzalloc could trigger reclaim and get_swap_page, + * which can lock swap_slots_cache_mutex. 
+ */ + slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + if (!slots) + return -ENOMEM; + + slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + if (!slots_ret) { + vfree(slots); + return -ENOMEM; + } + + mutex_lock(&swap_slots_cache_mutex); + cache = &per_cpu(swp_slots, cpu); + if (cache->slots || cache->slots_ret) + /* cache already allocated */ + goto out; + if (!cache->lock_initialized) { + mutex_init(&cache->alloc_lock); + spin_lock_init(&cache->free_lock); + cache->lock_initialized = true; + } + cache->nr = 0; + cache->cur = 0; + cache->n_ret = 0; + cache->slots = slots; + slots = NULL; + cache->slots_ret = slots_ret; + slots_ret = NULL; +out: + mutex_unlock(&swap_slots_cache_mutex); + if (slots) + vfree(slots); + if (slots_ret) + vfree(slots_ret); + return 0; +} + +static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots = NULL; + + cache = &per_cpu(swp_slots, cpu); + if ((type & SLOTS_CACHE) && cache->slots) { + mutex_lock(&cache->alloc_lock); + swapcache_free_entries(cache->slots + cache->cur, cache->nr); + cache->cur = 0; + cache->nr = 0; + if (free_slots && cache->slots) { + vfree(cache->slots); + cache->slots = NULL; + } + mutex_unlock(&cache->alloc_lock); + } + if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + if (free_slots && cache->slots_ret) { + slots = cache->slots_ret; + cache->slots_ret = NULL; + } + spin_unlock_irq(&cache->free_lock); + if (slots) + vfree(slots); + } +} + +static void __drain_swap_slots_cache(unsigned int type) +{ + unsigned int cpu; + + /* + * This function is called during + * 1) swapoff, when we have to make sure no + * left over slots are in cache when we remove + * a swap device; + * 2) disabling of swap slot cache, when we run low + * on swap slots when allocating memory and need + * to return swap slots to global pool. + * + * We cannot acquire cpu hot plug lock here as + * this function can be invoked in the cpu + * hot plug path: + * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback + * -> memory allocation -> direct reclaim -> get_swap_page + * -> drain_swap_slots_cache + * + * Hence the loop over current online cpu below could miss cpu that + * is being brought online but not yet marked as online. + * That is okay as we do not schedule and run anything on a + * cpu before it has been marked online. Hence, we will not + * fill any swap slots in slots cache of such cpu. + * There are no slots on such cpu that need to be drained. 
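The per-CPU caches allocated and drained above are wired to CPU hotplug further down, where enable_swap_slots_cache() registers alloc_swap_slot_cache()/free_slot_cache() through a dynamic hotplug state. For reference, a minimal sketch of that registration pattern with hypothetical callback and state names:

#include <linux/cpu.h>
#include <linux/cpuhotplug.h>

/* Hypothetical per-CPU setup/teardown callbacks, shown only for shape. */
static int example_cpu_online(unsigned int cpu)
{
        /* allocate this CPU's cache; also runs for CPUs that come up later */
        return 0;
}

static int example_cpu_offline(unsigned int cpu)
{
        /* drain and free this CPU's cache before the CPU disappears */
        return 0;
}

static int example_register_callbacks(void)
{
        int ret;

        /*
         * CPUHP_AP_ONLINE_DYN allocates a dynamic state; the online callback
         * is invoked immediately for every CPU that is already up.
         */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/cache:online",
                                example_cpu_online, example_cpu_offline);
        return ret < 0 ? ret : 0;       /* a dynamic state returns its number on success */
}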
+ */ + for_each_online_cpu(cpu) + drain_slots_cache_cpu(cpu, type, false); +} + +static int free_slot_cache(unsigned int cpu) +{ + mutex_lock(&swap_slots_cache_mutex); + drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + mutex_unlock(&swap_slots_cache_mutex); + return 0; +} + +int enable_swap_slots_cache(void) +{ + int ret = 0; + + mutex_lock(&swap_slots_cache_enable_mutex); + if (swap_slot_cache_initialized) { + __reenable_swap_slots_cache(); + goto out_unlock; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", + alloc_swap_slot_cache, free_slot_cache); + if (ret < 0) + goto out_unlock; + swap_slot_cache_initialized = true; + __reenable_swap_slots_cache(); +out_unlock: + mutex_unlock(&swap_slots_cache_enable_mutex); + return 0; +} + +/* called with swap slot cache's alloc lock held */ +static int refill_swap_slots_cache(struct swap_slots_cache *cache) +{ + if (!use_swap_slot_cache || cache->nr) + return 0; + + cache->cur = 0; + if (swap_slot_cache_active) + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + + return cache->nr; +} + +int free_swap_slot(swp_entry_t entry) +{ + struct swap_slots_cache *cache; + + BUG_ON(!swap_slot_cache_initialized); + + cache = &get_cpu_var(swp_slots); + if (use_swap_slot_cache && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + /* Swap slots cache may be deactivated before acquiring lock */ + if (!use_swap_slot_cache) { + spin_unlock_irq(&cache->free_lock); + goto direct_free; + } + if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { + /* + * Return slots to global pool. + * The current swap_map value is SWAP_HAS_CACHE. + * Set it to 0 to indicate it is available for + * allocation in global pool + */ + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + } + cache->slots_ret[cache->n_ret++] = entry; + spin_unlock_irq(&cache->free_lock); + } else { +direct_free: + swapcache_free_entries(&entry, 1); + } + put_cpu_var(swp_slots); + + return 0; +} + +swp_entry_t get_swap_page(void) +{ + swp_entry_t entry, *pentry; + struct swap_slots_cache *cache; + + /* + * Preemption is allowed here, because we may sleep + * in refill_swap_slots_cache(). But it is safe, because + * accesses to the per-CPU data structure are protected by the + * mutex cache->alloc_lock. + * + * The alloc path here does not touch cache->slots_ret + * so cache->free_lock is not taken. + */ + cache = raw_cpu_ptr(&swp_slots); + + entry.val = 0; + if (check_cache_active()) { + mutex_lock(&cache->alloc_lock); + if (cache->slots) { +repeat: + if (cache->nr) { + pentry = &cache->slots[cache->cur++]; + entry = *pentry; + pentry->val = 0; + cache->nr--; + } else { + if (refill_swap_slots_cache(cache)) + goto repeat; + } + } + mutex_unlock(&cache->alloc_lock); + if (entry.val) + return entry; + } + + get_swap_pages(1, &entry); + + return entry; +} + +#endif /* CONFIG_SWAP */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 35d7e0ee1c77..473b71e052a8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,6 +17,8 @@ #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/migrate.h> +#include <linux/vmalloc.h> +#include <linux/swap_slots.h> #include <asm/pgtable.h> @@ -32,15 +34,8 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space swapper_spaces[MAX_SWAPFILES] = { - [0 ... 
MAX_SWAPFILES - 1] = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .i_mmap_writable = ATOMIC_INIT(0), - .a_ops = &swap_aops, - /* swap cache doesn't use writeback related tags */ - .flags = 1 << AS_NO_WRITEBACK_TAGS, - } -}; +struct address_space *swapper_spaces[MAX_SWAPFILES]; +static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,11 +48,26 @@ static struct { unsigned long total_swapcache_pages(void) { - int i; + unsigned int i, j, nr; unsigned long ret = 0; + struct address_space *spaces; - for (i = 0; i < MAX_SWAPFILES; i++) - ret += swapper_spaces[i].nrpages; + rcu_read_lock(); + for (i = 0; i < MAX_SWAPFILES; i++) { + /* + * The corresponding entries in nr_swapper_spaces and + * swapper_spaces will be reused only after at least + * one grace period. So it is impossible for them + * belongs to different usage. + */ + nr = nr_swapper_spaces[i]; + spaces = rcu_dereference(swapper_spaces[i]); + if (!nr || !spaces) + continue; + for (j = 0; j < nr; j++) + ret += spaces[j].nrpages; + } + rcu_read_unlock(); return ret; } @@ -315,6 +325,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, break; /* + * Just skip read ahead for unused swap slot. + * During swap_off when swap_slot_cache is disabled, + * we have to handle the race between putting + * swap entry in swap cache and marking swap slot + * as SWAP_HAS_CACHE. That's done in later part of code or + * else swap_off will be aborted if we return NULL. + */ + if (!__swp_swapcount(entry) && swap_slot_cache_enabled) + break; + + /* * Get a new page to read into from swap. */ if (!new_page) { @@ -505,3 +526,38 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } + +int init_swap_address_space(unsigned int type, unsigned long nr_pages) +{ + struct address_space *spaces, *space; + unsigned int i, nr; + + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + spaces = vzalloc(sizeof(struct address_space) * nr); + if (!spaces) + return -ENOMEM; + for (i = 0; i < nr; i++) { + space = spaces + i; + INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + atomic_set(&space->i_mmap_writable, 0); + space->a_ops = &swap_aops; + /* swap cache doesn't use writeback related tags */ + mapping_set_no_writeback_tags(space); + spin_lock_init(&space->tree_lock); + } + nr_swapper_spaces[type] = nr; + rcu_assign_pointer(swapper_spaces[type], spaces); + + return 0; +} + +void exit_swap_address_space(unsigned int type) +{ + struct address_space *spaces; + + spaces = swapper_spaces[type]; + nr_swapper_spaces[type] = 0; + rcu_assign_pointer(swapper_spaces[type], NULL); + synchronize_rcu(); + kvfree(spaces); +} diff --git a/mm/swapfile.c b/mm/swapfile.c index 4761701d1721..2cac12cc9abe 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -34,6 +34,7 @@ #include <linux/frontswap.h> #include <linux/swapfile.h> #include <linux/export.h> +#include <linux/swap_slots.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -257,6 +258,47 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = si->cluster_info; + if (ci) { + ci += offset / SWAPFILE_CLUSTER; + spin_lock(&ci->lock); + } + return ci; +} + +static inline void unlock_cluster(struct swap_cluster_info *ci) +{ + if (ci) + 
spin_unlock(&ci->lock); +} + +static inline struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + if (!ci) + spin_lock(&si->lock); + + return ci; +} + +static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + if (ci) + unlock_cluster(ci); + else + spin_unlock(&si->lock); +} + static inline bool cluster_list_empty(struct swap_cluster_list *list) { return cluster_is_null(&list->head); @@ -281,9 +323,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, cluster_set_next_flag(&list->head, idx, 0); cluster_set_next_flag(&list->tail, idx, 0); } else { + struct swap_cluster_info *ci_tail; unsigned int tail = cluster_next(&list->tail); - cluster_set_next(&ci[tail], idx); + /* + * Nested cluster lock, but both cluster locks are + * only acquired when we held swap_info_struct->lock + */ + ci_tail = ci + tail; + spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); + cluster_set_next(ci_tail, idx); + unlock_cluster(ci_tail); cluster_set_next_flag(&list->tail, idx, 0); } } @@ -328,7 +378,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { - struct swap_cluster_info *info; + struct swap_cluster_info *info, *ci; unsigned int idx; info = si->cluster_info; @@ -341,10 +391,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) SWAPFILE_CLUSTER); spin_lock(&si->lock); - cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + cluster_set_flag(ci, CLUSTER_FLAG_FREE); + unlock_cluster(ci); cluster_list_add_tail(&si->free_clusters, info, idx); + ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); } } @@ -443,12 +497,13 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, * Try to get a swap entry from current cpu's swap entry pool (a cluster). This * might involve allocating a new cluster for current CPU too. 
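lock_cluster_or_swap_info()/unlock_cluster_or_swap_info() above give readers of swap_map a single idiom that takes the fine-grained per-cluster spinlock on devices that have cluster_info and falls back to the coarse per-device si->lock otherwise. A hedged sketch of that usage, written as if it lived inside mm/swapfile.c (example_read_swap_map is hypothetical; compare the reworked page_swapcount() later in this patch):

static unsigned char example_read_swap_map(struct swap_info_struct *si,
                                           unsigned long offset)
{
        struct swap_cluster_info *ci;
        unsigned char count;

        ci = lock_cluster_or_swap_info(si, offset);     /* cluster lock, or si->lock if none */
        count = si->swap_map[offset];
        unlock_cluster_or_swap_info(si, ci);            /* releases whichever lock was taken */

        return count;
}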
*/ -static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, unsigned long *offset, unsigned long *scan_base) { struct percpu_cluster *cluster; + struct swap_cluster_info *ci; bool found_free; - unsigned long tmp; + unsigned long tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); @@ -466,7 +521,7 @@ new_cluster: *scan_base = *offset = si->cluster_next; goto new_cluster; } else - return; + return false; } found_free = false; @@ -476,14 +531,21 @@ new_cluster: * check if there is still free entry in the cluster */ tmp = cluster->next; - while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * - SWAPFILE_CLUSTER) { + max = min_t(unsigned long, si->max, + (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); + if (tmp >= max) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + ci = lock_cluster(si, tmp); + while (tmp < max) { if (!si->swap_map[tmp]) { found_free = true; break; } tmp++; } + unlock_cluster(ci); if (!found_free) { cluster_set_null(&cluster->index); goto new_cluster; @@ -491,15 +553,22 @@ new_cluster: cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; + return found_free; } -static unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) +static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[]) { + struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + int n_ret = 0; + + if (nr > SWAP_BATCH) + nr = SWAP_BATCH; /* * We try to cluster swap pages by allocating them sequentially @@ -517,8 +586,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); - goto checks; + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + goto checks; + else + goto scan; } if (unlikely(!si->cluster_nr--)) { @@ -562,8 +633,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, checks: if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset)) - scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + /* take a break if we already got some slots */ + if (n_ret) + goto done; + if (!scan_swap_map_try_ssd_cluster(si, &offset, + &scan_base)) + goto scan; + } } if (!(si->flags & SWP_WRITEOK)) goto no_page; @@ -572,9 +649,11 @@ checks: if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; + ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; + unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); spin_lock(&si->lock); @@ -584,8 +663,13 @@ checks: goto scan; /* check next one */ } - if (si->swap_map[offset]) - goto scan; + if (si->swap_map[offset]) { + unlock_cluster(ci); + if (!n_ret) + goto scan; + else + goto done; + } if (offset == si->lowest_bit) si->lowest_bit++; @@ -601,10 +685,45 @@ checks: } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); si->cluster_next = offset + 1; - si->flags -= SWP_SCANNING; + slots[n_ret++] = swp_entry(si->type, offset); + + /* got enough slots or reach max slots? 
*/ + if ((n_ret == nr) || (offset >= si->highest_bit)) + goto done; + + /* search for next available slot */ + + /* time to take a break? */ + if (unlikely(--latency_ration < 0)) { + if (n_ret) + goto done; + spin_unlock(&si->lock); + cond_resched(); + spin_lock(&si->lock); + latency_ration = LATENCY_LIMIT; + } + + /* try to get more slots in cluster */ + if (si->cluster_info) { + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + goto checks; + else + goto done; + } + /* non-ssd case */ + ++offset; - return offset; + /* non-ssd case, still more slots in cluster? */ + if (si->cluster_nr && !si->swap_map[offset]) { + --si->cluster_nr; + goto checks; + } + +done: + si->flags -= SWP_SCANNING; + return n_ret; scan: spin_unlock(&si->lock); @@ -642,17 +761,41 @@ scan: no_page: si->flags -= SWP_SCANNING; - return 0; + return n_ret; +} + +static unsigned long scan_swap_map(struct swap_info_struct *si, + unsigned char usage) +{ + swp_entry_t entry; + int n_ret; + + n_ret = scan_swap_map_slots(si, usage, 1, &entry); + + if (n_ret) + return swp_offset(entry); + else + return 0; + } -swp_entry_t get_swap_page(void) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) { struct swap_info_struct *si, *next; - pgoff_t offset; + long avail_pgs; + int n_ret = 0; - if (atomic_long_read(&nr_swap_pages) <= 0) + avail_pgs = atomic_long_read(&nr_swap_pages); + if (avail_pgs <= 0) goto noswap; - atomic_long_dec(&nr_swap_pages); + + if (n_goal > SWAP_BATCH) + n_goal = SWAP_BATCH; + + if (n_goal > avail_pgs) + n_goal = avail_pgs; + + atomic_long_sub(n_goal, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -678,14 +821,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - - /* This is called for allocating swap entry for cache */ - offset = scan_swap_map(si, SWAP_HAS_CACHE); + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (offset) - return swp_entry(si->type, offset); + if (n_ret) + goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", - si->type); + si->type); + spin_lock(&swap_avail_lock); nextsi: /* @@ -696,7 +839,8 @@ nextsi: * up between us dropping swap_avail_lock and taking si->lock. * Since we dropped the swap_avail_lock, the swap_avail_head * list may have been modified; so if next is still in the - * swap_avail_head list then try it, otherwise start over. + * swap_avail_head list then try it, otherwise start over + * if we have not gotten any slots. 
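get_swap_pages() above turns the old single-entry allocation into a batch interface: the caller supplies an array and a goal, and the return value says how many entries were actually produced (internally the goal is capped at SWAP_BATCH, and any unused headroom is credited back to nr_swap_pages). A caller-side sketch with hypothetical names (example_fill_batch, example_consume):

#include <linux/kernel.h>
#include <linux/swap.h>

static void example_consume(swp_entry_t entry)
{
        /* hypothetical consumer of one allocated swap entry */
}

static void example_fill_batch(void)
{
        swp_entry_t entries[16];        /* hypothetical local batch buffer */
        int got, i;

        /* Anything from 0 (swap full) up to ARRAY_SIZE(entries) may come back. */
        got = get_swap_pages(ARRAY_SIZE(entries), entries);
        for (i = 0; i < got; i++)
                example_consume(entries[i]);
}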
*/ if (plist_node_empty(&next->avail_list)) goto start_over; @@ -704,9 +848,11 @@ nextsi: spin_unlock(&swap_avail_lock); - atomic_long_inc(&nr_swap_pages); +check_out: + if (n_ret < n_goal) + atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); noswap: - return (swp_entry_t) {0}; + return n_ret; } /* The only caller of this function is now suspend routine */ @@ -731,7 +877,7 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct *swap_info_get(swp_entry_t entry) +static struct swap_info_struct *__swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset, type; @@ -747,34 +893,76 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) offset = swp_offset(entry); if (offset >= p->max) goto bad_offset; - if (!p->swap_map[offset]) - goto bad_free; - spin_lock(&p->lock); return p; -bad_free: - pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); - goto out; bad_offset: - pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); + pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); goto out; bad_device: - pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); + pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val); goto out; bad_nofile: - pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static struct swap_info_struct *_swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = __swap_info_get(entry); + if (!p) + goto out; + if (!p->swap_map[swp_offset(entry)]) + goto bad_free; + return p; + +bad_free: + pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); + goto out; out: return NULL; } -static unsigned char swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) +static struct swap_info_struct *swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = _swap_info_get(entry); + if (p) + spin_lock(&p->lock); + return p; +} + +static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, + struct swap_info_struct *q) +{ + struct swap_info_struct *p; + + p = _swap_info_get(entry); + + if (p != q) { + if (q != NULL) + spin_unlock(&q->lock); + if (p != NULL) + spin_lock(&p->lock); + } + return p; +} + +static unsigned char __swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) { + struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; + ci = lock_cluster_or_swap_info(p, offset); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; @@ -798,38 +986,52 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, } usage = count | has_cache; - p->swap_map[offset] = usage; - - /* free if no reference */ - if (!usage) { - mem_cgroup_uncharge_swap(entry); - dec_cluster_info_page(p, p->cluster_info, offset); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; - if 
(disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); + p->swap_map[offset] = usage ? : SWAP_HAS_CACHE; + + unlock_cluster_or_swap_info(p, ci); + + return usage; +} + +static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + unsigned char count; + + ci = lock_cluster(p, offset); + count = p->swap_map[offset]; + VM_BUG_ON(count != SWAP_HAS_CACHE); + p->swap_map[offset] = 0; + dec_cluster_info_page(p, p->cluster_info, offset); + unlock_cluster(ci); + + mem_cgroup_uncharge_swap(entry); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; + + p->highest_bit = offset; + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); } } + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); + if (p->flags & SWP_BLKDEV) { + struct gendisk *disk = p->bdev->bd_disk; - return usage; + if (disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, + offset); + } } /* @@ -840,10 +1042,10 @@ void swap_free(swp_entry_t entry) { struct swap_info_struct *p; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - swap_entry_free(p, entry, 1); - spin_unlock(&p->lock); + if (!__swap_entry_free(p, entry, 1)) + free_swap_slot(entry); } } @@ -854,11 +1056,33 @@ void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - swap_entry_free(p, entry, SWAP_HAS_CACHE); - spin_unlock(&p->lock); + if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) + free_swap_slot(entry); + } +} + +void swapcache_free_entries(swp_entry_t *entries, int n) +{ + struct swap_info_struct *p, *prev; + int i; + + if (n <= 0) + return; + + prev = NULL; + p = NULL; + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) + swap_entry_free(p, entries[i]); + else + break; + prev = p; } + if (p) + spin_unlock(&p->lock); } /* @@ -870,13 +1094,39 @@ int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; + struct swap_cluster_info *ci; swp_entry_t entry; + unsigned long offset; entry.val = page_private(page); - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - count = swap_count(p->swap_map[swp_offset(entry)]); - spin_unlock(&p->lock); + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(p, offset); + count = swap_count(p->swap_map[offset]); + unlock_cluster_or_swap_info(p, ci); + } + return count; +} + +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. 
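Freeing is now two-phase in the hunks above: __swap_entry_free() drops the map reference under the cluster (or fallback) lock and, when the count reaches zero, parks the slot as SWAP_HAS_CACHE (the "usage ? : SWAP_HAS_CACHE" assignment) so it cannot be reallocated yet; the final clearing and device bookkeeping happen in swap_entry_free(), reached in batches through the per-CPU return cache. A hedged restatement of the new swap_free() shape, written as if inside mm/swapfile.c with an illustrative name (example_swap_put):

static void example_swap_put(swp_entry_t entry)
{
        struct swap_info_struct *p;

        p = _swap_info_get(entry);      /* validates the entry; takes no si->lock */
        if (!p)
                return;

        /*
         * Zero means the reference count hit zero and the slot is now parked
         * as SWAP_HAS_CACHE, awaiting its final free.
         */
        if (!__swap_entry_free(p, entry, 1))
                free_swap_slot(entry);  /* batched: swapcache_free_entries() will
                                         * call swap_entry_free() for it later */
}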
+ */ +int __swp_swapcount(swp_entry_t entry) +{ + int count = 0; + pgoff_t offset; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + + si = __swap_info_get(entry); + if (si) { + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); } return count; } @@ -889,22 +1139,26 @@ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *p; + struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (!p) return 0; - count = swap_count(p->swap_map[swp_offset(entry)]); + offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + + count = swap_count(p->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; - offset = swp_offset(entry); page = vmalloc_to_page(p->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); @@ -919,7 +1173,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - spin_unlock(&p->lock); + unlock_cluster_or_swap_info(p, ci); return count; } @@ -1011,21 +1265,23 @@ int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; struct page *page = NULL; + unsigned char count; if (non_swap_entry(entry)) return 1; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { + count = __swap_entry_free(p, entry, 1); + if (count == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { put_page(page); page = NULL; } - } - spin_unlock(&p->lock); + } else if (!count) + free_swap_slot(entry); } if (page) { /* @@ -1853,6 +2109,17 @@ static void reinsert_swap_info(struct swap_info_struct *p) spin_unlock(&swap_lock); } +bool has_usable_swap(void) +{ + bool ret = true; + + spin_lock(&swap_lock); + if (plist_head_empty(&swap_active_head)) + ret = false; + spin_unlock(&swap_lock); + return ret; +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -1923,6 +2190,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&p->lock); spin_unlock(&swap_lock); + disable_swap_slots_cache_lock(); + set_current_oom_origin(); err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ clear_current_oom_origin(); @@ -1930,9 +2199,12 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); + reenable_swap_slots_cache_unlock(); goto out_dput; } + reenable_swap_slots_cache_unlock(); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -1975,6 +2247,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); + exit_swap_address_space(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2298,6 +2571,13 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return maxpages; } +#define SWAP_CLUSTER_INFO_COLS \ + DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) +#define SWAP_CLUSTER_SPACE_COLS \ + DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) +#define SWAP_CLUSTER_COLS \ + max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) + static int 
setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, @@ -2305,11 +2585,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, unsigned long maxpages, sector_t *span) { - int i; + unsigned int j, k; unsigned int nr_good_pages; int nr_extents; unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; + unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; + unsigned long i, idx; nr_good_pages = maxpages - 1; /* omit header page */ @@ -2357,15 +2638,23 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (!cluster_info) return nr_extents; - for (i = 0; i < nr_clusters; i++) { - if (!cluster_count(&cluster_info[idx])) { + + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. + */ + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { + idx = i * SWAP_CLUSTER_COLS + j; + if (idx >= nr_clusters) + continue; + if (cluster_count(&cluster_info[idx])) + continue; cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } - idx++; - if (idx == nr_clusters) - idx = 0; } return nr_extents; } @@ -2468,6 +2757,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; + unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; /* @@ -2475,13 +2765,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * SSD */ p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - cluster_info = vzalloc(DIV_ROUND_UP(maxpages, - SWAPFILE_CLUSTER) * sizeof(*cluster_info)); + cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info)); if (!cluster_info) { error = -ENOMEM; goto bad_swap; } + + for (ci = 0; ci < nr_cluster; ci++) + spin_lock_init(&((cluster_info + ci)->lock)); + p->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!p->percpu_cluster) { error = -ENOMEM; @@ -2538,6 +2832,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } + error = init_swap_address_space(p->type, maxpages); + if (error) + goto bad_swap; + mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) @@ -2593,6 +2891,8 @@ out: putname(name); if (inode && S_ISREG(inode->i_mode)) inode_unlock(inode); + if (!error) + enable_swap_slots_cache(); return error; } @@ -2627,6 +2927,7 @@ void si_swapinfo(struct sysinfo *val) static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; + struct swap_cluster_info *ci; unsigned long offset, type; unsigned char count; unsigned char has_cache; @@ -2640,10 +2941,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) goto bad_file; p = swap_info[type]; offset = swp_offset(entry); - - spin_lock(&p->lock); if (unlikely(offset >= p->max)) - goto unlock_out; + goto out; + + ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -2686,7 +2987,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p->swap_map[offset] = count | has_cache; unlock_out: - spin_unlock(&p->lock); + unlock_cluster_or_swap_info(p, ci); out: return err; @@ -2775,6 +3076,7 @@ EXPORT_SYMBOL_GPL(__page_file_index); int 
add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; + struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; @@ -2798,6 +3100,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } offset = swp_offset(entry); + + ci = lock_cluster(si, offset); + count = si->swap_map[offset] & ~SWAP_HAS_CACHE; if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { @@ -2810,6 +3115,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { + unlock_cluster(ci); spin_unlock(&si->lock); return -ENOMEM; } @@ -2858,6 +3164,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out: + unlock_cluster(ci); spin_unlock(&si->lock); outer: if (page) @@ -2871,7 +3178,8 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. + * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster + * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af817e5060fb..1e5c2f94e8a3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -14,6 +14,9 @@ #include <linux/swapops.h> #include <linux/userfaultfd_k.h> #include <linux/mmu_notifier.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/shmem_fs.h> #include <asm/tlbflush.h> #include "internal.h" @@ -139,6 +142,234 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) return pmd; } +#ifdef CONFIG_HUGETLB_PAGE +/* + * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is + * called with mmap_sem held, it will release mmap_sem before returning. + */ +static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage) +{ + int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; + int vm_shared = dst_vma->vm_flags & VM_SHARED; + ssize_t err; + pte_t *dst_pte; + unsigned long src_addr, dst_addr; + long copied; + struct page *page; + struct hstate *h; + unsigned long vma_hpagesize; + pgoff_t idx; + u32 hash; + struct address_space *mapping; + + /* + * There is no default zero huge page for all huge page sizes as + * supported by hugetlb. A PMD_SIZE huge pages may exist as used + * by THP. Since we can not reliably insert a zero page, this + * feature is not supported. + */ + if (zeropage) { + up_read(&dst_mm->mmap_sem); + return -EINVAL; + } + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; + vma_hpagesize = vma_kernel_pagesize(dst_vma); + + /* + * Validate alignment based on huge page size + */ + err = -EINVAL; + if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) + goto out_unlock; + +retry: + /* + * On routine entry dst_vma is set. If we had to drop mmap_sem and + * retry, dst_vma will be set to NULL and we must lookup again. 
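The alignment validation near the top of __mcopy_atomic_hugetlb() above relies on the huge page size being a power of two, so a simple mask test catches any start address or length that is not a multiple of it: for 2MB huge pages, vma_hpagesize is 0x200000 and any value with a nonzero low 21 bits fails. A small sketch of that check (example_hpage_aligned is hypothetical):

static bool example_hpage_aligned(unsigned long addr, unsigned long len,
                                  unsigned long hpagesize)
{
        /* hpagesize is a power of two, so (hpagesize - 1) is a mask of its low bits */
        return !(addr & (hpagesize - 1)) && !(len & (hpagesize - 1));
}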
+ */ + if (!dst_vma) { + err = -EINVAL; + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) + goto out_unlock; + + if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) + goto out_unlock; + + /* + * Make sure the remaining dst range is both valid and + * fully within a single existing vma. + */ + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out_unlock; + + vm_shared = dst_vma->vm_flags & VM_SHARED; + } + + if (WARN_ON(dst_addr & (vma_hpagesize - 1) || + (len - copied) & (vma_hpagesize - 1))) + goto out_unlock; + + /* + * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + + /* + * If not shared, ensure the dst_vma has a anon_vma. + */ + err = -ENOMEM; + if (!vm_shared) { + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + } + + h = hstate_vma(dst_vma); + + while (src_addr < src_start + len) { + pte_t dst_pteval; + + BUG_ON(dst_addr >= dst_start + len); + VM_BUG_ON(dst_addr & ~huge_page_mask(h)); + + /* + * Serialize via hugetlb_fault_mutex + */ + idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; + hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, + idx, dst_addr); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + err = -ENOMEM; + dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); + if (!dst_pte) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + err = -EEXIST; + dst_pteval = huge_ptep_get(dst_pte); + if (!huge_pte_none(dst_pteval)) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, + dst_addr, src_addr, &page); + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + vm_alloc_shared = vm_shared; + + cond_resched(); + + if (unlikely(err == -EFAULT)) { + up_read(&dst_mm->mmap_sem); + BUG_ON(!page); + + err = copy_huge_page_from_user(page, + (const void __user *)src_addr, + pages_per_huge_page(h), true); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + down_read(&dst_mm->mmap_sem); + + dst_vma = NULL; + goto retry; + } else + BUG_ON(page); + + if (!err) { + dst_addr += vma_hpagesize; + src_addr += vma_hpagesize; + copied += vma_hpagesize; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&dst_mm->mmap_sem); +out: + if (page) { + /* + * We encountered an error and are about to free a newly + * allocated huge page. + * + * Reservation handling is very subtle, and is different for + * private and shared mappings. See the routine + * restore_reserve_on_error for details. Unfortunately, we + * can not call restore_reserve_on_error now as it would + * require holding mmap_sem. + * + * If a reservation for the page existed in the reservation + * map of a private mapping, the map was modified to indicate + * the reservation was consumed when the page was allocated. + * We clear the PagePrivate flag now so that the global + * reserve count will not be incremented in free_huge_page. + * The reservation map will still indicate the reservation + * was consumed and possibly prevent later page allocation. + * This is better than leaking a global reservation. If no + * reservation existed, it is still safe to clear PagePrivate + * as no adjustments to reservation counts were made during + * allocation. + * + * The reservation map for shared mappings indicates which + * pages have reservations. 
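The -EFAULT leg above is the delicate part of the hugetlb path: the destination page is obtained with mmap_sem held, but copying a whole huge page from user space can fault and sleep, so the copy must run with mmap_sem released, and once it has been dropped the cached vma can no longer be trusted (hence dst_vma = NULL and the jump back to retry). A reduced sketch of just that step, with a hypothetical wrapper name (example_copy_outside_mmap_sem):

#include <linux/mm.h>
#include <linux/rwsem.h>

static long example_copy_outside_mmap_sem(struct mm_struct *mm, struct page *page,
                                          unsigned long src_addr,
                                          unsigned int pages_per_hpage)
{
        long not_copied;

        up_read(&mm->mmap_sem);                 /* the source may fault, so drop the lock */
        not_copied = copy_huge_page_from_user(page,
                                              (const void __user *)src_addr,
                                              pages_per_hpage, true);
        down_read(&mm->mmap_sem);               /* the address space may have changed... */

        return not_copied ? -EFAULT : 0;        /* ...on success the caller retries with a
                                                 * fresh find_vma() */
}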
When a huge page is allocated + * for an address with a reservation, no change is made to + * the reserve map. In this case PagePrivate will be set + * to indicate that the global reservation count should be + * incremented when the page is freed. This is the desired + * behavior. However, when a huge page is allocated for an + * address without a reservation a reservation entry is added + * to the reservation map, and PagePrivate will not be set. + * When the page is freed, the global reserve count will NOT + * be incremented and it will appear as though we have leaked + * reserved page. In this case, set PagePrivate so that the + * global reserve count will be incremented to match the + * reservation map entry which was created. + * + * Note that vm_alloc_shared is based on the flags of the vma + * for which the page was originally allocated. dst_vma could + * be different or NULL on error. + */ + if (vm_alloc_shared) + SetPagePrivate(page); + else + ClearPagePrivate(page); + put_page(page); + } + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? copied : err; +} +#else /* !CONFIG_HUGETLB_PAGE */ +/* fail at build time if gcc attempts to use this */ +extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage); +#endif /* CONFIG_HUGETLB_PAGE */ + static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, @@ -175,13 +406,28 @@ retry: */ err = -EINVAL; dst_vma = find_vma(dst_mm, dst_start); - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + if (!dst_vma) goto out_unlock; + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; /* + * If this is a HUGETLB vma, pass off to appropriate routine + */ + if (is_vm_hugetlb_page(dst_vma)) + return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, + src_start, len, zeropage); + + /* * Be strict and only allow __mcopy_atomic on userfaultfd * registered ranges to prevent userland errors going * unnoticed. As far as the VM consistency is concerned, it @@ -193,11 +439,7 @@ retry: if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; - /* - * FIXME: only allow copying on anonymous vmas, tmpfs should - * be added. - */ - if (dst_vma->vm_ops) + if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; /* @@ -206,7 +448,7 @@ retry: * dst_vma. 
*/ err = -ENOMEM; - if (unlikely(anon_vma_prepare(dst_vma))) + if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; while (src_addr < src_start + len) { @@ -243,12 +485,21 @@ retry: BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - if (!zeropage) - err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, &page); - else - err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, - dst_addr); + if (vma_is_anonymous(dst_vma)) { + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, + &page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); + } else { + err = -EINVAL; /* if zeropage is true return -EINVAL */ + if (likely(!zeropage)) + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, + dst_vma, dst_addr, + src_addr, &page); + } cond_resched(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3ca82d44edd3..d89034a393f2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1662,7 +1662,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); @@ -1724,7 +1724,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } @@ -2309,7 +2309,7 @@ EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { - return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; + return rb_entry_safe(n, struct vmap_area, rb_node); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 532a2a750952..7bb23ff229b6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -234,22 +234,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat) pgdat_reclaimable_pages(pgdat) * 6; } -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) +/** + * lruvec_lru_size - Returns the number of pages on the given LRU list. 
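A worked example of the computation that follows, with purely illustrative numbers: suppose a node's inactive-file LRU holds 1,000,000 pages, of which 300,000 sit in zones above the requested zone_idx (for instance ZONE_NORMAL pages while reclaim is constrained to ZONE_DMA32). The loop below subtracts those 300,000, so the caller sees lru_size = 1,000,000 - 300,000 = 700,000 and only counts pages it is actually eligible to reclaim; passing MAX_NR_ZONES makes the loop a no-op and returns the full per-node figure.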
+ * @lruvec: lru vector + * @lru: lru to use + * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) + */ +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { + unsigned long lru_size; + int zid; + if (!mem_cgroup_disabled()) - return mem_cgroup_get_lru_size(lruvec, lru); + lru_size = mem_cgroup_get_lru_size(lruvec, lru); + else + lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); - return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); -} + for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + unsigned long size; -unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zone_idx) -{ - if (!mem_cgroup_disabled()) - return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + if (!managed_zone(zone)) + continue; + + if (!mem_cgroup_disabled()) + size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + else + size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid], + NR_ZONE_LRU_BASE + lru); + lru_size -= min(size, lru_size); + } + + return lru_size; - return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], - NR_ZONE_LRU_BASE + lru); } /* @@ -912,6 +929,17 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } +struct reclaim_stat { + unsigned nr_dirty; + unsigned nr_unqueued_dirty; + unsigned nr_congested; + unsigned nr_writeback; + unsigned nr_immediate; + unsigned nr_activate; + unsigned nr_ref_keep; + unsigned nr_unmap_fail; +}; + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -919,22 +947,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, enum ttu_flags ttu_flags, - unsigned long *ret_nr_dirty, - unsigned long *ret_nr_unqueued_dirty, - unsigned long *ret_nr_congested, - unsigned long *ret_nr_writeback, - unsigned long *ret_nr_immediate, + struct reclaim_stat *stat, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_reclaimed = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; + unsigned nr_unqueued_dirty = 0; + unsigned nr_dirty = 0; + unsigned nr_congested = 0; + unsigned nr_reclaimed = 0; + unsigned nr_writeback = 0; + unsigned nr_immediate = 0; + unsigned nr_ref_keep = 0; + unsigned nr_unmap_fail = 0; cond_resched(); @@ -1073,6 +1099,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: + nr_ref_keep++; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1110,6 +1137,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : (ttu_flags | TTU_BATCH_FLUSH))) { case SWAP_FAIL: + nr_unmap_fail++; goto activate_locked; case SWAP_AGAIN: goto keep_locked; @@ -1276,11 +1304,16 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - *ret_nr_dirty += nr_dirty; - *ret_nr_congested += nr_congested; - *ret_nr_unqueued_dirty += nr_unqueued_dirty; - *ret_nr_writeback += nr_writeback; - *ret_nr_immediate += nr_immediate; + if (stat) { + stat->nr_dirty = nr_dirty; + stat->nr_congested = nr_congested; + stat->nr_unqueued_dirty = nr_unqueued_dirty; + stat->nr_writeback = nr_writeback; + 
stat->nr_immediate = nr_immediate; + stat->nr_activate = pgactivate; + stat->nr_ref_keep = nr_ref_keep; + stat->nr_unmap_fail = nr_unmap_fail; + } return nr_reclaimed; } @@ -1292,7 +1325,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; + unsigned long ret; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1305,8 +1338,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1437,6 +1469,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long skipped = 0, total_skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); @@ -1488,14 +1521,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ if (!list_empty(&pages_skipped)) { int zid; - unsigned long total_skipped = 0; for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); - total_skipped += nr_skipped[zid]; + skipped += nr_skipped[zid]; } /* @@ -1503,13 +1535,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * close to unreclaimable. If the LRU list is empty, account * skipped pages as a full scan. */ - scan += list_empty(src) ? total_skipped : total_skipped >> 2; + total_skipped = list_empty(src) ? skipped : skipped >> 2; list_splice(&pages_skipped, src); } - *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, - nr_taken, mode, is_file_lru(lru)); + *nr_scanned = scan + total_skipped; + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, + scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -1669,30 +1701,6 @@ static int current_may_throttle(void) bdi_write_congested(current->backing_dev_info); } -static bool inactive_reclaimable_pages(struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) -{ - int zid; - struct zone *zone; - int file = is_file_lru(lru); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - - if (!global_reclaim(sc)) - return true; - - for (zid = sc->reclaim_idx; zid >= 0; zid--) { - zone = &pgdat->node_zones[zid]; - if (!managed_zone(zone)) - continue; - - if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + - LRU_FILE * file) >= SWAP_CLUSTER_MAX) - return true; - } - - return false; -} - /* * shrink_inactive_list() is a helper for shrink_node(). 
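The five separate output pointers of shrink_page_list() are folded above into a single struct reclaim_stat, and callers that do not need the numbers pass NULL (as reclaim_clean_pages_from_list() now does) instead of a row of dummy variables. A minimal sketch of that calling convention with hypothetical names (example_stat, example_work):

#include <linux/stddef.h>

struct example_stat {
        unsigned nr_dirty;
        unsigned nr_writeback;
};

/* The worker fills in counters only when the caller supplied a struct. */
static unsigned long example_work(struct example_stat *stat)
{
        unsigned nr_dirty = 0, nr_writeback = 0;

        /* ... do the work, bumping the local counters ... */

        if (stat) {
                stat->nr_dirty = nr_dirty;
                stat->nr_writeback = nr_writeback;
        }
        return 0;       /* the primary result stays in the return value */
}

static void example_callers(void)
{
        struct example_stat stat = {};

        example_work(&stat);    /* interested caller reads stat.nr_dirty afterwards */
        example_work(NULL);     /* uninterested caller needs no dummy variables */
}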
It returns the number * of reclaimed pages @@ -1705,19 +1713,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; + struct reclaim_stat stat = {}; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - if (!inactive_reclaimable_pages(lruvec, sc, lru)) - return 0; - while (unlikely(too_many_isolated(pgdat, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1754,9 +1755,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, - &nr_dirty, &nr_unqueued_dirty, &nr_congested, - &nr_writeback, &nr_immediate, - false); + &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -1790,7 +1789,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * of pages under pages flagged for immediate reclaim and stall if any * are encountered in the nr_immediate check below. */ - if (nr_writeback && nr_writeback == nr_taken) + if (stat.nr_writeback && stat.nr_writeback == nr_taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* @@ -1802,7 +1801,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * Tag a zone as congested if all the dirty pages scanned were * backed by a congested BDI and wait_iff_congested will stall. */ - if (nr_dirty && nr_dirty == nr_congested) + if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) set_bit(PGDAT_CONGESTED, &pgdat->flags); /* @@ -1811,7 +1810,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * the pgdat PGDAT_DIRTY and kswapd will start writing pages from * reclaim context. */ - if (nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); /* @@ -1820,7 +1819,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (nr_immediate && current_may_throttle()) + if (stat.nr_immediate && current_may_throttle()) congestion_wait(BLK_RW_ASYNC, HZ/10); } @@ -1835,6 +1834,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, + stat.nr_dirty, stat.nr_writeback, + stat.nr_congested, stat.nr_immediate, + stat.nr_activate, stat.nr_ref_keep, + stat.nr_unmap_fail, sc->priority, file); return nr_reclaimed; } @@ -1855,17 +1858,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. + * + * Returns the number of pages moved to the given lru. 
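[Editorial note] The mm/vmscan.c hunks above replace shrink_page_list()'s five ret_nr_* output parameters with a single struct reclaim_stat that the caller may pass as NULL when it does not need the breakdown (as reclaim_clean_pages_from_list() now does), while shrink_inactive_list() reads the fields back as stat.nr_writeback, stat.nr_dirty and so on. A minimal userspace sketch of that calling convention; the function name scan_pages() and the sample numbers are illustrative, not taken from the patch:

#include <stdio.h>
#include <string.h>

/* Stand-in for the kernel's new struct reclaim_stat (field subset only). */
struct reclaim_stat {
	unsigned nr_dirty;
	unsigned nr_writeback;
	unsigned nr_ref_keep;
	unsigned nr_unmap_fail;
};

/* Hypothetical worker: fills *stat only when the caller passed one,
 * mirroring how shrink_page_list() treats a NULL stat pointer. */
static unsigned scan_pages(unsigned nr_pages, struct reclaim_stat *stat)
{
	unsigned reclaimed = nr_pages / 2;	/* pretend half got reclaimed */
	unsigned dirty = nr_pages / 4;		/* pretend a quarter were dirty */

	if (stat) {
		memset(stat, 0, sizeof(*stat));
		stat->nr_dirty = dirty;
		stat->nr_writeback = dirty / 2;
	}
	return reclaimed;
}

int main(void)
{
	struct reclaim_stat stat;

	/* Caller that wants the detail, like shrink_inactive_list(). */
	unsigned reclaimed = scan_pages(128, &stat);

	printf("reclaimed=%u dirty=%u writeback=%u\n",
	       reclaimed, stat.nr_dirty, stat.nr_writeback);

	/* Caller that only needs the return value, like
	 * reclaim_clean_pages_from_list(), simply passes NULL. */
	printf("reclaimed=%u (no stats requested)\n", scan_pages(128, NULL));
	return 0;
}

The same struct is what feeds the extra arguments now passed to the trace_mm_vmscan_lru_shrink_inactive tracepoint.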
*/ -static void move_active_pages_to_lru(struct lruvec *lruvec, +static unsigned move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *list, struct list_head *pages_to_free, enum lru_list lru) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); - unsigned long pgmoved = 0; struct page *page; int nr_pages; + int nr_moved = 0; while (!list_empty(list)) { page = lru_to_page(list); @@ -1877,7 +1882,6 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, nr_pages = hpage_nr_pages(page); update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_move(&page->lru, &lruvec->lists[lru]); - pgmoved += nr_pages; if (put_page_testzero(page)) { __ClearPageLRU(page); @@ -1891,11 +1895,15 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, pages_to_free); + } else { + nr_moved += nr_pages; } } if (!is_active_lru(lru)) - __count_vm_events(PGDEACTIVATE, pgmoved); + __count_vm_events(PGDEACTIVATE, nr_moved); + + return nr_moved; } static void shrink_active_list(unsigned long nr_to_scan, @@ -1911,7 +1919,8 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_inactive); struct page *page; struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - unsigned long nr_rotated = 0; + unsigned nr_deactivate, nr_activate; + unsigned nr_rotated = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -1989,13 +1998,15 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); - move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); + trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, + nr_deactivate, nr_rotated, sc->priority, file); } /* @@ -2025,14 +2036,13 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc) + struct scan_control *sc, bool trace) { unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; + unsigned long inactive, active; + enum lru_list inactive_lru = file * LRU_FILE; + enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; unsigned long gb; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - int zid; /* * If we don't have swap space, anonymous page deactivation @@ -2041,27 +2051,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!file && !total_swap_pages) return false; - inactive = lruvec_lru_size(lruvec, file * LRU_FILE); - active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); - - /* - * For zone-constrained allocations, it is necessary to check if - * deactivations are required for lowmem to be reclaimed. This - * calculates the inactive/active pages available in eligible zones. 
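[Editorial note] The open-coded loop being removed from inactive_list_is_low() here and the reworked lruvec_lru_size() at the top of this mm/vmscan.c diff compute the same quantity: the LRU size restricted to zones at or below a given index, obtained by starting from the node-wide count and subtracting the zones above it, clamped so it never underflows. A toy model of that arithmetic; MAX_NR_ZONES, the zone names and the page counts below are made-up sample data, not values from the patch:

#include <stdio.h>

#define MAX_NR_ZONES 4	/* illustrative; the real value is per-arch config */

/* Start from the node-wide LRU size and subtract the pages sitting in
 * zones above the requested zone index, i.e. zones ineligible for this
 * reclaim attempt.  Passing MAX_NR_ZONES keeps the whole list, matching
 * the new kerneldoc on lruvec_lru_size(). */
static unsigned long eligible_lru_size(const unsigned long zone_lru[MAX_NR_ZONES],
				       int zone_idx)
{
	unsigned long size = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		size += zone_lru[zid];			/* node-wide total */

	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
		unsigned long z = zone_lru[zid];

		size -= z < size ? z : size;		/* i.e. min(z, size) */
	}
	return size;
}

int main(void)
{
	/* Made-up per-zone LRU page counts: DMA, DMA32, NORMAL, MOVABLE. */
	unsigned long zone_lru[MAX_NR_ZONES] = { 10, 300, 5000, 2000 };

	printf("whole list   : %lu pages\n", eligible_lru_size(zone_lru, MAX_NR_ZONES));
	printf("up to NORMAL : %lu pages\n", eligible_lru_size(zone_lru, 2));
	printf("up to DMA32  : %lu pages\n", eligible_lru_size(zone_lru, 1));
	return 0;
}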
- */ - for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { - struct zone *zone = &pgdat->node_zones[zid]; - unsigned long inactive_zone, active_zone; - - if (!managed_zone(zone)) - continue; - - inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); - active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); - - inactive -= min(inactive, inactive_zone); - active -= min(active, active_zone); - } + inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); + active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -2069,6 +2060,13 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, else inactive_ratio = 1; + if (trace) + trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, + sc->reclaim_idx, + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, + inactive_ratio, file); + return inactive * inactive_ratio < active; } @@ -2076,7 +2074,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2207,8 +2205,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, sc) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + if (!inactive_list_is_low(lruvec, true, sc, false) && + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2234,10 +2232,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2275,7 +2273,7 @@ out: unsigned long size; unsigned long scan; - size = lruvec_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); scan = size >> sc->priority; if (!scan && pass && force_scan) @@ -2432,7 +2430,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. 
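[Editorial note] inactive_list_is_low(), reworked above, decides whether to deactivate pages by checking inactive * inactive_ratio < active, where gb = (inactive + active) >> (30 - PAGE_SHIFT) converts the page counts to gigabytes. The sizing table in the comment above the function (only its last row, "10TB 320 32GB", is visible in this hunk) documents the intended targets. A small worked sketch of those numbers; it assumes the ratio is int_sqrt(10 * gb), the assignment that sits in unchanged context outside the visible hunk:

#include <stdio.h>

/* Plain integer square root, standing in for the kernel's int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* Total (inactive + active) LRU sizes: 1GB, 10GB, 100GB, 1TB, 10TB. */
	unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
	unsigned i;

	for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
		unsigned long gb = sizes_gb[i];
		unsigned long ratio = isqrt(10 * gb);	/* assumed formula */

		/* Deactivation kicks in once inactive * ratio < active, so
		 * the inactive list is kept near total / (ratio + 1). */
		printf("%6luGB total -> ratio %4lu -> inactive target ~%lu MB\n",
		       gb, ratio, gb * 1024 / (ratio + 1));
	}
	return 0;
}

For 10TB this prints a ratio of 320 and a target of roughly 32GB, which is what the visible table row states.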
*/ - if (inactive_list_is_low(lruvec, false, sc)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -3082,7 +3080,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, sc)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); diff --git a/mm/vmstat.c b/mm/vmstat.c index 7c28df36f50f..69f9aff39a2e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1038,6 +1038,8 @@ const char * const vmstat_text[] = { "compact_fail", "compact_success", "compact_daemon_wake", + "compact_daemon_migrate_scanned", + "compact_daemon_free_scanned", #endif #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/workingset.c b/mm/workingset.c index abb58ffa3c64..a67f5796b995 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -267,7 +267,7 @@ bool workingset_refault(void *shadow) } lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); rcu_read_unlock(); /* diff --git a/mm/z3fold.c b/mm/z3fold.c index 8f9e89ca1d31..207e5ddc87a2 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -50,7 +50,7 @@ #define ZHDR_SIZE_ALIGNED CHUNK_SIZE #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) -#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1) +#define BUDDY_MASK (0x3) struct z3fold_pool; struct z3fold_ops { @@ -109,7 +109,7 @@ struct z3fold_header { unsigned short middle_chunks; unsigned short last_chunks; unsigned short start_middle; - unsigned short first_num:NCHUNKS_ORDER; + unsigned short first_num:2; }; /* @@ -179,7 +179,11 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) return (struct z3fold_header *)(handle & PAGE_MASK); } -/* Returns buddy number */ +/* + * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle + * but that doesn't matter. because the masking will result in the + * correct buddy number. + */ static enum buddy handle_to_buddy(unsigned long handle) { struct z3fold_header *zhdr = handle_to_z3fold_header(handle); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9cc3c0b2c2c1..a1f24989ac23 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -25,7 +25,7 @@ * Usage of struct page flags: * PG_private: identifies the first component page * PG_private2: identifies the last component page - * PG_owner_priv_1: indentifies the huge component page + * PG_owner_priv_1: identifies the huge component page * */ @@ -364,7 +364,7 @@ static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { return kmem_cache_alloc(pool->zspage_cachep, flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); -}; +} static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) { @@ -2383,7 +2383,7 @@ struct zs_pool *zs_create_pool(const char *name) goto err; /* - * Iterate reversly, because, size of size_class that we want to use + * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size. 
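[Editorial note] The z3fold change above shrinks first_num to a two-bit field and pins BUDDY_MASK to 0x3, since a z3fold page holds at most four buddies; the new comment on handle_to_buddy() points out that the low handle bits may end up numerically smaller than first_num and that masking still recovers the right buddy. A standalone sketch of that modulo-4 round trip; encode_handle() is not part of the visible hunk, so the encode side below is inferred from the comment, and only the low two bits of a handle are modelled (the real handle also carries the page address in its upper bits):

#include <stdio.h>

#define BUDDY_MASK 0x3	/* two bits: at most four buddies per z3fold page */

static unsigned encode_buddy_bits(unsigned first_num, unsigned bud)
{
	return (first_num + bud) & BUDDY_MASK;
}

static unsigned decode_buddy(unsigned low_bits, unsigned first_num)
{
	return (low_bits - first_num) & BUDDY_MASK;	/* wraps modulo 4 */
}

int main(void)
{
	unsigned first_num, bud;

	/* The round trip is exact for every combination, even when the
	 * encoded bits are numerically smaller than first_num. */
	for (first_num = 0; first_num <= BUDDY_MASK; first_num++)
		for (bud = 0; bud <= BUDDY_MASK; bud++) {
			unsigned bits = encode_buddy_bits(first_num, bud);

			printf("first_num=%u bud=%u -> bits=%u -> decoded=%u\n",
			       first_num, bud, bits,
			       decode_buddy(bits, first_num));
		}
	return 0;
}

For example first_num=3, bud=2 encodes to bits=1, which is below first_num, yet (1 - 3) & 0x3 still decodes back to buddy 2.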
*/ for (i = zs_size_classes - 1; i >= 0; i--) { diff --git a/scripts/Lindent b/scripts/Lindent index 6d889de4e70b..57b564c24d61 100755 --- a/scripts/Lindent +++ b/scripts/Lindent @@ -1,21 +1,25 @@ #!/bin/sh + PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1" -RES=`indent --version` + +RES=`indent --version | cut -d' ' -f3` if [ "$RES" = "" ]; then exit 1 fi -V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1` -V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2` -V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3` +V1=`echo $RES | cut -d'.' -f1` +V2=`echo $RES | cut -d'.' -f2` +V3=`echo $RES | cut -d'.' -f3` + if [ $V1 -gt 2 ]; then PARAM="$PARAM -il0" elif [ $V1 -eq 2 ]; then if [ $V2 -gt 2 ]; then - PARAM="$PARAM -il0"; + PARAM="$PARAM -il0" elif [ $V2 -eq 2 ]; then if [ $V3 -ge 10 ]; then PARAM="$PARAM -il0" fi fi fi + indent $PARAM "$@" diff --git a/scripts/checkincludes.pl b/scripts/checkincludes.pl index 97b2c6143fe4..381c018a4612 100755 --- a/scripts/checkincludes.pl +++ b/scripts/checkincludes.pl @@ -37,6 +37,8 @@ if ($#ARGV >= 1) { } } +my $dup_counter = 0; + foreach my $file (@ARGV) { open(my $f, '<', $file) or die "Cannot open $file: $!.\n"; @@ -57,6 +59,7 @@ foreach my $file (@ARGV) { foreach my $filename (keys %includedfiles) { if ($includedfiles{$filename} > 1) { print "$file: $filename is included more than once.\n"; + ++$dup_counter; } } next; @@ -73,6 +76,7 @@ foreach my $file (@ARGV) { if ($includedfiles{$filename} > 1) { $includedfiles{$filename}--; $dups++; + ++$dup_counter; } else { print {$f} $_; } @@ -87,3 +91,7 @@ foreach my $file (@ARGV) { } close($f); } + +if ($dup_counter == 0) { + print "No duplicate includes found.\n"; +} diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index dd8397894d5c..694a075381b0 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -78,6 +78,9 @@ my (@stack, $re, $dre, $x, $xs, $funcre); } elsif ($arch eq 'mips') { #88003254: 27bdffe0 addiu sp,sp,-32 $re = qr/.*addiu.*sp,sp,-(([0-9]{2}|[3-9])[0-9]{2})/o; + } elsif ($arch eq 'nios2') { + #25a8: defffb04 addi sp,sp,-20 + $re = qr/.*addi.*sp,sp,-(([0-9]{2}|[3-9])[0-9]{2})/o; } elsif ($arch eq 'parisc' || $arch eq 'parisc64') { $re = qr/.*ldo ($x{1,8})\(sp\),sp/o; } elsif ($arch eq 'ppc') { diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 163c720d3f2b..b3a1994b5df7 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -16,6 +16,7 @@ absense||absence absolut||absolute absoulte||absolute acccess||access +acceess||access acceleratoin||acceleration accelleration||acceleration accesing||accessing @@ -39,13 +40,14 @@ achitecture||architecture acient||ancient acitions||actions acitve||active -acknowldegement||acknowldegement +acknowldegement||acknowledgment acknowledgement||acknowledgment ackowledge||acknowledge ackowledged||acknowledged acording||according activete||activate acumulating||accumulating +acumulator||accumulator adapater||adapter addional||additional additionaly||additionally @@ -184,6 +186,7 @@ cacluated||calculated caculation||calculation calender||calendar calle||called +callibration||calibration calucate||calculate calulate||calculate cancelation||cancellation @@ -244,6 +247,7 @@ compatiblity||compatibility competion||completion compilant||compliant compleatly||completely +completition||completion completly||completely complient||compliant componnents||components @@ -257,6 +261,7 @@ conected||connected configuratoin||configuration configuraton||configuration configuretion||configuration +configutation||configuration conider||consider 
conjuction||conjunction connectinos||connections @@ -317,6 +322,7 @@ dependant||dependent depreacted||deprecated depreacte||deprecate desactivate||deactivate +desciptor||descriptor desciptors||descriptors descripton||description descrition||description @@ -417,9 +423,12 @@ extention||extension extracter||extractor faild||failed faill||fail +failied||failed +faillure||failure failue||failure failuer||failure faireness||fairness +falied||failed faliure||failure familar||familiar fatser||faster @@ -441,6 +450,7 @@ forseeable||foreseeable forse||force fortan||fortran forwardig||forwarding +framming||framing framwork||framework frequncy||frequency frome||from @@ -482,6 +492,7 @@ howver||however hsould||should hypter||hyper identidier||identifier +illigal||illegal imblance||imbalance immeadiately||immediately immedaite||immediate @@ -520,6 +531,7 @@ informtion||information infromation||information ingore||ignore inital||initial +initalized||initialized initalised||initialized initalise||initialize initalize||initialize @@ -532,6 +544,7 @@ initilize||initialize inofficial||unofficial insititute||institute instal||install +instanciated||instantiated inteface||interface integreated||integrated integrety||integrity @@ -557,9 +570,10 @@ intialized||initialized intialize||initialize intregral||integral intrrupt||interrupt +intterrupt||interrupt intuative||intuitive invaid||invalid -invalde||invald +invalde||invalid invalide||invalid invididual||individual invokation||invocation @@ -567,6 +581,8 @@ invokations||invocations irrelevent||irrelevant isnt||isn't isssue||issue +iternations||iterations +itertation||iteration itslef||itself jave||java jeffies||jiffies @@ -621,6 +637,7 @@ messsage||message messsages||messages microprocesspr||microprocessor milliseonds||milliseconds +minium||minimum minumum||minimum miscelleneous||miscellaneous misformed||malformed @@ -668,6 +685,7 @@ occurances||occurrences occured||occurred occurence||occurrence occure||occurred +occured||occurred occuring||occurring offet||offset omitt||omit @@ -681,8 +699,10 @@ optionnal||optional optmizations||optimizations orientatied||orientated orientied||oriented +orignal||original otherise||otherwise ouput||output +oustanding||outstanding overaall||overall overhread||overhead overlaping||overlapping @@ -705,6 +725,7 @@ paramter||parameter paramters||parameters particuarly||particularly particularily||particularly +partiton||partition pased||passed passin||passing pathes||paths @@ -724,6 +745,7 @@ pleaes||please ploting||plotting plugable||pluggable poinnter||pointer +pointeur||pointer poiter||pointer posible||possible positon||position @@ -752,6 +774,7 @@ procceed||proceed proccesors||processors procesed||processed proces||process +procesing||processing processessing||processing processess||processes processpr||processor @@ -780,6 +803,7 @@ protable||portable protcol||protocol protecion||protection protocoll||protocol +promixity||proximity psudo||pseudo psuedo||pseudo psychadelic||psychedelic @@ -801,6 +825,7 @@ recommanded||recommended recyle||recycle redircet||redirect redirectrion||redirection +reename||rename refcounf||refcount refence||reference refered||referred @@ -944,7 +969,9 @@ suble||subtle substract||subtract succesfully||successfully succesful||successful +successed||succeeded successfull||successful +successfuly||successfully sucessfully||successfully sucess||success superflous||superfluous @@ -960,6 +987,7 @@ suppport||support supress||suppress surpresses||suppresses susbsystem||subsystem +suspeneded||suspended 
suspicously||suspiciously swaping||swapping switchs||switches @@ -967,6 +995,7 @@ symetric||symmetric synax||syntax synchonized||synchronized syncronize||synchronize +syncronized||synchronized syncronizing||synchronizing syncronus||synchronous syste||system @@ -991,7 +1020,7 @@ tramsmitted||transmitted tramsmit||transmit tranfer||transfer transciever||transceiver -transferd||transferrd +transferd||transferred transfered||transferred transfering||transferring transision||transition @@ -1005,22 +1034,30 @@ ture||true tyep||type udpate||update uesd||used +uncommited||uncommitted unconditionaly||unconditionally underun||underrun unecessary||unnecessary unexecpted||unexpected +unexpcted||unexpected unexpectd||unexpected unexpeted||unexpected +unexpexted||unexpected unfortunatelly||unfortunately unifiy||unify unintialized||uninitialized +unkmown||unknown unknonw||unknown unknow||unknown unkown||unknown unneedingly||unnecessarily +unnsupported||unsupported +unmached||unmatched unresgister||unregister +unrgesiter||unregister unsinged||unsigned unstabel||unstable +unsolicitied||unsolicited unsuccessfull||unsuccessful unsuported||unsupported untill||until diff --git a/scripts/tags.sh b/scripts/tags.sh index df5fa777d300..d661f2f3ef61 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -128,6 +128,8 @@ all_target_sources() all_kconfigs() { + find ${tree}arch/ -maxdepth 1 $ignore \ + -name "Kconfig*" -not -type l -print; for arch in $ALLSOURCE_ARCHS; do find_sources $arch 'Kconfig*' done diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index bbab7f4664ac..900dfaf81051 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -10,6 +10,8 @@ BINARIES += on-fault-limit BINARIES += thuge-gen BINARIES += transhuge-stress BINARIES += userfaultfd +BINARIES += userfaultfd_hugetlb +BINARIES += userfaultfd_shmem BINARIES += mlock-random-test all: $(BINARIES) @@ -18,6 +20,12 @@ all: $(BINARIES) userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h $(CC) $(CFLAGS) -O2 -o $@ $< -lpthread +userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h + $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread + +userfaultfd_shmem: userfaultfd.c ../../../../usr/include/linux/kernel.h + $(CC) $(CFLAGS) -DSHMEM_TEST -O2 -o $@ $< -lpthread + mlock-random-test: mlock-random-test.c $(CC) $(CFLAGS) -o $@ $< -lcap diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index e11968b3677e..c92f6cf31d0a 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -103,6 +103,30 @@ else echo "[PASS]" fi +echo "----------------------------" +echo "running userfaultfd_hugetlb" +echo "----------------------------" +# 258MB total huge pages == 128MB src and 128MB dst +./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi +rm -f $mnt/ufd_test_file + +echo "----------------------------" +echo "running userfaultfd_shmem" +echo "----------------------------" +./userfaultfd_shmem 128 32 +if [ $? 
-ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + #cleanup umount $mnt rm -rf $mnt diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index d77ed41b2094..5a840a605a16 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -63,6 +63,7 @@ #include <sys/mman.h> #include <sys/syscall.h> #include <sys/ioctl.h> +#include <sys/wait.h> #include <pthread.h> #include <linux/userfaultfd.h> @@ -76,8 +77,12 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; #define BOUNCE_POLL (1<<3) static int bounces; +#ifdef HUGETLB_TEST +static int huge_fd; +static char *huge_fd_off0; +#endif static unsigned long long *count_verify; -static int uffd, finished, *pipefd; +static int uffd, uffd_flags, finished, *pipefd; static char *area_src, *area_dst; static char *zeropage; pthread_attr_t attr; @@ -97,6 +102,102 @@ pthread_attr_t attr; ~(unsigned long)(sizeof(unsigned long long) \ - 1))) +#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) + +/* Anonymous memory */ +#define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ + (1 << _UFFDIO_COPY) | \ + (1 << _UFFDIO_ZEROPAGE)) + +static int release_pages(char *rel_area) +{ + int ret = 0; + + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) { + perror("madvise"); + ret = 1; + } + + return ret; +} + +static void allocate_area(void **alloc_area) +{ + if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) { + fprintf(stderr, "out of memory\n"); + *alloc_area = NULL; + } +} + +#else /* HUGETLB_TEST or SHMEM_TEST */ + +#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC + +#ifdef HUGETLB_TEST + +/* HugeTLB memory */ +static int release_pages(char *rel_area) +{ + int ret = 0; + + if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + rel_area == huge_fd_off0 ? 0 : + nr_pages * page_size, + nr_pages * page_size)) { + perror("fallocate"); + ret = 1; + } + + return ret; +} + + +static void allocate_area(void **alloc_area) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_HUGETLB, huge_fd, + *alloc_area == area_src ? 
0 : + nr_pages * page_size); + if (*alloc_area == MAP_FAILED) { + fprintf(stderr, "mmap of hugetlbfs file failed\n"); + *alloc_area = NULL; + } + + if (*alloc_area == area_src) + huge_fd_off0 = *alloc_area; +} + +#elif defined(SHMEM_TEST) + +/* Shared memory */ +static int release_pages(char *rel_area) +{ + int ret = 0; + + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) { + perror("madvise"); + ret = 1; + } + + return ret; +} + +static void allocate_area(void **alloc_area) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (*alloc_area == MAP_FAILED) { + fprintf(stderr, "shared memory mmap failed\n"); + *alloc_area = NULL; + } +} + +#else /* SHMEM_TEST */ +#error "Undefined test type" +#endif /* HUGETLB_TEST */ + +#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */ + static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; @@ -217,7 +318,7 @@ static void *locking_thread(void *arg) return NULL; } -static int copy_page(unsigned long offset) +static int copy_page(int ufd, unsigned long offset) { struct uffdio_copy uffdio_copy; @@ -229,7 +330,7 @@ static int copy_page(unsigned long offset) uffdio_copy.len = page_size; uffdio_copy.mode = 0; uffdio_copy.copy = 0; - if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) { + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { /* real retval in ufdio_copy.copy */ if (uffdio_copy.copy != -EEXIST) fprintf(stderr, "UFFDIO_COPY error %Ld\n", @@ -247,6 +348,7 @@ static void *uffd_poll_thread(void *arg) unsigned long cpu = (unsigned long) arg; struct pollfd pollfd[2]; struct uffd_msg msg; + struct uffdio_register uffd_reg; int ret; unsigned long offset; char tmp_chr; @@ -278,16 +380,35 @@ static void *uffd_poll_thread(void *arg) continue; perror("nonblocking read error"), exit(1); } - if (msg.event != UFFD_EVENT_PAGEFAULT) + switch (msg.event) { + default: fprintf(stderr, "unexpected msg event %u\n", msg.event), exit(1); - if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); - offset = (char *)(unsigned long)msg.arg.pagefault.address - - area_dst; - offset &= ~(page_size-1); - if (copy_page(offset)) - userfaults++; + break; + case UFFD_EVENT_PAGEFAULT: + if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + offset = (char *)(unsigned long)msg.arg.pagefault.address - + area_dst; + offset &= ~(page_size-1); + if (copy_page(uffd, offset)) + userfaults++; + break; + case UFFD_EVENT_FORK: + uffd = msg.arg.fork.ufd; + pollfd[0].fd = uffd; + break; + case UFFD_EVENT_MADVDONTNEED: + uffd_reg.range.start = msg.arg.madv_dn.start; + uffd_reg.range.len = msg.arg.madv_dn.end - + msg.arg.madv_dn.start; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + fprintf(stderr, "madv_dn failure\n"), exit(1); + break; + case UFFD_EVENT_REMAP: + area_dst = (char *)(unsigned long)msg.arg.remap.to; + break; + } } return (void *)userfaults; } @@ -324,7 +445,7 @@ static void *uffd_read_thread(void *arg) offset = (char *)(unsigned long)msg.arg.pagefault.address - area_dst; offset &= ~(page_size-1); - if (copy_page(offset)) + if (copy_page(uffd, offset)) (*this_cpu_userfaults)++; } return (void *)NULL; @@ -338,7 +459,7 @@ static void *background_thread(void *arg) for (page_nr = cpu * nr_pages_per_cpu; page_nr < (cpu+1) * nr_pages_per_cpu; page_nr++) - copy_page(page_nr * page_size); + copy_page(uffd, page_nr * page_size); return NULL; } @@ -384,10 +505,8 @@ static int 
stress(unsigned long *userfaults) * UFFDIO_COPY without writing zero pages into area_dst * because the background threads already completed). */ - if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) { - perror("madvise"); + if (release_pages(area_src)) return 1; - } for (cpu = 0; cpu < nr_cpus; cpu++) { char c; @@ -414,27 +533,9 @@ static int stress(unsigned long *userfaults) return 0; } -static int userfaultfd_stress(void) +static int userfaultfd_open(int features) { - void *area; - char *tmp_area; - unsigned long nr; - struct uffdio_register uffdio_register; struct uffdio_api uffdio_api; - unsigned long cpu; - int uffd_flags, err; - unsigned long userfaults[nr_cpus]; - - if (posix_memalign(&area, page_size, nr_pages * page_size)) { - fprintf(stderr, "out of memory\n"); - return 1; - } - area_src = area; - if (posix_memalign(&area, page_size, nr_pages * page_size)) { - fprintf(stderr, "out of memory\n"); - return 1; - } - area_dst = area; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); if (uffd < 0) { @@ -445,7 +546,7 @@ static int userfaultfd_stress(void) uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; - uffdio_api.features = 0; + uffdio_api.features = features; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { fprintf(stderr, "UFFDIO_API\n"); return 1; @@ -455,6 +556,235 @@ static int userfaultfd_stress(void) return 1; } + return 0; +} + +/* + * For non-cooperative userfaultfd test we fork() a process that will + * generate pagefaults, will mremap the area monitored by the + * userfaultfd and at last this process will release the monitored + * area. + * For the anonymous and shared memory the area is divided into two + * parts, the first part is accessed before mremap, and the second + * part is accessed after mremap. Since hugetlbfs does not support + * mremap, the entire monitored area is accessed in a single pass for + * HUGETLB_TEST. + * The release of the pages currently generates event only for + * anonymous memory (UFFD_EVENT_MADVDONTNEED), hence it is not checked + * for hugetlb and shmem. 
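[Editorial note] The new userfaultfd_open(features) helper above is what the non-cooperative tests build on: create the userfaultfd and negotiate the requested event features through UFFDIO_API. A trimmed standalone sketch of that handshake, requesting the same event bits the events test asks for later in this file's diff; it assumes a kernel and linux/userfaultfd.h that provide __NR_userfaultfd and these feature names, and error handling is reduced to perror():

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	struct uffdio_api api;
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (uffd < 0) {
		perror("userfaultfd");
		return 1;
	}

	api.api = UFFD_API;
	/* Ask for the non-cooperative events the new selftest exercises. */
	api.features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
		       UFFD_FEATURE_EVENT_MADVDONTNEED;
	if (ioctl(uffd, UFFDIO_API, &api)) {
		perror("UFFDIO_API");
		return 1;
	}

	printf("api %llu, supported ioctls %#llx\n",
	       (unsigned long long)api.api, (unsigned long long)api.ioctls);
	close(uffd);
	return 0;
}

If the running kernel does not support the requested features, the UFFDIO_API ioctl fails, which is why the selftest funnels every caller through this single helper.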
+ */ +static int faulting_process(void) +{ + unsigned long nr; + unsigned long long count; + +#ifndef HUGETLB_TEST + unsigned long split_nr_pages = (nr_pages + 1) / 2; +#else + unsigned long split_nr_pages = nr_pages; +#endif + + for (nr = 0; nr < split_nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + fprintf(stderr, + "nr %lu memory corruption %Lu %Lu\n", + nr, count, + count_verify[nr]), exit(1); + } + } + +#ifndef HUGETLB_TEST + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, area_src); + if (area_dst == MAP_FAILED) + perror("mremap"), exit(1); + + for (; nr < nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + fprintf(stderr, + "nr %lu memory corruption %Lu %Lu\n", + nr, count, + count_verify[nr]), exit(1); + } + } + +#ifndef SHMEM_TEST + if (release_pages(area_dst)) + return 1; + + for (nr = 0; nr < nr_pages; nr++) { + if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) + fprintf(stderr, "nr %lu is not zero\n", nr), exit(1); + } +#endif /* SHMEM_TEST */ + +#endif /* HUGETLB_TEST */ + + return 0; +} + +static int uffdio_zeropage(int ufd, unsigned long offset) +{ + struct uffdio_zeropage uffdio_zeropage; + int ret; + unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE); + + if (offset >= nr_pages * page_size) + fprintf(stderr, "unexpected offset %lu\n", + offset), exit(1); + uffdio_zeropage.range.start = (unsigned long) area_dst + offset; + uffdio_zeropage.range.len = page_size; + uffdio_zeropage.mode = 0; + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + if (ret) { + /* real retval in ufdio_zeropage.zeropage */ + if (has_zeropage) { + if (uffdio_zeropage.zeropage == -EEXIST) + fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"), + exit(1); + else + fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n", + uffdio_zeropage.zeropage), exit(1); + } else { + if (uffdio_zeropage.zeropage != -EINVAL) + fprintf(stderr, + "UFFDIO_ZEROPAGE not -EINVAL %Ld\n", + uffdio_zeropage.zeropage), exit(1); + } + } else if (has_zeropage) { + if (uffdio_zeropage.zeropage != page_size) { + fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", + uffdio_zeropage.zeropage), exit(1); + } else + return 1; + } else { + fprintf(stderr, + "UFFDIO_ZEROPAGE succeeded %Ld\n", + uffdio_zeropage.zeropage), exit(1); + } + + return 0; +} + +/* exercise UFFDIO_ZEROPAGE */ +static int userfaultfd_zeropage_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long expected_ioctls; + + printf("testing UFFDIO_ZEROPAGE: "); + fflush(stdout); + + if (release_pages(area_dst)) + return 1; + + if (userfaultfd_open(0) < 0) + return 1; + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + fprintf(stderr, "register failure\n"), exit(1); + + expected_ioctls = EXPECTED_IOCTLS; + if ((uffdio_register.ioctls & expected_ioctls) != + expected_ioctls) + fprintf(stderr, + "unexpected missing ioctl for anon memory\n"), + exit(1); + + if (uffdio_zeropage(uffd, 0)) { + if (my_bcmp(area_dst, zeropage, page_size)) + fprintf(stderr, "zeropage is not zero\n"), exit(1); + } + + close(uffd); + printf("done.\n"); + return 0; +} + +static int userfaultfd_events_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long expected_ioctls; + unsigned long userfaults; + pthread_t uffd_mon; + int err, features; + 
pid_t pid; + char c; + + printf("testing events (fork, remap, madv_dn): "); + fflush(stdout); + + if (release_pages(area_dst)) + return 1; + + features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | + UFFD_FEATURE_EVENT_MADVDONTNEED; + if (userfaultfd_open(features) < 0) + return 1; + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + fprintf(stderr, "register failure\n"), exit(1); + + expected_ioctls = EXPECTED_IOCTLS; + if ((uffdio_register.ioctls & expected_ioctls) != + expected_ioctls) + fprintf(stderr, + "unexpected missing ioctl for anon memory\n"), + exit(1); + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL)) + perror("uffd_poll_thread create"), exit(1); + + pid = fork(); + if (pid < 0) + perror("fork"), exit(1); + + if (!pid) + return faulting_process(); + + waitpid(pid, &err, 0); + if (err) + fprintf(stderr, "faulting process failed\n"), exit(1); + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + perror("pipe write"), exit(1); + if (pthread_join(uffd_mon, (void **)&userfaults)) + return 1; + + close(uffd); + printf("userfaults: %ld\n", userfaults); + + return userfaults != nr_pages; +} + +static int userfaultfd_stress(void) +{ + void *area; + char *tmp_area; + unsigned long nr; + struct uffdio_register uffdio_register; + unsigned long cpu; + int err; + unsigned long userfaults[nr_cpus]; + + allocate_area((void **)&area_src); + if (!area_src) + return 1; + allocate_area((void **)&area_dst); + if (!area_dst) + return 1; + + if (userfaultfd_open(0) < 0) + return 1; + count_verify = malloc(nr_pages * sizeof(unsigned long long)); if (!count_verify) { perror("count_verify"); @@ -528,9 +858,7 @@ static int userfaultfd_stress(void) fprintf(stderr, "register failure\n"); return 1; } - expected_ioctls = (1 << _UFFDIO_WAKE) | - (1 << _UFFDIO_COPY) | - (1 << _UFFDIO_ZEROPAGE); + expected_ioctls = EXPECTED_IOCTLS; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { fprintf(stderr, @@ -562,10 +890,8 @@ static int userfaultfd_stress(void) * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's * required to MADV_DONTNEED here. 
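[Editorial note] uffdio_zeropage() and userfaultfd_zeropage_test() above register area_dst in MISSING mode and resolve a fault with UFFDIO_ZEROPAGE. The following standalone sketch walks the same register/notify/resolve cycle for one anonymous page, with a second thread standing in for the faulting process; it is an illustration of the API flow under the same kernel support assumptions as the selftest, not part of it, and needs -lpthread to build:

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int uffd;
static char *area;
static long page_size;

/* Faulting side: the first read of the registered page blocks until the
 * main thread resolves the missing-page fault below. */
static void *toucher(void *arg)
{
	(void)arg;
	printf("first byte reads as %d\n", area[0]);
	return NULL;
}

int main(void)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	struct uffdio_zeropage zp;
	struct uffd_msg msg;
	pthread_t thr;

	page_size = sysconf(_SC_PAGESIZE);
	area = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (area == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api)) {
		perror("userfaultfd/UFFDIO_API");
		return 1;
	}

	/* Same kind of registration the zeropage test does on area_dst. */
	reg.range.start = (unsigned long)area;
	reg.range.len = page_size;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		perror("UFFDIO_REGISTER");
		return 1;
	}

	pthread_create(&thr, NULL, toucher, NULL);

	/* Wait for the missing-page notification, then resolve it with a
	 * zero page, which also wakes the blocked toucher thread. */
	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
	    msg.event != UFFD_EVENT_PAGEFAULT) {
		fprintf(stderr, "unexpected userfaultfd message\n");
		return 1;
	}

	zp.range.start = msg.arg.pagefault.address &
			 ~((unsigned long)page_size - 1);
	zp.range.len = page_size;
	zp.mode = 0;
	if (ioctl(uffd, UFFDIO_ZEROPAGE, &zp)) {
		perror("UFFDIO_ZEROPAGE");
		return 1;
	}

	pthread_join(thr, NULL);
	close(uffd);
	return 0;
}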
*/ - if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) { - perror("madvise 2"); + if (release_pages(area_dst)) return 1; - } /* bounce pass */ if (stress(userfaults)) @@ -603,9 +929,15 @@ static int userfaultfd_stress(void) printf("\n"); } - return err; + if (err) + return err; + + close(uffd); + return userfaultfd_zeropage_test() || userfaultfd_events_test(); } +#ifndef HUGETLB_TEST + int main(int argc, char **argv) { if (argc < 3) @@ -632,6 +964,74 @@ int main(int argc, char **argv) return userfaultfd_stress(); } +#else /* HUGETLB_TEST */ + +/* + * Copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +int main(int argc, char **argv) +{ + if (argc < 4) + fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"), + exit(1); + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + page_size = default_huge_page_size(); + if (!page_size) + fprintf(stderr, "Unable to determine huge page size\n"), + exit(2); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 + > page_size) + fprintf(stderr, "Impossible to run this test\n"), exit(2); + nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / + nr_cpus; + if (!nr_pages_per_cpu) { + fprintf(stderr, "invalid MiB\n"); + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + } + bounces = atoi(argv[2]); + if (bounces <= 0) { + fprintf(stderr, "invalid bounces\n"); + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755); + if (huge_fd < 0) { + fprintf(stderr, "Open of %s failed", argv[3]); + perror("open"); + exit(1); + } + if (ftruncate(huge_fd, 0)) { + fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); + perror("ftruncate"); + exit(1); + } + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} + +#endif #else /* __NR_userfaultfd */ #warning "missing __NR_userfaultfd definition" diff --git a/tools/vm/Makefile b/tools/vm/Makefile index 93aadaf7ff63..006029456988 100644 --- a/tools/vm/Makefile +++ b/tools/vm/Makefile @@ -9,6 +9,8 @@ CC = $(CROSS_COMPILE)gcc CFLAGS = -Wall -Wextra -I../lib/ LDFLAGS = $(LIBS) +all: $(TARGETS) + $(TARGETS): $(LIBS) $(LIBS): @@ -20,3 +22,9 @@ $(LIBS): clean: $(RM) page-types slabinfo page_owner_sort make -C $(LIB_DIR) clean + +sbindir ?= /usr/sbin + +install: all + install -d $(DESTDIR)$(sbindir) + install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) |
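[Editorial note] With the Makefile and run_vmtests changes above, the userfaultfd selftest source now builds three binaries: userfaultfd (anonymous memory), userfaultfd_hugetlb (compiled with -DHUGETLB_TEST and run by run_vmtests as "./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file" against the hugetlbfs mount the script already manages), and userfaultfd_shmem (compiled with -DSHMEM_TEST and run as "./userfaultfd_shmem 128 32"). The hugetlb variant takes a third argument, the hugetlbfs file, per its usage string, and needs enough huge pages reserved to back both the source and destination areas, as the comment added to run_vmtests notes. Separately, the tools/vm Makefile gains an install target that copies its binaries (page-types, slabinfo and page_owner_sort, per the clean rule) into $(DESTDIR)$(sbindir), defaulting to /usr/sbin.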