From b5330628546616af14ff23075fbf8d4ad91f6e25 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 8 Sep 2015 14:58:28 -0700 Subject: mm: introduce vma_is_anonymous(vma) helper special_mapping_fault() is absolutely broken. It seems it was always wrong, but this didn't matter until vdso/vvar started to use more than one page. And after this change vma_is_anonymous() becomes really trivial, it simply checks vm_ops == NULL. However, I do think the helper makes sense. There are a lot of ->vm_ops != NULL checks, the helper makes the caller's code more understandable (self-documented) and this is more grep-friendly. This patch (of 3): Preparation. Add the new simple helper, vma_is_anonymous(vma), and change handle_pte_fault() to use it. It will have more users. The name is not accurate, say a hpet_mmap()'ed vma is not anonymous. Perhaps it should be named vma_has_fault() instead. But it matches the logic in mmap.c/memory.c (see next changes). "True" just means that a page fault will use do_anonymous_page(). Signed-off-by: Oleg Nesterov Acked-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b257c43855b..dfb7ce05f1e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1260,6 +1260,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr) return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); } +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + static inline int stack_guard_page_start(struct vm_area_struct *vma, unsigned long addr) { -- cgit v1.2.3
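As an illustration, a sketch of the handle_pte_fault() call site this patch converts (argument lists abbreviated; not part of the diff above):

	/* before: open-coded and easy to misread */
	if (!vma->vm_ops)
		return do_anonymous_page(mm, vma, address, pte, pmd, flags);

	/* after: self-documenting and grep-friendly */
	if (vma_is_anonymous(vma))
		return do_anonymous_page(mm, vma, address, pte, pmd, flags);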
From e1b9996b85ba3ff143ded04523cd015762d20f03 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 8 Sep 2015 14:58:37 -0700 Subject: thp: vma_adjust_trans_huge(): adjust file-backed VMA too This series of patches adds support for using PMD page table entries to map DAX files. We expect NV-DIMMs to start showing up that are many gigabytes in size and the memory consumption of 4kB PTEs will be astronomical. The patch series leverages much of the Transparent Huge Pages infrastructure, going so far as to borrow one of Kirill's patches from his THP page cache series. This patch (of 10): Since we're going to have huge pages in page cache, we need to call vma_adjust_trans_huge() for file-backed VMAs too, since they can potentially contain huge pages. For now we call it for all VMAs. Probably later we will need to introduce a flag to indicate that the VMA has huge pages. Signed-off-by: Kirill A. Shutemov Signed-off-by: Matthew Wilcox Acked-by: Hillf Danton Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f10b20f05159..1c53c7d7ef7e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -122,7 +122,7 @@ extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, #endif extern int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); -extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, +extern void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); @@ -138,15 +138,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, else return 0; } -static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - long adjust_next) -{ - if (!vma->anon_vma || vma->vm_ops) - return; - __vma_adjust_trans_huge(vma, start, end, adjust_next); -} static inline int hpage_nr_pages(struct page *page) { if (unlikely(PageTransHuge(page))) -- cgit v1.2.3
Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 21 +++++++++++++++++++++ include/linux/fs.h | 14 -------------- 2 files changed, 21 insertions(+), 14 deletions(-) create mode 100644 include/linux/dax.h (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h new file mode 100644 index 000000000000..4f27d3dbf6e8 --- /dev/null +++ b/include/linux/dax.h @@ -0,0 +1,21 @@ +#ifndef _LINUX_DAX_H +#define _LINUX_DAX_H + +#include +#include +#include + +ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, + get_block_t, dio_iodone_t, int flags); +int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); +int dax_truncate_page(struct inode *, loff_t from, get_block_t); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); +int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); +#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) + +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index b2f9b9c25e41..72d8a844c692 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -52,7 +52,6 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; -struct vm_fault; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2678,19 +2677,6 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset, extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, - get_block_t, dio_iodone_t, int flags); -int dax_clear_blocks(struct inode *, sector_t block, long size); -int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); -int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, - dax_iodone_t); -int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, - dax_iodone_t); -int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) -#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) - #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, loff_t file_offset); -- cgit v1.2.3 From 4897c7655d9419ba7e62bac145ec6a1847134d93 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:45 -0700 Subject: thp: prepare for DAX huge pages Add a vma_is_dax() helper macro to test whether the VMA is DAX, and use it in zap_huge_pmd() and __split_huge_page_pmd(). [akpm@linux-foundation.org: fix build] Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A. 
Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 4f27d3dbf6e8..9b51f9d40ad9 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -18,4 +18,8 @@ int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); #define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) #define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) +static inline bool vma_is_dax(struct vm_area_struct *vma) +{ + return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); +} #endif -- cgit v1.2.3 From b96375f74a6d4f39fc6cbdc0bce5175115c7f96f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:48 -0700 Subject: mm: add a pmd_fault handler Allow non-anonymous VMAs to provide huge pages in response to a page fault. Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index dfb7ce05f1e3..e1fbd18b8c4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -249,6 +249,8 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*pmd_fault)(struct vm_area_struct *, unsigned long address, + pmd_t *, unsigned int flags); void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); /* notification that a previously read-only page is about to become -- cgit v1.2.3 From fc43704437ebe40f642ac53f7ee73661fe74e6b8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:51 -0700 Subject: mm: export various functions for the benefit of DAX To use the huge zero page in DAX, we need these functions exported. Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1c53c7d7ef7e..70587ea079c3 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -155,6 +155,16 @@ static inline bool is_huge_zero_page(struct page *page) return ACCESS_ONCE(huge_zero_page) == page; } +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return is_huge_zero_page(pmd_page(pmd)); +} + +struct page *get_huge_zero_page(void); +bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long haddr, + pmd_t *pmd, struct page *zero_page); + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) -- cgit v1.2.3 From 5cad465d7fa646bad3d677df276bfc8e2ad709e3 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:54 -0700 Subject: mm: add vmf_insert_pfn_pmd() Similar to vm_insert_pfn(), but for PMDs rather than PTEs. The 'vmf_' prefix instead of 'vm_' prefix is intended to indicate that it returns a VMF_ value rather than an errno (which would only have to be converted into a VMF_ value anyway). Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A. 
Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 70587ea079c3..f9b612fec4dd 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -33,6 +33,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); +int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, + unsigned long pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, -- cgit v1.2.3 From 844f35db1088dd1a9de37b53d4d823626232bd19 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Sep 2015 14:58:57 -0700 Subject: dax: add huge page fault support This is the support code for DAX-enabled filesystems to allow them to provide huge pages in response to faults. Signed-off-by: Matthew Wilcox Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Theodore Ts'o Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 9b51f9d40ad9..b415e521528d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,20 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, + unsigned int flags, get_block_t, dax_iodone_t); +int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, + unsigned int flags, get_block_t, dax_iodone_t); +#else +static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags, get_block_t gb, + dax_iodone_t di) +{ + return VM_FAULT_FALLBACK; +} +#define __dax_pmd_fault dax_pmd_fault +#endif int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); #define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) #define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) -- cgit v1.2.3 From d295e3415a88ae63a37a22652808b20c7fcb970e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 8 Sep 2015 14:59:34 -0700 Subject: dax: don't use set_huge_zero_page() This is another place where DAX assumed that pgtable_t was a pointer. Open code the important parts of set_huge_zero_page() in DAX and make set_huge_zero_page() static again. Signed-off-by: Kirill A. 
From d295e3415a88ae63a37a22652808b20c7fcb970e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 8 Sep 2015 14:59:34 -0700 Subject: dax: don't use set_huge_zero_page() This is another place where DAX assumed that pgtable_t was a pointer. Open code the important parts of set_huge_zero_page() in DAX and make set_huge_zero_page() static again. Signed-off-by: Kirill A. Shutemov Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f9b612fec4dd..ecb080d6ff42 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -163,9 +163,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) } struct page *get_huge_zero_page(void); -bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long haddr, - pmd_t *pmd, struct page *zero_page); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) -- cgit v1.2.3
From b5e3aa0a4d5e35329203fd09acb0dbc7f7fd64de Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 8 Sep 2015 14:59:56 -0700 Subject: mm: remove put_page_unless_one() It has no callers. Signed-off-by: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e1fbd18b8c4b..4ec72ef1f04a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -361,18 +361,6 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } -/* - * Try to drop a ref unless the page has a refcount of one, return false if - * that is the case. - * This is to make sure that the refcount won't become zero after this drop. - * This can be called when MMU is off so it must not access - * any of the virtual mappings. - */ -static inline int put_page_unless_one(struct page *page) -{ - return atomic_add_unless(&page->_count, -1, 1); -} - extern int page_is_ram(unsigned long pfn); extern int region_is_ram(resource_size_t phys_addr, unsigned long size); -- cgit v1.2.3
From 8334b96221ff0dcbde4873d31eb4d84774ed8ed4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 8 Sep 2015 15:00:24 -0700 Subject: mm: /proc/pid/smaps: show proportional swap share of the mapping We want to know the per-process workingset size for smart memory management on userland, and we use swap (e.g., zram) heavily to maximize memory efficiency, so the workingset includes swap as well as RSS. On such a system, with lots of shared anonymous memory (e.g., Android), it's really hard to figure out exactly how much memory (i.e., RSS + swap) each process consumes. This patch introduces a SwapPss field in /proc/<pid>/smaps so we can get a more exact workingset size per process. Bongkyu tested it. The result is below. 1. 50M used swap SwapTotal: 461976 kB SwapFree: 411192 kB $ adb shell cat /proc/*/smaps | grep "SwapPss:" | awk '{sum += $2} END {print sum}'; 48236 $ adb shell cat /proc/*/smaps | grep "Swap:" | awk '{sum += $2} END {print sum}'; 141184 2. 
240M used swap SwapTotal: 461976 kB SwapFree: 216808 kB $ adb shell cat /proc/*/smaps | grep "SwapPss:" | awk '{sum += $2} END {print sum}'; 230315 $ adb shell cat /proc/*/smaps | grep "Swap:" | awk '{sum += $2} END {print sum}'; 1387744 [akpm@linux-foundation.org: simplify kunmap_atomic() call] Signed-off-by: Minchan Kim Reported-by: Bongkyu Kim Tested-by: Bongkyu Kim Cc: Hugh Dickins Cc: Sergey Senozhatsky Cc: Jonathan Corbet Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 31496d201fdc..6282f1eb3d6a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -431,6 +431,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); @@ -522,6 +523,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int swp_swapcount(swp_entry_t entry) +{ + return 0; +} + #define reuse_swap_page(page) (page_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) -- cgit v1.2.3 From 28c015d07507e164d93b33498b4e482ff81c0e9b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 8 Sep 2015 15:00:31 -0700 Subject: mm: improve __GFP_NORETRY comment based on implementation Explicitly state that __GFP_NORETRY will attempt direct reclaim and memory compaction before returning NULL and that the oom killer is not called in the current implementation of the page allocator. [akpm@linux-foundation.org: s/has/have/] Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ad35f300b9a4..3bd64b115999 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -63,7 +63,10 @@ struct vm_area_struct; * but it is definitely preferable to use the flag rather than opencode endless * loop around allocator. * - * __GFP_NORETRY: The VM implementation must not retry indefinitely. + * __GFP_NORETRY: The VM implementation must not retry indefinitely and will + * return NULL when direct reclaim and memory compaction have failed to allow + * the allocation to succeed. The OOM killer is not called with the current + * implementation. * * __GFP_MOVABLE: Flag that this page will be movable by the page migration * mechanism or reclaimed -- cgit v1.2.3 From 6e0fc46dc2152d3e2d25a5d5b640ae3586c247c6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 8 Sep 2015 15:00:36 -0700 Subject: mm, oom: organize oom context into struct There are essential elements to an oom context that are passed around to multiple functions. Organize these elements into a new struct, struct oom_control, that specifies the context for an oom condition. This patch introduces no functional change. 
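For illustration, a sketch of what a converted call site looks like after this patch (an assumed allocator slow path; the field values are the ones such a path already has in hand):

	struct oom_control oc = {
		.zonelist	= zonelist,
		.nodemask	= nodemask,
		.gfp_mask	= gfp_mask,
		.order		= order,
		.force_kill	= false,
	};

	/* one pointer now travels through the oom-killer paths */
	if (out_of_memory(&oc))
		*did_some_progress = 1;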
Signed-off-by: David Rientjes Acked-by: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 7deecb7bca5e..cb29085ded37 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -12,6 +12,14 @@ struct notifier_block; struct mem_cgroup; struct task_struct; +struct oom_control { + struct zonelist *zonelist; + nodemask_t *nodemask; + gfp_t gfp_mask; + int order; + bool force_kill; +}; + /* * Types of limitations to the nodes from which allocations may occur */ @@ -57,21 +65,18 @@ extern unsigned long oom_badness(struct task_struct *p, extern int oom_kills_count(void); extern void note_oom_kill(void); -extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, +extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, nodemask_t *nodemask, - const char *message); + struct mem_cgroup *memcg, const char *message); -extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask, +extern void check_panic_on_oom(struct oom_control *oc, + enum oom_constraint constraint, struct mem_cgroup *memcg); -extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, - unsigned long totalpages, const nodemask_t *nodemask, - bool force_kill); +extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, + struct task_struct *task, unsigned long totalpages); -extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *mask, bool force_kill); +extern bool out_of_memory(struct oom_control *oc); extern void exit_oom_victim(void); -- cgit v1.2.3 From 54e9e29132d7caefcad470281cae06ac34a982c8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 8 Sep 2015 15:00:39 -0700 Subject: mm, oom: pass an oom order of -1 when triggered by sysrq The force_kill member of struct oom_control isn't needed if an order of -1 is used instead. This is the same as order == -1 in struct compact_control which requires full memory compaction. This patch introduces no functional change. Signed-off-by: David Rientjes Cc: Sergey Senozhatsky Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index cb29085ded37..8fb67b9e6110 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -17,7 +17,6 @@ struct oom_control { nodemask_t *nodemask; gfp_t gfp_mask; int order; - bool force_kill; }; /* -- cgit v1.2.3 From 8989e4c7d4e3c30b55c998a1138cd06c92df7295 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 8 Sep 2015 15:00:44 -0700 Subject: mm, oom: add description of struct oom_control Describe the purpose of struct oom_control and what each member does. Also make gfp_mask and order const since they are never manipulated or passed to functions that discard the qualifier. 
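To make the order == -1 convention concrete, a hedged sketch of a sysrq-style trigger (close to, but not necessarily identical to, the actual sysrq handler):

	struct oom_control oc = {
		.zonelist = node_zonelist(first_memory_node, GFP_KERNEL),
		.nodemask = NULL,
		.gfp_mask = GFP_KERNEL,
		.order    = -1,		/* "kill something" - replaces force_kill */
	};

	out_of_memory(&oc);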
Signed-off-by: David Rientjes Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 8fb67b9e6110..03e6257321f0 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -12,11 +12,25 @@ struct notifier_block; struct mem_cgroup; struct task_struct; +/* + * Details of the page allocation that triggered the oom killer that are used to + * determine what should be killed. + */ struct oom_control { + /* Used to determine cpuset */ struct zonelist *zonelist; - nodemask_t *nodemask; - gfp_t gfp_mask; - int order; + + /* Used to determine mempolicy */ + nodemask_t *nodemask; + + /* Used to determine cpuset and node locality requirement */ + const gfp_t gfp_mask; + + /* + * order == -1 means the oom kill is required by sysrq, otherwise only + * for display purposes. + */ + const int order; }; /* -- cgit v1.2.3
From 33398cf2f360c5ce24c8a22436d52a06ad4e5eb5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 8 Sep 2015 15:01:02 -0700 Subject: memcg: export struct mem_cgroup The mem_cgroup structure is defined in mm/memcontrol.c currently, which means that code outside of this file has to use the external API even for trivial access stuff. This patch exports mem_cgroup with its dependencies and makes some of the exported functions inlines. This even helps to reduce the code size a bit (make defconfig + CONFIG_MEMCG=y) text data bss dec hex filename 12355346 1823792 1089536 15268674 e8fb42 vmlinux.before 12354970 1823792 1089536 15268298 e8f9ca vmlinux.after This is not much (370B) but better than nothing. We also save a function call in some hot paths like callers of mem_cgroup_count_vm_event which is used for accounting. The patch doesn't introduce any functional changes. [vdavydov@parallels.com: inline memcg_kmem_is_active] [vdavydov@parallels.com: do not expose type outside of CONFIG_MEMCG] [akpm@linux-foundation.org: memcontrol.h needs eventfd.h for eventfd_ctx] [akpm@linux-foundation.org: export mem_cgroup_from_task() to modules] Signed-off-by: Michal Hocko Reviewed-by: Vladimir Davydov Suggested-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 370 +++++++++++++++++++++++++++++++++++++++++---- include/linux/swap.h | 10 +- include/net/sock.h | 28 ---- 3 files changed, 346 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 73b02b0a8f60..ab2f6880e27b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -23,6 +23,11 @@ #include #include #include +#include <linux/page_counter.h> +#include <linux/vmpressure.h> +#include <linux/eventfd.h> +#include <linux/mmzone.h> +#include <linux/writeback.h> struct mem_cgroup; struct page; @@ -67,12 +72,221 @@ enum mem_cgroup_events_index { MEMCG_NR_EVENTS, }; +/* + * Per memcg event counter is incremented at every pagein/pageout. With THP, + * it will be incremated by the number of pages. This counter is used for + * for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. 
+ */ +enum mem_cgroup_events_target { + MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_TARGET_NUMAINFO, + MEM_CGROUP_NTARGETS, +}; + +/* + * Bits in struct cg_proto.flags + */ +enum cg_proto_flags { + /* Currently active and new sockets should be assigned to cgroups */ + MEMCG_SOCK_ACTIVE, + /* It was ever activated; we must disarm static keys on destruction */ + MEMCG_SOCK_ACTIVATED, +}; + +struct cg_proto { + struct page_counter memory_allocated; /* Current allocated memory. */ + struct percpu_counter sockets_allocated; /* Current number of sockets. */ + int memory_pressure; + long sysctl_mem[3]; + unsigned long flags; + /* + * memcg field is used to find which memcg we belong directly + * Each memcg struct can hold more than one cg_proto, so container_of + * won't really cut. + * + * The elegant solution would be having an inverse function to + * proto_cgroup in struct proto, but that means polluting the structure + * for everybody, instead of just for memcg users. + */ + struct mem_cgroup *memcg; +}; + #ifdef CONFIG_MEMCG +struct mem_cgroup_stat_cpu { + long count[MEM_CGROUP_STAT_NSTATS]; + unsigned long events[MEMCG_NR_EVENTS]; + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct mem_cgroup_reclaim_iter { + struct mem_cgroup *position; + /* scan generation, increased every round-trip */ + unsigned int generation; +}; + +/* + * per-zone information in memory controller. + */ +struct mem_cgroup_per_zone { + struct lruvec lruvec; + unsigned long lru_size[NR_LRU_LISTS]; + + struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; + + struct rb_node tree_node; /* RB tree node */ + unsigned long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; + struct mem_cgroup *memcg; /* Back pointer, we cannot */ + /* use container_of */ +}; + +struct mem_cgroup_per_node { + struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; +}; + +struct mem_cgroup_threshold { + struct eventfd_ctx *eventfd; + unsigned long threshold; +}; + +/* For threshold */ +struct mem_cgroup_threshold_ary { + /* An array index points to threshold just below or equal to usage. */ + int current_threshold; + /* Size of entries[] */ + unsigned int size; + /* Array of thresholds */ + struct mem_cgroup_threshold entries[0]; +}; + +struct mem_cgroup_thresholds { + /* Primary thresholds array */ + struct mem_cgroup_threshold_ary *primary; + /* + * Spare threshold array. + * This is needed to make mem_cgroup_unregister_event() "never fail". + * It must be able to store at least primary->size - 1 entries. + */ + struct mem_cgroup_threshold_ary *spare; +}; + +/* + * The memory controller data structure. The memory controller controls both + * page cache and RSS per cgroup. We would eventually like to provide + * statistics based on the statistics developed by Rik Van Riel for clock-pro, + * to help the administrator determine what knobs to tune. + */ +struct mem_cgroup { + struct cgroup_subsys_state css; + + /* Accounted resources */ + struct page_counter memory; + struct page_counter memsw; + struct page_counter kmem; + + /* Normal memory consumption range */ + unsigned long low; + unsigned long high; + + unsigned long soft_limit; + + /* vmpressure notifications */ + struct vmpressure vmpressure; + + /* css_online() has been completed */ + int initialized; + + /* + * Should the accounting and control be hierarchical, per subtree? 
+ */ + bool use_hierarchy; + + /* protected by memcg_oom_lock */ + bool oom_lock; + int under_oom; + + int swappiness; + /* OOM-Killer disable */ + int oom_kill_disable; + + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_thresholds thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; + + /* + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + struct task_struct *move_lock_task; + unsigned long move_lock_flags; + /* + * percpu counter. + */ + struct mem_cgroup_stat_cpu __percpu *stat; + spinlock_t pcp_counter_lock; + +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) + struct cg_proto tcp_mem; +#endif +#if defined(CONFIG_MEMCG_KMEM) + /* Index in the kmem_cache->memcg_params.memcg_caches array */ + int kmemcg_id; + bool kmem_acct_activated; + bool kmem_acct_active; +#endif + + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + atomic_t numainfo_events; + atomic_t numainfo_updating; +#endif + +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; + struct wb_domain cgwb_domain; +#endif + + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ +}; extern struct cgroup_subsys_state *mem_cgroup_root_css; -void mem_cgroup_events(struct mem_cgroup *memcg, +/** + * mem_cgroup_events - count memory events against a cgroup + * @memcg: the memory cgroup + * @idx: the event index + * @nr: the number of events to account for + */ +static inline void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, - unsigned int nr); + unsigned int nr) +{ + this_cpu_add(memcg->stat->events[idx], nr); +} bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); @@ -90,15 +304,31 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); -bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, - struct mem_cgroup *root); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); -extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); +static inline +struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ + return css ? 
container_of(css, struct mem_cgroup, css) : NULL; +} + +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, + struct mem_cgroup *, + struct mem_cgroup_reclaim_cookie *); +void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); + +static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, + struct mem_cgroup *root) +{ + if (root == memcg) + return true; + if (!root->use_hierarchy) + return false; + return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); +} static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) @@ -114,22 +344,65 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); -struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, - struct mem_cgroup *, - struct mem_cgroup_reclaim_cookie *); -void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); +static inline bool mem_cgroup_disabled(void) +{ + if (memory_cgrp_subsys.disabled) + return true; + return false; +} /* * For memory reclaim. */ -int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); -bool mem_cgroup_lruvec_online(struct lruvec *lruvec); int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); -unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); -void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); + +void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, + int nr_pages); + +static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return true; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + memcg = mz->memcg; + + return !!(memcg->css.flags & CSS_ONLINE); +} + +static inline +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +{ + struct mem_cgroup_per_zone *mz; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + return mz->lru_size[lru]; +} + +static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) +{ + unsigned long inactive_ratio; + unsigned long inactive; + unsigned long active; + unsigned long gb; + + inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); + active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); + + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + + return inactive * inactive_ratio < active; +} + extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -156,18 +429,26 @@ bool mem_cgroup_oom_synchronize(bool wait); extern int do_swap_account; #endif -static inline bool mem_cgroup_disabled(void) -{ - if (memory_cgrp_subsys.disabled) - return true; - return false; -} - struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page); -void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx, int val); void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); +/** + * mem_cgroup_update_page_stat - update page state statistics + * @memcg: memcg to account against + * @idx: page state item to account + * @val: number of pages (positive or negative) + * + * See mem_cgroup_begin_page_stat() for locking requirements. 
+ */ +static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx, int val) +{ + VM_BUG_ON(!rcu_read_lock_held()); + + if (memcg) + this_cpu_add(memcg->stat->count[idx], val); +} + static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { @@ -184,13 +465,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, unsigned long *total_scanned); -void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { + struct mem_cgroup *memcg; + if (mem_cgroup_disabled()) return; - __mem_cgroup_count_vm_event(mm, idx); + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) + goto out; + + switch (idx) { + case PGFAULT: + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); + break; + case PGMAJFAULT: + this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); + break; + default: + BUG(); + } +out: + rcu_read_unlock(); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE void mem_cgroup_split_huge_fixup(struct page *head); @@ -275,12 +574,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } -static inline struct cgroup_subsys_state - *mem_cgroup_css(struct mem_cgroup *memcg) -{ - return NULL; -} - static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -444,7 +737,10 @@ static inline bool memcg_kmem_enabled(void) return static_key_false(&memcg_kmem_enabled_key); } -bool memcg_kmem_is_active(struct mem_cgroup *memcg); +static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return memcg->kmem_acct_active; +} /* * In general, we'll do everything in our power to not incur in any overhead @@ -463,7 +759,15 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order); void __memcg_kmem_uncharge_pages(struct page *page, int order); -int memcg_cache_id(struct mem_cgroup *memcg); +/* + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. + */ +static inline int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); diff --git a/include/linux/swap.h b/include/linux/swap.h index 6282f1eb3d6a..2ce190709280 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -351,7 +351,15 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages); extern int kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_MEMCG -extern int mem_cgroup_swappiness(struct mem_cgroup *mem); +static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) +{ + /* root ? 
*/ + if (mem_cgroup_disabled() || !memcg->css.parent) + return vm_swappiness; + + return memcg->swappiness; +} + #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { diff --git a/include/net/sock.h b/include/net/sock.h index 43c6abcf06ab..a98c71ea40c5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1042,34 +1042,6 @@ struct proto { #endif }; -/* - * Bits in struct cg_proto.flags - */ -enum cg_proto_flags { - /* Currently active and new sockets should be assigned to cgroups */ - MEMCG_SOCK_ACTIVE, - /* It was ever activated; we must disarm static keys on destruction */ - MEMCG_SOCK_ACTIVATED, -}; - -struct cg_proto { - struct page_counter memory_allocated; /* Current allocated memory. */ - struct percpu_counter sockets_allocated; /* Current number of sockets. */ - int memory_pressure; - long sysctl_mem[3]; - unsigned long flags; - /* - * memcg field is used to find which memcg we belong directly - * Each memcg struct can hold more than one cg_proto, so container_of - * won't really cut. - * - * The elegant solution would be having an inverse function to - * proto_cgroup in struct proto, but that means polluting the structure - * for everybody, instead of just for memcg users. - */ - struct mem_cgroup *memcg; -}; - int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); -- cgit v1.2.3
From fabc3fdde00b54825ba23230aedbf88a735b4e49 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 8 Sep 2015 15:01:04 -0700 Subject: memcg: get rid of mem_cgroup_root_css for !CONFIG_MEMCG The only user is cgwb_bdi_init and that one depends on CONFIG_CGROUP_WRITEBACK which in turn depends on CONFIG_MEMCG, so it doesn't make much sense to define an empty stub for !CONFIG_MEMCG. Moreover ERR_PTR(-EINVAL) is ugly and would lead to runtime crashes if used in unguarded code paths. Better fail during compilation. Signed-off-by: Michal Hocko Reviewed-by: Vladimir Davydov Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ab2f6880e27b..91bbe4ba6233 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -498,8 +498,6 @@ void mem_cgroup_split_huge_fixup(struct page *head); #else /* CONFIG_MEMCG */ struct mem_cgroup; -#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) - static inline void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, unsigned int nr) -- cgit v1.2.3
From 64219994898c8689c3d57668996f476f8c2d398c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 8 Sep 2015 15:01:07 -0700 Subject: memcg: get rid of extern for functions in memcontrol.h Most of the exported functions in this header are not marked extern, so change the rest to follow the same style. 
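With struct mem_cgroup now visible in the header (see the export patch above), callers outside memcontrol.c can use the inline helpers directly; a sketch of a reclaim-side user (the wrapper function is hypothetical):

	/* no function call into memcontrol.c is emitted for this test */
	static bool inactive_anon_needs_aging(struct lruvec *lruvec)
	{
		if (!mem_cgroup_disabled())
			return mem_cgroup_inactive_anon_is_low(lruvec);
		return false;	/* global (non-memcg) case elided in this sketch */
	}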
Signed-off-by: Michal Hocko Reviewed-by: Vladimir Davydov Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 91bbe4ba6233..d92b80b63c5c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -306,10 +306,10 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); -extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); -extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); +struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); -extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); +struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -344,7 +344,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); +struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); static inline bool mem_cgroup_disabled(void) { @@ -403,8 +403,8 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } -extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, - struct task_struct *p); +void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, + struct task_struct *p); static inline void mem_cgroup_oom_enable(void) { @@ -719,8 +719,8 @@ static inline void sock_release_memcg(struct sock *sk) extern struct static_key memcg_kmem_enabled_key; extern int memcg_nr_cache_ids; -extern void memcg_get_cache_ids(void); -extern void memcg_put_cache_ids(void); +void memcg_get_cache_ids(void); +void memcg_put_cache_ids(void); /* * Helper macro to loop through all memcg-specific caches. Callers must still -- cgit v1.2.3 From e752eb68811aeece2220e183e23369a34122fb5e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 8 Sep 2015 15:01:16 -0700 Subject: memcg: move memcg_proto_active from sock.h The only user is sock_update_memcg which is living in memcontrol.c so it doesn't make much sense to pollute sock.h by this inline helper. Move it to memcontrol.c and open code it into its only caller. 
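The open-coded test in sock_update_memcg() then reads roughly as follows (a sketch; surrounding RCU and error handling elided):

	cg_proto = sk->sk_prot->proto_cgroup(memcg);
	if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
	    css_tryget_online(&memcg->css)) {
		sk->sk_cgrp = cg_proto;
	}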
Signed-off-by: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/sock.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index a98c71ea40c5..7aa78440559a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1045,11 +1045,6 @@ struct proto { int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); -static inline bool memcg_proto_active(struct cg_proto *cg_proto) -{ - return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); -} - #ifdef SOCK_REFCNT_DEBUG static inline void sk_refcnt_debug_inc(struct sock *sk) { -- cgit v1.2.3 From bb14c2c75db972a1bf65fd63c8d5a0b41a8f263a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 8 Sep 2015 15:01:25 -0700 Subject: mm: rename and move get/set_freepage_migratetype The pair of get/set_freepage_migratetype() functions are used to cache pageblock migratetype for a page put on a pcplist, so that it does not have to be retrieved again when the page is put on a free list (e.g. when pcplists become full). Historically it was also assumed that the value is accurate for pages on freelists (as the functions' names unfortunately suggest), but that cannot be guaranteed without affecting various allocator fast paths. It is in fact not needed and all such uses have been removed. The last remaining (but pointless) usage related to pages of freelists is in move_freepages(), which this patch removes. To prevent further confusion, rename the functions to get/set_pcppage_migratetype() and expand their description. Since all the users are now in mm/page_alloc.c, move the functions there from the shared header. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Joonsoo Kim Cc: Minchan Kim Acked-by: Michal Nazarewicz Cc: Laura Abbott Reviewed-by: Naoya Horiguchi Cc: Seungho Park Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ec72ef1f04a..bab8ff89da50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -309,18 +309,6 @@ struct inode; #define page_private(page) ((page)->private) #define set_page_private(page, v) ((page)->private = (v)) -/* It's valid only if the page is free path or free_list */ -static inline void set_freepage_migratetype(struct page *page, int migratetype) -{ - page->index = migratetype; -} - -/* It's valid only if the page is free path or free_list */ -static inline int get_freepage_migratetype(struct page *page) -{ - return page->index; -} - /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) -- cgit v1.2.3 From 5e9113731a3ce616e8b5aa128ffc1aeaa4942571 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:28 -0700 Subject: mm/hugetlb: add cache of descriptors to resv_map for region_add hugetlbfs is used today by applications that want a high degree of control over huge page usage. Often, large hugetlbfs files are used to map a large number huge pages into the application processes. The applications know when page ranges within these large files will no longer be used, and ideally would like to release them back to the subpool or global pools for other uses. 
The fallocate() system call provides an interface for preallocation and hole punching within files. This patch set adds fallocate functionality to hugetlbfs. fallocate hole punch will want to remove a specific range of pages. When pages are removed, their associated entries in the region/reserve map will also be removed. This will break an assumption in the region_chg/region_add calling sequence. If a new region descriptor must be allocated, it is done as part of the region_chg processing. In this way, region_add can not fail because it does not need to attempt an allocation. To prepare for fallocate hole punch, create a "cache" of descriptors that can be used by region_add if necessary. region_chg will ensure there are sufficient entries in the cache. It will be necessary to track the number of in progress add operations to know a sufficient number of descriptors reside in the cache. A new routine region_abort is added to adjust this in progress count when add operations are aborted. vma_abort_reservation is also added for callers creating reservations with vma_needs_reservation/vma_commit_reservation. [akpm@linux-foundation.org: fix typo in comment, use more cols] Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d891f949466a..e2d94960b38b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -35,6 +35,9 @@ struct resv_map { struct kref refs; spinlock_t lock; struct list_head regions; + long adds_in_progress; + struct list_head region_cache; + long region_cache_count; }; extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); -- cgit v1.2.3 From c672c7f29f2fdb73e1f72911bf499675c81fcdbb Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:35 -0700 Subject: mm/hugetlb: expose hugetlb fault mutex for use by fallocate hugetlb page faults are currently synchronized by the table of mutexes (htlb_fault_mutex_table). fallocate code will need to synchronize with the page fault code when it allocates or deletes pages. Expose interfaces so that fallocate operations can be synchronized with page faults. Minor name changes to be more consistent with other global hugetlb symbols. 
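The synchronization pattern this exposes looks like the following sketch (h, mm, vma, mapping, idx and address as a fault or fallocate path already has them):

	u32 hash;

	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);

	/* ... allocate or remove the page at idx, race-free vs. faults ... */

	mutex_unlock(&hugetlb_fault_mutex_table[hash]);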
Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e2d94960b38b..530cf6fc24c7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -88,6 +88,11 @@ int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); void free_huge_page(struct page *page); +extern struct mutex *hugetlb_fault_mutex_table; +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -- cgit v1.2.3 From b5cec28d36f5ee6b4e6f68a0a40aa1e4045d6d99 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:41 -0700 Subject: hugetlbfs: truncate_hugepages() takes a range of pages Modify truncate_hugepages() to take a range of pages (start, end) instead of simply start. If an end value of LLONG_MAX is passed, the current "truncate" functionality is maintained. Existing callers are modified to pass LLONG_MAX as end of range. By keying off end == LLONG_MAX, the routine behaves differently for truncate and hole punch. Page removal is now synchronized with page allocation via faults by using the fault mutex table. The hole punch case can experience the rare region_del error and must handle accordingly. Add the routine hugetlb_fix_reserve_counts to fix up reserve counts in the case where region_del returns an error. Since the routine handles more than just the truncate case, it is renamed to remove_inode_hugepages(). To be consistent, the routine truncate_huge_page() is renamed remove_huge_page(). Downstream of remove_inode_hugepages(), the routine hugetlb_unreserve_pages() is also modified to take a range of pages. hugetlb_unreserve_pages is modified to detect an error from region_del and pass it back to the caller. 
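The two call patterns for the renamed remove_inode_hugepages() then look roughly like this (a sketch; the helper itself stays file-local to fs/hugetlbfs/inode.c):

	/* truncate: drop everything from new_size to end of file */
	remove_inode_hugepages(inode, new_size, LLONG_MAX);

	/* hole punch: drop only [hole_start, hole_end) */
	remove_inode_hugepages(inode, hole_start, hole_end);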
Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 530cf6fc24c7..35afca1692fb 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -83,11 +83,13 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); -void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); +long hugetlb_unreserve_pages(struct inode *inode, long start, long end, + long freed); int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); void free_huge_page(struct page *page); +void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address); -- cgit v1.2.3
From ab76ad540a50191308e5bb6b5e2d9e26c78616d3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:50 -0700 Subject: hugetlbfs: New huge_add_to_page_cache helper routine Currently, there is only a single place where hugetlbfs pages are added to the page cache. The new fallocate code will be adding a second one, so break the functionality out into its own helper. Signed-off-by: Dave Hansen Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 35afca1692fb..1222fb07a746 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -333,6 +333,8 @@ struct huge_bootmem_page { struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); +int huge_add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t idx); /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); -- cgit v1.2.3
From 70c3547e36f5c9fbc4caecfeca98f0effa6932c5 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:54 -0700 Subject: hugetlbfs: add hugetlbfs_fallocate() This is based on the shmem version, but it has diverged quite a bit. We have no swap to worry about, nor the new file sealing. Add synchronization via the fault mutex table to coordinate page faults, fallocate allocation and fallocate hole punch. What this allows us to do is move physical memory in and out of a hugetlbfs file without having it mapped. This also gives us the ability to support MADV_REMOVE since it is currently implemented using fallocate(). MADV_REMOVE lets madvise() remove pages from the middle of a hugetlbfs file, which wasn't possible before. hugetlbfs fallocate only operates on whole huge pages. Based on code by Dave Hansen. 
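From userspace the result is the ordinary fallocate() interface; a minimal sketch (the mount point and huge page size are illustrative, and offset/length must be huge-page aligned):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>

	int main(void)
	{
		int fd = open("/mnt/huge/data", O_CREAT | O_RDWR, 0644);
		long hps = 2L << 20;	/* assuming 2MB huge pages */

		/* preallocate four huge pages without mapping the file */
		fallocate(fd, 0, 0, 4 * hps);

		/* punch out the second page; KEEP_SIZE is required with PUNCH_HOLE */
		fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, hps, hps);
		return 0;
	}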
Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1222fb07a746..5e35379f58a5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -330,6 +330,8 @@ struct huge_bootmem_page { #endif }; +struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); @@ -483,6 +485,7 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +#define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_noerr(v, a, r) NULL #define alloc_bootmem_huge_page(h) NULL -- cgit v1.2.3 From c5c5c9d1008fb15945d0173b3ca75931ef53ae1f Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 8 Sep 2015 15:02:00 -0700 Subject: mm/memblock.c: make memblock_overlaps_region() return bool. memblock_overlaps_region() checks if the given memblock region intersects a region in memblock. If so, it returns the index of the intersected region. But its only caller is memblock_is_region_reserved(), and it returns 0 if false, non-zero if true. Both of these should return bool. Signed-off-by: Tang Chen Cc: Thomas Gleixner Cc: Tejun Heo Cc: Yasuaki Ishimatsu Cc: Luiz Capitulino Cc: Xishi Qiu Cc: Will Deacon Cc: Vladimir Murzin Cc: Fabian Frederick Cc: Alexander Kuleshov Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index cc4b01972060..d312ae3b51fc 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -323,7 +323,7 @@ void memblock_enforce_memory_limit(phys_addr_t memory_limit); int memblock_is_memory(phys_addr_t addr); int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); int memblock_is_reserved(phys_addr_t addr); -int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); +bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); extern void __memblock_dump_all(void); -- cgit v1.2.3 From 95cf82ecc1fcb44df1768162343cc8eb88083b86 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 8 Sep 2015 15:02:03 -0700 Subject: mem-hotplug: handle node hole when initializing numa_meminfo. When parsing SRAT, all memory ranges are added into numa_meminfo. In numa_init(), before entering numa_cleanup_meminfo(), all possible memory ranges are in numa_meminfo. And numa_cleanup_meminfo() removes all ranges that are over max_pfn or empty. But this only works if the nodes are contiguous.
Let's have a look at the following example. We have an SRAT like this:

 SRAT: Node 0 PXM 0 [mem 0x00000000-0x5fffffff]
 SRAT: Node 0 PXM 0 [mem 0x100000000-0x1ffffffffff]
 SRAT: Node 1 PXM 1 [mem 0x20000000000-0x3ffffffffff]
 SRAT: Node 4 PXM 2 [mem 0x40000000000-0x5ffffffffff] hotplug
 SRAT: Node 5 PXM 3 [mem 0x60000000000-0x7ffffffffff] hotplug
 SRAT: Node 2 PXM 4 [mem 0x80000000000-0x9ffffffffff] hotplug
 SRAT: Node 3 PXM 5 [mem 0xa0000000000-0xbffffffffff] hotplug
 SRAT: Node 6 PXM 6 [mem 0xc0000000000-0xdffffffffff] hotplug
 SRAT: Node 7 PXM 7 [mem 0xe0000000000-0xfffffffffff] hotplug

On boot, only nodes 0, 1, 2 and 3 exist. And the numa_meminfo will look like this:

 numa_meminfo.nr_blks = 9
 1. on node 0: [0, 60000000]
 2. on node 0: [100000000, 20000000000]
 3. on node 1: [20000000000, 40000000000]
 4. on node 4: [40000000000, 60000000000]
 5. on node 5: [60000000000, 80000000000]
 6. on node 2: [80000000000, a0000000000]
 7. on node 3: [a0000000000, a0800000000]
 8. on node 6: [c0000000000, e0000000000]
 9. on node 7: [e0000000000, 100000000000]

And numa_cleanup_meminfo() will merge 1 and 2, and remove 8 and 9 because their end addresses are over max_pfn, which is a0800000000. But 4 and 5 are not removed because their end addresses are less than max_pfn. But in fact, nodes 4 and 5 don't exist. In short, numa_cleanup_meminfo() is not able to handle holes between nodes. Since the memory ranges on nodes 4 and 5 are in numa_meminfo, numa_register_memblks() will mistakenly set nodes 4 and 5 online. If you run lscpu, it will show:

 NUMA node0 CPU(s): 0-14,128-142
 NUMA node1 CPU(s): 15-29,143-157
 NUMA node2 CPU(s):
 NUMA node3 CPU(s):
 NUMA node4 CPU(s): 62-76,190-204
 NUMA node5 CPU(s): 78-92,206-220

In this patch, we use memblock_overlaps_region() to check if ranges in numa_meminfo overlap with ranges in memblock.memory. Since memblock.memory contains all available memory at boot time, if a range overlaps, it exists; if not, it is removed from numa_meminfo. After this patch, lscpu will show:

 NUMA node0 CPU(s): 0-14,128-142
 NUMA node1 CPU(s): 15-29,143-157
 NUMA node4 CPU(s): 62-76,190-204
 NUMA node5 CPU(s): 78-92,206-220

Signed-off-by: Tang Chen Reviewed-by: Yasuaki Ishimatsu Cc: Thomas Gleixner Cc: Tejun Heo Cc: Luiz Capitulino Cc: Xishi Qiu Cc: Will Deacon Cc: Vladimir Murzin Cc: Fabian Frederick Cc: Alexander Kuleshov Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d312ae3b51fc..c518eb589260 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -77,6 +77,8 @@ int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_free(phys_addr_t base, phys_addr_t size); int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); +bool memblock_overlaps_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); -- cgit v1.2.3 From c5b4e1b02f2a0c2309ecd58a235a2f5ee4eb0074 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 8 Sep 2015 15:02:09 -0700 Subject: mm, page_isolation: make set/unset_migratetype_isolate() file-local Nowadays, set/unset_migratetype_isolate() are defined and used only in mm/page_isolation.c, so let's limit their scope to that file.
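A rough sketch of the numa_meminfo trimming described in the NUMA-hole patch above. The loop shape and the use of the existing x86 helper numa_remove_memblk_from() are assumptions; only the memblock_overlaps_region() call reflects this series:

	/* drop numa_meminfo ranges that no boot-time memory backs */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start))
			numa_remove_memblk_from(i--, mi);
	}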
Signed-off-by: Naoya Horiguchi Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 2dc1e1697b45..047d64706f2a 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -65,11 +65,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages); -/* - * Internal functions. Changes pageblock's migrate type. - */ -int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages); -void unset_migratetype_isolate(struct page *page, unsigned migratetype); struct page *alloc_migrate_target(struct page *page, unsigned long private, int **resultp); -- cgit v1.2.3 From 64b990d2957cb535fe1c17b9694d5d4f7de69962 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 8 Sep 2015 15:02:15 -0700 Subject: mm: drop __nocast from vm_flags_t definition __nocast does no good for vm_flags_t. It only produces useless sparse warnings. Let's drop it. Signed-off-by: Kirill A. Shutemov Cc: Oleg Nesterov Acked-by: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c8d0a73d64c4..3d6baa7d4534 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -235,7 +235,7 @@ struct page_frag_cache { bool pfmemalloc; }; -typedef unsigned long __nocast vm_flags_t; +typedef unsigned long vm_flags_t; /* * A region containing a mapping of a non-memory backed file under NOMMU -- cgit v1.2.3 From ad82362b2defd4adad87d8538617b2f51a4bf9c3 Mon Sep 17 00:00:00 2001 From: "Sean O. Stalley" Date: Tue, 8 Sep 2015 15:02:27 -0700 Subject: mm: add dma_pool_zalloc() call to DMA API Add a wrapper function for dma_pool_alloc() to get zeroed memory. Signed-off-by: Sean O. Stalley Cc: Vinod Koul Cc: Bjorn Helgaas Cc: Gilles Muller Cc: Nicolas Palix Cc: Michal Marek Cc: Sebastian Andrzej Siewior Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dmapool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h index e1043f79122f..53ba737505df 100644 --- a/include/linux/dmapool.h +++ b/include/linux/dmapool.h @@ -24,6 +24,12 @@ void dma_pool_destroy(struct dma_pool *pool); void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle); +static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags, + dma_addr_t *handle) +{ + return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle); +} + void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr); /* -- cgit v1.2.3 From 01a7fd337b2c2af97e9c55bb9406a222a2e209d3 Mon Sep 17 00:00:00 2001 From: "Sean O. Stalley" Date: Tue, 8 Sep 2015 15:02:30 -0700 Subject: pci: mm: add pci_pool_zalloc() call Add a wrapper function for pci_pool_alloc() to get zeroed memory. Signed-off-by: Sean O. 
Stalley Cc: Vinod Koul Cc: Bjorn Helgaas Cc: Gilles Muller Cc: Nicolas Palix Cc: Michal Marek Cc: Sebastian Andrzej Siewior Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1a64733c48c7..e90eb22de628 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1227,6 +1227,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, dma_pool_create(name, &pdev->dev, size, align, allocation) #define pci_pool_destroy(pool) dma_pool_destroy(pool) #define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle) +#define pci_pool_zalloc(pool, flags, handle) \ + dma_pool_zalloc(pool, flags, handle) #define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr) struct msix_entry { -- cgit v1.2.3 From 6b0f68e32ea8749ff7d4a66cd5761e915e48e59d Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Tue, 8 Sep 2015 15:03:01 -0700 Subject: mm: add utility for early copy from unmapped ram When booting an arm64 kernel with an initrd using UEFI/GRUB, use of mem= will likely cut off part or all of the initrd. This leaves it outside the kernel linear map, which leads to failure when unpacking. The x86 code has a similar need to relocate an initrd outside of mapped memory in some cases. The current x86 code uses early_memremap() to copy the original initrd from unmapped to mapped RAM. This patchset creates a generic copy_from_early_mem() utility based on that x86 code and has arm64 and x86 share it in their respective initrd relocation code. This patch (of 3): In some early boot circumstances, it may be necessary to copy from RAM outside the kernel linear mapping to mapped RAM. The need to relocate an initrd is one example in the x86 code. This patch creates a helper function based on current x86 code. Signed-off-by: Mark Salter Cc: Catalin Marinas Cc: Will Deacon Cc: Arnd Bergmann Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Russell King Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/early_ioremap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index a5de55c04fb2..e539f27ec51b 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -33,6 +33,12 @@ extern void early_ioremap_setup(void); */ extern void early_ioremap_reset(void); +/* + * Early copy from unmapped memory to kernel mapped memory. + */ +extern void copy_from_early_mem(void *dest, phys_addr_t src, + unsigned long size); + #else static inline void early_ioremap_init(void) { } static inline void early_ioremap_setup(void) { } -- cgit v1.2.3 From 94bf4ec84a84d3ab2513b4e681fd3d083328d76d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 8 Sep 2015 15:03:15 -0700 Subject: mm/hwpoison: introduce put_hwpoison_page to put refcount for memory error handling Introduce put_hwpoison_page() to drop the page refcount taken by get_hwpoison_page() during memory error handling.
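A hedged sketch of the intended pairing; the caller shape is assumed, only the two helpers come from this series:

	if (get_hwpoison_page(p)) {
		/* ... examine or isolate the poisoned page ... */
		put_hwpoison_page(p);	/* drop the ref taken above */
	}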
Signed-off-by: Wanpeng Li Suggested-by: Naoya Horiguchi Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index bab8ff89da50..11df1a8ea38b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2169,6 +2169,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int get_hwpoison_page(struct page *page); +extern void put_hwpoison_page(struct page *page); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -- cgit v1.2.3 From 8e30456b6c56029ecbb43b777519175e478adfbf Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 8 Sep 2015 15:03:24 -0700 Subject: mm/hwpoison: introduce num_poisoned_pages wrappers The num_poisoned_pages counter will be changed outside mm/memory-failure.c by a subsequent patch, so this patch prepares wrappers to manipulate it. Signed-off-by: Naoya Horiguchi Tested-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index cedf3d3c373f..ec04669f2a3b 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -164,6 +164,9 @@ static inline int is_write_migration_entry(swp_entry_t entry) #endif #ifdef CONFIG_MEMORY_FAILURE + +extern atomic_long_t num_poisoned_pages __read_mostly; + /* * Support for hardware poisoned pages */ @@ -177,6 +180,26 @@ static inline int is_hwpoison_entry(swp_entry_t entry) { return swp_type(entry) == SWP_HWPOISON; } + +static inline void num_poisoned_pages_inc(void) +{ + atomic_long_inc(&num_poisoned_pages); +} + +static inline void num_poisoned_pages_dec(void) +{ + atomic_long_dec(&num_poisoned_pages); +} + +static inline void num_poisoned_pages_add(long num) +{ + atomic_long_add(num, &num_poisoned_pages); +} + +static inline void num_poisoned_pages_sub(long num) +{ + atomic_long_sub(num, &num_poisoned_pages); +} #else static inline swp_entry_t make_hwpoison_entry(struct page *page) -- cgit v1.2.3 From da1b13ccfbebe0b9d69b5d61eff0a675e19e69a5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 8 Sep 2015 15:03:27 -0700 Subject: mm/hwpoison: fix race between soft_offline_page and unpoison_memory Wanpeng Li reported a race between soft_offline_page() and unpoison_memory(), which causes the following kernel panic:

 BUG: Bad page state in process bash pfn:97000
 page:ffffea00025c0000 count:0 mapcount:1 mapping: (null) index:0x7f4fdbe00
 flags: 0x1fffff80080048(uptodate|active|swapbacked)
 page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
 bad because of flags: flags: 0x40(active)
 Modules linked in: snd_hda_codec_hdmi i915 rpcsec_gss_krb5 nfsv4 dns_resolver bnep rfcomm nfsd bluetooth auth_rpcgss nfs_acl nfs rfkill lockd grace sunrpc i2c_algo_bit drm_kms_helper snd_hda_codec_realtek snd_hda_codec_generic drm snd_hda_intel fscache snd_hda_codec x86_pkg_temp_thermal coretemp kvm_intel snd_hda_core snd_hwdep kvm snd_pcm snd_seq_dummy snd_seq_oss crct10dif_pclmul snd_seq_midi crc32_pclmul snd_seq_midi_event ghash_clmulni_intel snd_rawmidi aesni_intel lrw gf128mul snd_seq glue_helper ablk_helper snd_seq_device cryptd fuse snd_timer dcdbas serio_raw mei_me parport_pc snd mei ppdev i2c_core video lp soundcore parport lpc_ich shpchp mfd_core ext4 mbcache jbd2 sd_mod e1000e ahci ptp libahci crc32c_intel libata pps_core
 CPU: 3 PID: 2211 Comm: bash Not tainted 4.2.0-rc5-mm1+ #45
 Hardware name: Dell Inc. OptiPlex 7020/0F5C5X, BIOS A03 01/08/2015
 Call Trace:
   dump_stack+0x48/0x5c
   bad_page+0xe6/0x140
   free_pages_prepare+0x2f9/0x320
   ? uncharge_list+0xdd/0x100
   free_hot_cold_page+0x40/0x170
   __put_single_page+0x20/0x30
   put_page+0x25/0x40
   unmap_and_move+0x1a6/0x1f0
   migrate_pages+0x100/0x1d0
   ? kill_procs+0x100/0x100
   ? unlock_page+0x6f/0x90
   __soft_offline_page+0x127/0x2a0
   soft_offline_page+0xa6/0x200

This race is explained below:

   CPU0                                CPU1

   soft_offline_page
     __soft_offline_page
       TestSetPageHWPoison
                                       unpoison_memory
                                         PageHWPoison check (true)
                                         TestClearPageHWPoison
                                         put_page -> release refcount held by
                                                     get_hwpoison_page in unpoison_memory
       put_page -> release refcount held by
                   isolate_lru_page in __soft_offline_page
       migrate_pages

The second put_page() releases the refcount held by isolate_lru_page(), which leads unmap_and_move() to release the last refcount of the page while its mapcount is still 1, since try_to_unmap() is not called when there is only one user mapping the page. Either way, the page refcount and mapcount still end up inconsistent if the page is mapped by multiple users. This race was introduced by commit 4491f71260 ("mm/memory-failure: set PageHWPoison before migrate_pages()"), which focuses on preventing the reuse of a successfully migrated page. Before this commit we prevented the reuse by changing the migratetype to MIGRATE_ISOLATE during soft offlining, which has the following problems, so simply reverting the commit is not the best option:

 1) it doesn't eliminate the reuse completely, because set_migratetype_isolate() can fail to set MIGRATE_ISOLATE on the target page if the pageblock of the page contains one or more unmovable pages (i.e. has_unmovable_pages() returns true).
 2) the original code changes migratetype to MIGRATE_ISOLATE forcibly, and sets it to MIGRATE_MOVABLE forcibly after soft offline, regardless of the original migratetype state, which could impact other subsystems like memory hotplug or compaction.

This patch moves the SetPageHWPoison call to just after put_page() in unmap_and_move(), which closes the reported race window and minimizes another race window, between SetPageHWPoison and reallocation (which causes the reuse of a soft-offlined page). The latter window still exists, but that is acceptable: it is rare, and even if it happens it is effectively the same as the ordinary "containment failure" case.
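A sketch of the reordering in unmap_and_move() that the description implies; the shape is inferred from the text and the new swapops.h helpers below, not quoted from the patch:

	/* release our last reference first, then mark the page poisoned */
	put_page(page);
	if (reason == MR_MEMORY_FAILURE) {
		if (!test_set_page_hwpoison(page))
			num_poisoned_pages_inc();
	}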
Fixes: 4491f71260 ("mm/memory-failure: set PageHWPoison before migrate_pages()") Signed-off-by: Wanpeng Li Signed-off-by: Naoya Horiguchi Reported-by: Wanpeng Li Tested-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index ec04669f2a3b..5c3a5f3e7eec 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -181,6 +181,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline bool test_set_page_hwpoison(struct page *page) +{ + return TestSetPageHWPoison(page); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); @@ -211,6 +216,15 @@ static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } + +static inline bool test_set_page_hwpoison(struct page *page) +{ + return false; +} + +static inline void num_poisoned_pages_inc(void) +{ +} #endif #if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) -- cgit v1.2.3 From 96db800f5d73cd5c49461253d45766e094f0f8c2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 8 Sep 2015 15:03:50 -0700 Subject: mm: rename alloc_pages_exact_node() to __alloc_pages_node() alloc_pages_exact_node() was introduced in commit 6484eb3e2a81 ("page allocator: do not check NUMA node ID when the caller knows the node is valid") as an optimized variant of alloc_pages_node(), that doesn't fall back to the current node for nid == NUMA_NO_NODE. Unfortunately the name of the function can easily suggest that the allocation is restricted to the given node and fails otherwise. In truth, the node is only preferred, unless __GFP_THISNODE is passed among the gfp flags. The misleading name has led to mistakes in the past, see for example commits 5265047ac301 ("mm, thp: really limit transparent hugepage allocation to local node") and b360edb43f8e ("mm, mempolicy: migrate_to_node should only migrate to node"). Another issue with the name is that there's a family of alloc_pages_exact*() functions where 'exact' means exact size (instead of page order), which leads to more confusion. To prevent further mistakes, this patch effectively renames alloc_pages_exact_node() to __alloc_pages_node() to better convey that it's an optimized variant of alloc_pages_node() not intended for general usage. Both functions get described in comments. It has also been considered to really provide a convenience function for allocations restricted to a node, but the major opinion seems to be that __GFP_THISNODE already provides that functionality and we shouldn't duplicate the API needlessly. The number of users would be small anyway. Existing callers of alloc_pages_exact_node() are simply converted to call __alloc_pages_node(), with the exception of sba_alloc_coherent() which open-codes the check for NUMA_NO_NODE, so it is converted to use alloc_pages_node() instead. This means it no longer performs some VM_BUG_ON checks, and since the current check for nid in alloc_pages_node() uses a 'nid < 0' comparison (which includes NUMA_NO_NODE), it may hide wrong values which would previously have been exposed. Both differences will be rectified by the next patch. To sum up, this patch makes no functional changes, except temporarily hiding potentially buggy callers. Restricting the checks in alloc_pages_node() is left for the next patch which can in turn expose more existing buggy callers.
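A hedged illustration of how call sites split after the rename; nid and order are placeholders, and the valid-and-online guarantee in the first call is an assumption of the example:

	/* caller guarantees a valid, online node: */
	page = __alloc_pages_node(nid, GFP_KERNEL | __GFP_THISNODE, order);

	/* nid may legitimately be NUMA_NO_NODE; keep the general helper: */
	page = alloc_pages_node(nid, GFP_KERNEL, order);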
Signed-off-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Robin Holt Acked-by: Michal Hocko Acked-by: Christoph Lameter Acked-by: Michael Ellerman Cc: Mel Gorman Cc: David Rientjes Cc: Greg Thelen Cc: Aneesh Kumar K.V Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Naoya Horiguchi Cc: Tony Luck Cc: Fenghua Yu Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Cliff Whickman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3bd64b115999..d2c142bc872e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -303,20 +303,28 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); } -static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, - unsigned int order) +/* + * Allocate pages, preferring the node given as nid. The node must be valid and + * online. For more general interface, see alloc_pages_node(). + */ +static inline struct page * +__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - /* Unknown node is current node */ - if (nid < 0) - nid = numa_node_id(); + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)); return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } -static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, +/* + * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, + * prefer the current CPU's node. + */ +static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)); + /* Unknown node is current node */ + if (nid < 0) + nid = numa_node_id(); return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } @@ -357,7 +365,6 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask); void free_pages_exact(void *virt, size_t size); -/* This is different from alloc_pages_exact_node !!! */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); #define __get_free_page(gfp_mask) \ -- cgit v1.2.3 From 0bc35a970c01c50e3bcc4b5a612787346024e5db Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 8 Sep 2015 15:03:53 -0700 Subject: mm: unify checks in alloc_pages_node() and __alloc_pages_node() Perform the same debug checks in alloc_pages_node() as are done in __alloc_pages_node(), by making the former function a wrapper of the latter one. In addition to better diagnostics in DEBUG_VM builds for situations which have already been fatal (e.g. out-of-bounds node id), there are two visible changes for potential existing buggy callers of alloc_pages_node():

 - calling alloc_pages_node() with any negative nid (e.g. due to arithmetic overflow) was treated as passing NUMA_NO_NODE, and fallback to the local node was applied. This will now be fatal.
 - calling alloc_pages_node() with an offline node will now be checked in DEBUG_VM builds. Since it's not fatal if the node has been previously online, and this patch may expose some existing buggy callers, change the VM_BUG_ON in __alloc_pages_node() to VM_WARN_ON.
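For illustration, a hypothetical caller affected by the change; the variable and the off-by-one bug are invented for the example:

	int nid = cpu_to_node(cpu) - 1;	/* hypothetical off-by-one bug */

	/*
	 * Before: any negative nid silently fell back to the local node.
	 * Now: a negative nid other than NUMA_NO_NODE hits VM_BUG_ON, and
	 * an offline nid triggers VM_WARN_ON in DEBUG_VM builds.
	 */
	page = alloc_pages_node(nid, GFP_KERNEL, 0);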
Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Johannes Weiner Acked-by: Christoph Lameter Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d2c142bc872e..4a12cae2fb0c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -310,23 +310,23 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)); + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + VM_WARN_ON(!node_online(nid)); return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, - * prefer the current CPU's node. + * prefer the current CPU's node. Otherwise node must be valid and online. */ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - /* Unknown node is current node */ - if (nid < 0) + if (nid == NUMA_NO_NODE) nid = numa_node_id(); - return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); + return __alloc_pages_node(nid, gfp_mask, order); } #ifdef CONFIG_NUMA -- cgit v1.2.3 From 82c1fc714763b823169958a98196d9be56c63b30 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 8 Sep 2015 15:03:56 -0700 Subject: mm: use numa_mem_id() in alloc_pages_node() alloc_pages_node() might fail when called with NUMA_NO_NODE and __GFP_THISNODE on a CPU belonging to a memoryless node. To make the local-node fallback more robust and prevent such situations, use numa_mem_id(), which was introduced for similar scenarios in the slab context. Suggested-by: Christoph Lameter Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Mel Gorman Acked-by: Christoph Lameter Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4a12cae2fb0c..f92cbd2f4450 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -318,13 +318,14 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, - * prefer the current CPU's node. Otherwise node must be valid and online. + * prefer the current CPU's closest node. Otherwise node must be valid and + * online. */ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) - nid = numa_node_id(); + nid = numa_mem_id(); return __alloc_pages_node(nid, gfp_mask, order); } -- cgit v1.2.3 From 7d3f3938236b4bb878214e6791e76fd8409bdeee Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 8 Sep 2015 15:04:35 -0700 Subject: zsmalloc/zram: introduce zs_pool_stats api `zs_compact_control' accounts the number of migrated objects but it has a limited lifespan -- we lose it as soon as zs_compact() returns to zram. It worked fine, because (a) zram had its own counter of migrated objects and (b) only zram could trigger compaction. However, this does not work for automatic pool compaction (not issued by zram). To account for objects migrated during auto-compaction (issued by the shrinker) we need to store this number in zs_pool.
Define a new `struct zs_pool_stats' structure to keep zs_pool's stats there. It provides only `num_migrated', as of this writing, but it can surely be extended. A new zsmalloc symbol, zs_pool_stats(), exports zs_pool's stats back to the caller. Use zs_pool_stats() in zram and remove `num_migrated' from zram_stats. Signed-off-by: Sergey Senozhatsky Suggested-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zsmalloc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 1338190b5478..ad3d23239043 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -34,6 +34,11 @@ enum zs_mapmode { */ }; +struct zs_pool_stats { + /* How many objects were migrated */ + unsigned long num_migrated; +}; + struct zs_pool; struct zs_pool *zs_create_pool(char *name, gfp_t flags); @@ -49,4 +54,5 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle); unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); +void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); #endif -- cgit v1.2.3 From 860c707dca155a56dfa115ddd6c00959296144a6 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 8 Sep 2015 15:04:38 -0700 Subject: zsmalloc: account the number of compacted pages Compaction returns to zram the number of migrated objects, which is quite uninformative -- we have objects of different sizes so user space cannot obtain any valuable data from that number. Change compaction to operate in terms of pages and return to the compaction issuer the number of pages that were freed during compaction. So from now on we will export a more meaningful value in zram/mm_stat -- the number of freed (compacted) pages. This requires:

 (a) a rename of `num_migrated' to `pages_compacted'
 (b) an internal API change -- return first_page's fullness_group from putback_zspage(), so we know when putback_zspage() did free_zspage(). This helps us account compaction stats correctly.

Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zsmalloc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index ad3d23239043..6398dfae53f1 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -35,8 +35,8 @@ enum zs_mapmode { }; struct zs_pool_stats { - /* How many objects were migrated */ - unsigned long num_migrated; + /* How many pages were migrated (freed) */ + unsigned long pages_compacted; }; struct zs_pool; -- cgit v1.2.3 From 5b999aadbae65696a148f55250d94b6f3d74071e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Tue, 8 Sep 2015 15:05:00 -0700 Subject: mm: swap: zswap: maybe_preload & refactoring zswap_get_swap_cache_page and read_swap_cache_async have pretty much the same code, with the only significant differences being the return value and the usage of swap_readpage(). Add a helper, __read_swap_cache_async(), with the common code. Behavior change: zswap_get_swap_cache_page will now use radix_tree_maybe_preload instead of radix_tree_preload. It looks like this was left unchanged only because of the code duplication.
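A hedged sketch of how a caller such as zswap might use the new helper; the calling convention follows the declaration added below, but the surrounding logic is assumed and error handling is elided:

	bool page_was_allocated;
	struct page *page;

	page = __read_swap_cache_async(entry, GFP_KERNEL, vma, addr,
				       &page_was_allocated);
	if (page && page_was_allocated)
		swap_readpage(page);	/* the caller decides whether to start I/O */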
Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Michal Hocko Cc: Hugh Dickins Cc: Minchan Kim Cc: Tejun Heo Cc: Jens Axboe Cc: Christoph Hellwig Cc: David Herrmann Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2ce190709280..7ba7dccaf0e7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -406,6 +406,9 @@ extern void free_pages_and_swap_cache(struct page **, int); extern struct page *lookup_swap_cache(swp_entry_t); extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); +extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, + struct vm_area_struct *vma, unsigned long addr, + bool *new_page_allocated); extern struct page *swapin_readahead(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); -- cgit v1.2.3 From 786727799a85aeabc20cab5ecfb72771bcbd6b85 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 8 Sep 2015 15:05:03 -0700 Subject: mm: zpool: constify the zpool_ops The structure zpool_ops is not modified so make the pointer to it a pointer to const. Signed-off-by: Krzysztof Kozlowski Acked-by: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index d30eff3d84d5..c924a28d9805 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -37,7 +37,7 @@ enum zpool_mapmode { }; struct zpool *zpool_create_pool(char *type, char *name, - gfp_t gfp, struct zpool_ops *ops); + gfp_t gfp, const struct zpool_ops *ops); char *zpool_get_type(struct zpool *pool); @@ -81,7 +81,7 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops, + void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops, struct zpool *zpool); void (*destroy)(void *pool); -- cgit v1.2.3 From c83db4f419e7105af38cdcca80cc51213214a2c8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 8 Sep 2015 15:05:06 -0700 Subject: mm: zbud: constify the zbud_ops The structure zbud_ops is not modified so make the pointer to it a pointer to const. Signed-off-by: Krzysztof Kozlowski Acked-by: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zbud.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/zbud.h b/include/linux/zbud.h index f9d41a6e361f..e183a0a65ac1 100644 --- a/include/linux/zbud.h +++ b/include/linux/zbud.h @@ -9,7 +9,7 @@ struct zbud_ops { int (*evict)(struct zbud_pool *pool, unsigned long handle); }; -struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); +struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops); void zbud_destroy_pool(struct zbud_pool *pool); int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle); -- cgit v1.2.3
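With the constified ops, a pool's callback table can now live in read-only data. A minimal sketch against the zbud interface above; the evict callback name and its trivial body are invented for the example:

	static int my_evict(struct zbud_pool *pool, unsigned long handle)
	{
		return -EINVAL;		/* placeholder eviction policy */
	}

	static const struct zbud_ops my_zbud_ops = {
		.evict = my_evict,
	};

	pool = zbud_create_pool(GFP_KERNEL, &my_zbud_ops);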