From d281ee6145183594788ab6d5b55f8d144e69eace Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:16 -0800 Subject: rmap: add argument to charge compound page We're going to allow mapping of individual 4k pages of THP compound page. It means we cannot rely on PageTransHuge() check to decide if map/unmap small page or THP. The patch adds new argument to rmap functions to indicate whether we want to operate on whole compound page or only the small page. [n-horiguchi@ah.jp.nec.com: fix mapcount mismatch in hugepage migration] Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0669169716..060c7a0edfdf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -175,7 +175,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, goto unlock; get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr); + page_add_new_anon_rmap(kpage, vma, addr, false); mem_cgroup_commit_charge(kpage, memcg, false); lru_cache_add_active_or_unevictable(kpage, vma); @@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); pte_unmap_unlock(ptep, ptl); -- cgit v1.2.3 From f627c2f53786b0445abca47f6aa84c96a1fffec2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:20 -0800 Subject: memcg: adjust to support new THP refcounting As with rmap, with new refcounting we cannot rely on PageTransHuge() to check if we need to charge size of huge page form the cgroup. We need to get information from caller to know whether it was mapped with PMD or PTE. We do uncharge when last reference on the page gone. At that point if we see PageTransHuge() it means we need to unchange whole huge page. The tricky part is partial unmap -- when we try to unmap part of huge page. We don't do a special handing of this situation, meaning we don't uncharge the part of huge page unless last user is gone or split_huge_page() is triggered. In case of cgroup memory pressure happens the partial unmapped page will be split through shrinker. This should be good enough. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 060c7a0edfdf..0167679182c0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + false); if (err) return err; @@ -176,7 +177,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr, false); - mem_cgroup_commit_charge(kpage, memcg, false); + mem_cgroup_commit_charge(kpage, memcg, false, false); lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { @@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg); + mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; -- cgit v1.2.3 From 14d27abd1d12a64c89df1ce8c00ef1403226db5a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:00 -0800 Subject: futex, thp: remove special case for THP in get_futex_key With new THP refcounting, we don't need tricks to stabilize huge page. If we've got reference to tail page, it can't split under us. This patch effectively reverts a5b338f2b0b1 ("thp: update futex compound knowledge"). Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Tested-by: Artem Savkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 63 +++++++++++++--------------------------------------------- 1 file changed, 14 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8a310e240cda..eed92a8a4c49 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *page_head; + struct page *page; + struct address_space *mapping; int err, ro = 0; /* @@ -519,46 +520,9 @@ again: else err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - page_head = page; - if (unlikely(PageTail(page))) { - put_page(page); - /* serialize against __split_huge_page_splitting() */ - local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { - page_head = compound_head(page); - /* - * page_head is valid pointer but we must pin - * it before taking the PG_lock and/or - * PG_compound_lock. The moment we re-enable - * irqs __split_huge_page_splitting() can - * return and the head page can be freed from - * under us. We can't take the PG_lock and/or - * PG_compound_lock on a page that could be - * freed from under us. - */ - if (page != page_head) { - get_page(page_head); - put_page(page); - } - local_irq_enable(); - } else { - local_irq_enable(); - goto again; - } - } -#else - page_head = compound_head(page); - if (page != page_head) { - get_page(page_head); - put_page(page); - } -#endif - - lock_page(page_head); - + lock_page(page); /* - * If page_head->mapping is NULL, then it cannot be a PageAnon + * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast @@ -570,12 +534,13 @@ again: * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page_head->mapping. + * an unlikely race, but we do need to retry for page->mapping. */ - if (!page_head->mapping) { - int shmem_swizzled = PageSwapCache(page_head); - unlock_page(page_head); - put_page(page_head); + mapping = compound_head(page)->mapping; + if (!mapping) { + int shmem_swizzled = PageSwapCache(page); + unlock_page(page); + put_page(page); if (shmem_swizzled) goto again; return -EFAULT; @@ -588,7 +553,7 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page_head)) { + if (PageAnon(page)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -603,15 +568,15 @@ again: key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page_head->mapping->host; + key->shared.inode = mapping->host; key->shared.pgoff = basepage_index(page); } get_futex_key_refs(key); /* implies MB (B) */ out: - unlock_page(page_head); - put_page(page_head); + unlock_page(page); + put_page(page); return err; } -- cgit v1.2.3 From 34c0fd540e79fb49ef9ce864dae1058cca265780 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:14 -0800 Subject: mm, dax, pmem: introduce pfn_t For the purpose of communicating the optional presence of a 'struct page' for the pfn returned from ->direct_access(), introduce a type that encapsulates a page-frame-number plus flags. These flags contain the historical "page_link" encoding for a scatterlist entry, but can also denote "device memory". Where "device memory" is a set of pfns that are not part of the kernel's linear mapping by default, but are accessed via the same memory controller as ram. The motivation for this new type is large capacity persistent memory that needs struct page entries in the 'memmap' to support 3rd party DMA (i.e. O_DIRECT I/O with a persistent memory source/target). However, we also need it in support of maintaining a list of mapped inodes which need to be unmapped at driver teardown or freeze_bdev() time. Signed-off-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Hansen Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 7658d32c5c78..449cb6a5d9a1 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -12,6 +12,7 @@ */ #include #include +#include #include #include #include @@ -147,6 +148,12 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); +pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +{ + return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); +} +EXPORT_SYMBOL(phys_to_pfn_t); + #ifdef CONFIG_ZONE_DEVICE struct page_map { struct resource res; -- cgit v1.2.3 From 9476df7d80dfc425b37bfecf1d89edf8ec81fcb6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:19 -0800 Subject: mm: introduce find_dev_pagemap() There are several scenarios where we need to retrieve and update metadata associated with a given devm_memremap_pages() mapping, and the only lookup key available is a pfn in the range: 1/ We want to augment vmemmap_populate() (called via arch_add_memory()) to allocate memmap storage from pre-allocated pages reserved by the device driver. At vmemmap_alloc_block_buf() time it grabs device pages rather than page allocator pages. This is in support of devm_memremap_pages() mappings where the memmap is too large to fit in main memory (i.e. large persistent memory devices). 2/ Taking a reference against the mapping when inserting device pages into the address_space radix of a given inode. This facilitates unmap_mapping_range() and truncate_inode_pages() operations when the driver is tearing down the mapping. 3/ get_user_pages() operations on ZONE_DEVICE memory require taking a reference against the mapping so that the driver teardown path can revoke and drain usage of device pages. Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 449cb6a5d9a1..61cfbf4d3054 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -10,6 +10,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include +#include #include #include #include @@ -155,22 +157,57 @@ pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) EXPORT_SYMBOL(phys_to_pfn_t); #ifdef CONFIG_ZONE_DEVICE +static DEFINE_MUTEX(pgmap_lock); +static RADIX_TREE(pgmap_radix, GFP_KERNEL); +#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) +#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) + struct page_map { struct resource res; + struct percpu_ref *ref; + struct dev_pagemap pgmap; }; -static void devm_memremap_pages_release(struct device *dev, void *res) +static void pgmap_radix_release(struct resource *res) +{ + resource_size_t key; + + mutex_lock(&pgmap_lock); + for (key = res->start; key <= res->end; key += SECTION_SIZE) + radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&pgmap_lock); +} + +static void devm_memremap_pages_release(struct device *dev, void *data) { - struct page_map *page_map = res; + struct page_map *page_map = data; + struct resource *res = &page_map->res; + resource_size_t align_start, align_size; + + pgmap_radix_release(res); /* pages are dead and unused, undo the arch mapping */ - arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + arch_remove_memory(align_start, align_size); +} + +/* assumes rcu_read_lock() held at entry */ +struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +{ + struct page_map *page_map; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + return page_map ? &page_map->pgmap : NULL; } void *devm_memremap_pages(struct device *dev, struct resource *res) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); + resource_size_t key, align_start, align_size; struct page_map *page_map; int error, nid; @@ -190,18 +227,50 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) memcpy(&page_map->res, res, sizeof(*res)); + page_map->pgmap.dev = dev; + mutex_lock(&pgmap_lock); + error = 0; + for (key = res->start; key <= res->end; key += SECTION_SIZE) { + struct dev_pagemap *dup; + + rcu_read_lock(); + dup = find_dev_pagemap(key); + rcu_read_unlock(); + if (dup) { + dev_err(dev, "%s: %pr collides with mapping for %s\n", + __func__, res, dev_name(dup->dev)); + error = -EBUSY; + break; + } + error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, + page_map); + if (error) { + dev_err(dev, "%s: failed: %d\n", __func__, error); + break; + } + } + mutex_unlock(&pgmap_lock); + if (error) + goto err_radix; + nid = dev_to_node(dev); if (nid < 0) nid = numa_mem_id(); - error = arch_add_memory(nid, res->start, resource_size(res), true); - if (error) { - devres_free(page_map); - return ERR_PTR(error); - } + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + error = arch_add_memory(nid, align_start, align_size, true); + if (error) + goto err_add_memory; devres_add(dev, page_map); return __va(res->start); + + err_add_memory: + err_radix: + pgmap_radix_release(res); + devres_free(page_map); + return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); #endif /* CONFIG_ZONE_DEVICE */ -- cgit v1.2.3 From 4b94ffdc4163bae1ec73b6e977ffb7a7da3d06d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:22 -0800 Subject: x86, mm: introduce vmem_altmap to augment vmemmap_populate() In support of providing struct page for large persistent memory capacities, use struct vmem_altmap to change the default policy for allocating memory for the memmap array. The default vmemmap_populate() allocates page table storage area from the page allocator. Given persistent memory capacities relative to DRAM it may not be feasible to store the memmap in 'System Memory'. Instead vmem_altmap represents pre-allocated "device pages" to satisfy vmemmap_alloc_block_buf() requests. Signed-off-by: Dan Williams Reported-by: kbuild test robot Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 61cfbf4d3054..562f6471fe90 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -166,6 +166,7 @@ struct page_map { struct resource res; struct percpu_ref *ref; struct dev_pagemap pgmap; + struct vmem_altmap altmap; }; static void pgmap_radix_release(struct resource *res) @@ -183,6 +184,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) struct page_map *page_map = data; struct resource *res = &page_map->res; resource_size_t align_start, align_size; + struct dev_pagemap *pgmap = &page_map->pgmap; pgmap_radix_release(res); @@ -190,6 +192,8 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, + "%s: failed to free all reserved pages\n", __func__); } /* assumes rcu_read_lock() held at entry */ @@ -203,11 +207,23 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return page_map ? &page_map->pgmap : NULL; } -void *devm_memremap_pages(struct device *dev, struct resource *res) +/** + * devm_memremap_pages - remap and provide memmap backing for the given resource + * @dev: hosting device for @res + * @res: "host memory" address range + * @altmap: optional descriptor for allocating the memmap from @res + * + * Note, the expectation is that @res is a host memory range that could + * feasibly be treated as a "System RAM" range, i.e. not a device mmio + * range, but this is not enforced. + */ +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct vmem_altmap *altmap) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); resource_size_t key, align_start, align_size; + struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid; @@ -220,14 +236,27 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); + if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { + dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", + __func__); + return ERR_PTR(-ENXIO); + } + page_map = devres_alloc_node(devm_memremap_pages_release, sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); + pgmap = &page_map->pgmap; memcpy(&page_map->res, res, sizeof(*res)); - page_map->pgmap.dev = dev; + pgmap->dev = dev; + if (altmap) { + memcpy(&page_map->altmap, altmap, sizeof(*altmap)); + pgmap->altmap = &page_map->altmap; + } + pgmap->res = &page_map->res; + mutex_lock(&pgmap_lock); error = 0; for (key = res->start; key <= res->end; key += SECTION_SIZE) { @@ -273,4 +302,43 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + return altmap->reserve + altmap->free; +} + +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + /* + * 'memmap_start' is the virtual address for the first "struct + * page" in this range of the vmemmap array. In the case of + * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * pointer arithmetic, so we can perform this to_vmem_altmap() + * conversion without concern for the initialization state of + * the struct page fields. + */ + struct page *page = (struct page *) memmap_start; + struct dev_pagemap *pgmap; + + /* + * Uncoditionally retrieve a dev_pagemap associated with the + * given physical address, this is only for use in the + * arch_{add|remove}_memory() for setting up and tearing down + * the memmap. + */ + rcu_read_lock(); + pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + rcu_read_unlock(); + + return pgmap ? pgmap->altmap : NULL; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ -- cgit v1.2.3 From 5c2c2587b13235bf8b5c9027589f22eff68bdf49 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:49 -0800 Subject: mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup get_dev_page() enables paths like get_user_pages() to pin a dynamically mapped pfn-range (devm_memremap_pages()) while the resulting struct page objects are in use. Unlike get_page() it may fail if the device is, or is in the process of being, disabled. While the initial lookup of the range may be an expensive list walk, the result is cached to speed up subsequent lookups which are likely to be in the same mapped range. devm_memremap_pages() now requires a reference counter to be specified at init time. For pmem this means moving request_queue allocation into pmem_alloc() so the existing queue usage counter can track "device pages". ZONE_DEVICE pages always have an elevated count and will never be on an lru reclaim list. That space in 'struct page' can be redirected for other uses, but for safety introduce a poison value that will always trip __list_add() to assert. This allows half of the struct list_head storage to be reclaimed with some assurance to back up the assumption that the page count never goes to zero and a list_add() is never attempted. Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 562f6471fe90..3eb8944265d5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -179,6 +179,29 @@ static void pgmap_radix_release(struct resource *res) mutex_unlock(&pgmap_lock); } +static unsigned long pfn_first(struct page_map *page_map) +{ + struct dev_pagemap *pgmap = &page_map->pgmap; + const struct resource *res = &page_map->res; + struct vmem_altmap *altmap = pgmap->altmap; + unsigned long pfn; + + pfn = res->start >> PAGE_SHIFT; + if (altmap) + pfn += vmem_altmap_offset(altmap); + return pfn; +} + +static unsigned long pfn_end(struct page_map *page_map) +{ + const struct resource *res = &page_map->res; + + return (res->start + resource_size(res)) >> PAGE_SHIFT; +} + +#define for_each_device_pfn(pfn, map) \ + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + static void devm_memremap_pages_release(struct device *dev, void *data) { struct page_map *page_map = data; @@ -186,6 +209,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data) resource_size_t align_start, align_size; struct dev_pagemap *pgmap = &page_map->pgmap; + if (percpu_ref_tryget_live(pgmap->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(pgmap->ref); + } + pgmap_radix_release(res); /* pages are dead and unused, undo the arch mapping */ @@ -211,20 +239,26 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @res: "host memory" address range + * @ref: a live per-cpu reference count * @altmap: optional descriptor for allocating the memmap from @res * - * Note, the expectation is that @res is a host memory range that could - * feasibly be treated as a "System RAM" range, i.e. not a device mmio - * range, but this is not enforced. + * Notes: + * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time + * (or devm release event). + * + * 2/ @res is expected to be a host memory range that could feasibly be + * treated as a "System RAM" range, i.e. not a device mmio range, but + * this is not enforced. */ void *devm_memremap_pages(struct device *dev, struct resource *res, - struct vmem_altmap *altmap) + struct percpu_ref *ref, struct vmem_altmap *altmap) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); resource_size_t key, align_start, align_size; struct dev_pagemap *pgmap; struct page_map *page_map; + unsigned long pfn; int error, nid; if (is_ram == REGION_MIXED) { @@ -242,6 +276,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return ERR_PTR(-ENXIO); } + if (!ref) + return ERR_PTR(-EINVAL); + page_map = devres_alloc_node(devm_memremap_pages_release, sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) @@ -255,6 +292,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, memcpy(&page_map->altmap, altmap, sizeof(*altmap)); pgmap->altmap = &page_map->altmap; } + pgmap->ref = ref; pgmap->res = &page_map->res; mutex_lock(&pgmap_lock); @@ -292,6 +330,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_add_memory; + for_each_device_pfn(pfn, page_map) { + struct page *page = pfn_to_page(pfn); + + /* ZONE_DEVICE pages must never appear on a slab lru */ + list_force_poison(&page->lru); + page->pgmap = pgmap; + } devres_add(dev, page_map); return __va(res->start); -- cgit v1.2.3 From 3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:55 -0800 Subject: mm, x86: get_user_pages() for dax mappings A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver has established a devm_memremap_pages() mapping, i.e. when the pfn_t return from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when encountering _PAGE_DEVMAP during a page table walk we lookup and pin a struct dev_pagemap instance to keep the result of pfn_to_page() valid until put_page(). Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 3eb8944265d5..e517a16cb426 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -169,6 +169,18 @@ struct page_map { struct vmem_altmap altmap; }; +void get_zone_device_page(struct page *page) +{ + percpu_ref_get(page->pgmap->ref); +} +EXPORT_SYMBOL(get_zone_device_page); + +void put_zone_device_page(struct page *page) +{ + put_dev_pagemap(page->pgmap); +} +EXPORT_SYMBOL(put_zone_device_page); + static void pgmap_radix_release(struct resource *res) { resource_size_t key; -- cgit v1.2.3 From 4a9e1cda274893eca7d178d7dc265503ccb9d87a Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 15 Jan 2016 16:57:04 -0800 Subject: mm: bring in additional flag for fixup_user_fault to signal unlock During Jason's work with postcopy migration support for s390 a problem regarding gmap faults was discovered. The gmap code will call fixup_user_fault which will end up always in handle_mm_fault. Till now we never cared about retries, but as the userfaultfd code kind of relies on it. this needs some fix. This patchset does not take care of the futex code. I will now look closer at this. This patch (of 2): With the introduction of userfaultfd, kvm on s390 needs fixup_user_fault to pass in FAULT_FLAG_ALLOW_RETRY and give feedback if during the faulting we ever unlocked mmap_sem. This patch brings in the logic to handle retries as well as it cleans up the current documentation. fixup_user_fault was not having the same semantics as filemap_fault. It never indicated if a retry happened and so a caller wasn't able to handle that case. So we now changed the behaviour to always retry a locked mmap_sem. Signed-off-by: Dominik Dingel Reviewed-by: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Christian Borntraeger Cc: "Jason J. Herne" Cc: David Rientjes Cc: Eric B Munson Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Heiko Carstens Cc: Dominik Dingel Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index eed92a8a4c49..c6f514573b28 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -604,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) down_read(&mm->mmap_sem); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE, NULL); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; -- cgit v1.2.3 From b493c34309cb4aebc44272897067ebf54cb07271 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Jan 2016 16:58:16 -0800 Subject: kernel/stop_machine.c: remove CONFIG_SMP dependencies stop_machine.o is only built if CONFIG_SMP=y, so this ifdef always evaluates to true. [akpm@linux-foundation.org: remove now-unneeded ifdef] Reported-by: Valentin Rothberg Cc: Chris Wilson Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index edb6de4f5908..a467e6c28a3b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -529,8 +529,6 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) - static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { @@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; } - -#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 81cc26f2bd11ba4421a17a2d5cebe4bba206c239 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 15 Jan 2016 16:58:21 -0800 Subject: printk: only unregister boot consoles when necessary Boot consoles are typically replaced by proper consoles during the boot process. This can be problematic if the boot console data is part of the init section that is reclaimed late during boot. If the proper console does not register before this point in time, the boot console will need to be removed (so that the freed memory is not accessed), leaving the system without output for some time. There are various reasons why the proper console may not register early enough, such as deferred probe or the driver being a loadable module. If that happens, there is some amount of time where no console messages are visible to the user, which in turn can mean that they won't see crashes or other potentially useful information. To avoid this situation, only remove the boot console when it resides in the init section. Code exists to replace the boot console by the proper console when it is registered, keeping a seamless transition between the boot and proper consoles. Signed-off-by: Thierry Reding Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2ce8826f1053..0c9f02506169 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2658,13 +2659,36 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +/* + * Some boot consoles access data that is in the init section and which will + * be discarded after the initcalls have been run. To make sure that no code + * will access this data, unregister the boot consoles in a late initcall. + * + * If for some reason, such as deferred probe or the driver being a loadable + * module, the real console hasn't registered yet at this point, there will + * be a brief interval in which no messages are logged to the console, which + * makes it difficult to diagnose problems that occur during this time. + * + * To mitigate this problem somewhat, only unregister consoles whose memory + * intersects with the init section. Note that code exists elsewhere to get + * rid of the boot console as soon as the proper console shows up, so there + * won't be side-effects from postponing the removal. + */ static int __init printk_late_init(void) { struct console *con; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { - unregister_console(con); + /* + * Make sure to unregister boot consoles whose data + * resides in the init section before the init section + * is discarded. Boot consoles whose data will stick + * around will automatically be unregistered when the + * proper console replaces them. + */ + if (init_section_intersects(con, sizeof(*con))) + unregister_console(con); } } hotcpu_notifier(console_cpu_notify, 0); -- cgit v1.2.3 From 8d91f8b15361dfb438ab6eb3b319e2ded43458ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 15 Jan 2016 16:58:24 -0800 Subject: printk: do cond_resched() between lines while outputting to consoles @console_may_schedule tracks whether console_sem was acquired through lock or trylock. If the former, we're inside a sleepable context and console_conditional_schedule() performs cond_resched(). This allows console drivers which use console_lock for synchronization to yield while performing time-consuming operations such as scrolling. However, the actual console outputting is performed while holding irq-safe logbuf_lock, so console_unlock() clears @console_may_schedule before starting outputting lines. Also, only a few drivers call console_conditional_schedule() to begin with. This means that when a lot of lines need to be output by console_unlock(), for example on a console registration, the task doing console_unlock() may not yield for a long time on a non-preemptible kernel. If this happens with a slow console devices, for example a serial console, the outputting task may occupy the cpu for a very long time. Long enough to trigger softlockup and/or RCU stall warnings, which in turn pile more messages, sometimes enough to trigger the next cycle of warnings incapacitating the system. Fix it by making console_unlock() insert cond_resched() between lines if @console_may_schedule. Signed-off-by: Tejun Heo Reported-by: Calvin Owens Acked-by: Jan Kara Cc: Dave Jones Cc: Kyle McMartin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 +-- kernel/printk/printk.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b333380c6bb2..d96469de72dc 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -180,8 +180,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_trylock(); - console_unlock(); + console_flush_on_panic(); if (!panic_blink) panic_blink = no_blink; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0c9f02506169..08934a395c1a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2234,13 +2234,24 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool retry; + bool do_cond_resched, retry; if (console_suspended) { up_console_sem(); return; } + /* + * Console drivers are called under logbuf_lock, so + * @console_may_schedule should be cleared before; however, we may + * end up dumping a lot of lines, for example, if called from + * console registration path, and should invoke cond_resched() + * between lines if allowable. Not doing so can cause a very long + * scheduling stall on a slow console leading to RCU stall and + * softlockup warnings which exacerbate the issue with more + * messages practically incapacitating the system. + */ + do_cond_resched = console_may_schedule; console_may_schedule = 0; /* flush buffered message fragment immediately to console */ @@ -2312,6 +2323,9 @@ skip: call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); local_irq_restore(flags); + + if (do_cond_resched) + cond_resched(); } console_locked = 0; @@ -2379,6 +2393,25 @@ void console_unblank(void) console_unlock(); } +/** + * console_flush_on_panic - flush console content on panic + * + * Immediately output all pending messages no matter what. + */ +void console_flush_on_panic(void) +{ + /* + * If someone else is holding the console lock, trylock will fail + * and may_schedule may be set. Ignore and proceed to unlock so + * that messages are flushed out. As this can be called from any + * context and we don't want to get preempted while flushing, + * ensure may_schedule is cleared. + */ + console_trylock(); + console_may_schedule = 0; + console_unlock(); +} + /* * Return the console tty driver structure and its associated index */ -- cgit v1.2.3 From 06b031de22d28ae76b2e5bfaf22c56a265a1e106 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 15 Jan 2016 16:59:23 -0800 Subject: printk: change recursion_bug type to bool `recursion_bug' is used as recursion_bug toggle, so make it `bool'. Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 08934a395c1a..e79439134978 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1661,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static int recursion_bug; + static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; @@ -1697,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level, * it can be printed at the next appropriate moment: */ if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = 1; + recursion_bug = true; local_irq_restore(flags); return 0; } @@ -1712,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level, static const char recursion_msg[] = "BUG: recent printk recursion!"; - recursion_bug = 0; + recursion_bug = false; /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, -- cgit v1.2.3