diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-12 20:50:02 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-12 20:50:02 -0800 |
commit | e34bac726d27056081d0250c0e173e4b155aa340 (patch) | |
tree | 85607d0b3b185380fb3267866020c6a4372b9298 | |
parent | fe6bce8d30a86c693bf7cfbf4759cbafd121289f (diff) | |
parent | 39a0e975c37dee93fa1b8ea5f7eacd1c4c8a586e (diff) | |
download | linux-stable-e34bac726d27056081d0250c0e173e4b155aa340.tar.gz linux-stable-e34bac726d27056081d0250c0e173e4b155aa340.tar.bz2 linux-stable-e34bac726d27056081d0250c0e173e4b155aa340.zip |
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- various misc bits
- most of MM (quite a lot of MM material is awaiting the merge of
linux-next dependencies)
- kasan
- printk updates
- procfs updates
- MAINTAINERS
- /lib updates
- checkpatch updates
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (123 commits)
init: reduce rootwait polling interval time to 5ms
binfmt_elf: use vmalloc() for allocation of vma_filesz
checkpatch: don't emit unified-diff error for rename-only patches
checkpatch: don't check c99 types like uint8_t under tools
checkpatch: avoid multiple line dereferences
checkpatch: don't check .pl files, improve absolute path commit log test
scripts/checkpatch.pl: fix spelling
checkpatch: don't try to get maintained status when --no-tree is given
lib/ida: document locking requirements a bit better
lib/rbtree.c: fix typo in comment of ____rb_erase_color
lib/Kconfig.debug: make CONFIG_STRICT_DEVMEM depend on CONFIG_DEVMEM
MAINTAINERS: add drm and drm/i915 irc channels
MAINTAINERS: add "C:" for URI for chat where developers hang out
MAINTAINERS: add drm and drm/i915 bug filing info
MAINTAINERS: add "B:" for URI where to file bugs
get_maintainer: look for arbitrary letter prefixes in sections
printk: add Kconfig option to set default console loglevel
printk/sound: handle more message headers
printk/btrfs: handle more message headers
printk/kdb: handle more message headers
...
113 files changed, 1550 insertions, 1041 deletions
diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt index 3f1437fbca6b..280d283304bb 100644 --- a/Documentation/devicetree/booting-without-of.txt +++ b/Documentation/devicetree/booting-without-of.txt @@ -974,6 +974,13 @@ compatibility. 4Gb. Some vendors prefer splitting those ranges into smaller segments, but the kernel doesn't care. + Additional properties: + + - hotpluggable : The presence of this property provides an explicit + hint to the operating system that this memory may potentially be + removed later. The kernel can take this into consideration when + doing nonmovable allocations and when laying out memory zones. + e) The /chosen node This node is a bit "special". Normally, that's where Open Firmware diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 74329fd0add2..c03f2f91c6ab 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -191,6 +191,7 @@ read the file /proc/PID/status: CapPrm: 0000000000000000 CapEff: 0000000000000000 CapBnd: ffffffffffffffff + NoNewPrivs: 0 Seccomp: 0 voluntary_ctxt_switches: 0 nonvoluntary_ctxt_switches: 1 @@ -262,6 +263,7 @@ Table 1-2: Contents of the status files (as of 4.1) CapPrm bitmap of permitted capabilities CapEff bitmap of effective capabilities CapBnd bitmap of capabilities bounding set + NoNewPrivs no_new_privs, like prctl(PR_GET_NO_NEW_PRIV, ...) Seccomp seccomp mode, like prctl(PR_GET_SECCOMP, ...) Cpus_allowed mask of CPUs on which this process may run Cpus_allowed_list Same as previous, but in "list format" diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c5f1546a440f..6c6141c76eaa 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2397,7 +2397,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that the amount of memory usable for all allocations is not too small. - movable_node [KNL,X86] Boot-time switch to enable the effects + movable_node [KNL] Boot-time switch to enable the effects of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. MTD_Partition= [MTD] diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 2ec6adb5a4ce..c4171e4519c2 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -136,6 +136,11 @@ or enable it back by writing 1: echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page +Some userspace (such as a test program, or an optimized memory allocation +library) may want to know the size (in bytes) of a transparent hugepage: + +cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size + khugepaged will be automatically started when transparent_hugepage/enabled is set to "always" or "madvise, and it'll be automatically shutdown if it's set to "never". diff --git a/MAINTAINERS b/MAINTAINERS index 4e62a0e67df9..88315cfcfb39 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -74,6 +74,10 @@ Descriptions of section entries: These reviewers should be CCed on patches. L: Mailing list that is relevant to this area W: Web-page with status/info + B: URI for where to file bugs. A web-page with detailed bug + filing info, a direct bug tracker link, or a mailto: URI. + C: URI for chat protocol, server and channel where developers + usually hang out, for example irc://server/channel. Q: Patchwork web based patch tracking system site T: SCM tree type and location. Type is one of: git, hg, quilt, stgit, topgit @@ -4024,6 +4028,8 @@ DRM DRIVERS M: David Airlie <airlied@linux.ie> L: dri-devel@lists.freedesktop.org T: git git://people.freedesktop.org/~airlied/linux +B: https://bugs.freedesktop.org/ +C: irc://chat.freenode.net/dri-devel S: Maintained F: drivers/gpu/drm/ F: drivers/gpu/vga/ @@ -4076,6 +4082,8 @@ M: Jani Nikula <jani.nikula@linux.intel.com> L: intel-gfx@lists.freedesktop.org L: dri-devel@lists.freedesktop.org W: https://01.org/linuxgraphics/ +B: https://01.org/linuxgraphics/documentation/how-report-bugs +C: irc://chat.freenode.net/intel-gfx Q: http://patchwork.freedesktop.org/project/intel-gfx/ T: git git://anongit.freedesktop.org/drm-intel S: Supported diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 1e25cd80589e..3f2eb76243e3 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -186,6 +186,8 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr) tlb_add_flush(tlb, addr); } +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, @@ -211,18 +213,17 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { + tlb->pages[tlb->nr++] = page; + VM_WARN_ON(tlb->nr > tlb->max); if (tlb->nr == tlb->max) return true; - tlb->pages[tlb->nr++] = page; return false; } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (__tlb_remove_page(tlb, page)) { + if (__tlb_remove_page(tlb, page)) tlb_flush_mmu(tlb); - __tlb_remove_page(tlb, page); - } } static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, @@ -231,12 +232,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { @@ -284,5 +279,11 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr #define tlb_migrate_finish(mm) do { } while (0) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #endif /* CONFIG_MMU */ #endif diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 77e541cf0e5d..fced197b9626 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -207,15 +207,15 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) */ static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (tlb->nr == tlb->max) - return true; - tlb->need_flush = 1; if (!tlb->nr && tlb->pages == tlb->local) __tlb_alloc_page(tlb); tlb->pages[tlb->nr++] = page; + VM_WARN_ON(tlb->nr > tlb->max); + if (tlb->nr == tlb->max) + return true; return false; } @@ -236,10 +236,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (__tlb_remove_page(tlb, page)) { + if (__tlb_remove_page(tlb, page)) tlb_flush_mmu(tlb); - __tlb_remove_page(tlb, page); - } } static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, @@ -248,12 +246,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { @@ -283,6 +275,15 @@ do { \ __tlb_remove_tlb_entry(tlb, ptep, addr); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 3cc8498fe0fe..d227a6988d6b 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -34,7 +34,7 @@ config NO_IOPORT_MAP def_bool y config NO_DMA - def_bool y + def_bool n config HZ int diff --git a/arch/m32r/include/asm/device.h b/arch/m32r/include/asm/device.h index d8f9872b0e2d..4a9f35e0973f 100644 --- a/arch/m32r/include/asm/device.h +++ b/arch/m32r/include/asm/device.h @@ -3,5 +3,9 @@ * * This file is released under the GPLv2 */ -#include <asm-generic/device.h> +struct dev_archdata { + struct dma_map_ops *dma_ops; +}; +struct pdev_archdata { +}; diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h new file mode 100644 index 000000000000..2c43a77fe942 --- /dev/null +++ b/arch/m32r/include/asm/dma-mapping.h @@ -0,0 +1,32 @@ +#ifndef _ASM_M32R_DMA_MAPPING_H +#define _ASM_M32R_DMA_MAPPING_H + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/scatterlist.h> +#include <linux/dma-debug.h> +#include <linux/io.h> + +#define DMA_ERROR_CODE (~(dma_addr_t)0x0) + +static inline struct dma_map_ops *get_dma_ops(struct device *dev) +{ + if (dev && dev->archdata.dma_ops) + return dev->archdata.dma_ops; + return &dma_noop_ops; +} + +static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction direction) +{ +} + +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + if (!dev->dma_mask) + return false; + return addr + size - 1 <= *dev->dma_mask; +} + +#endif /* _ASM_M32R_DMA_MAPPING_H */ diff --git a/arch/m32r/platforms/m32700ut/setup.c b/arch/m32r/platforms/m32700ut/setup.c index 9a4ba8a8589d..349eb341752c 100644 --- a/arch/m32r/platforms/m32700ut/setup.c +++ b/arch/m32r/platforms/m32700ut/setup.c @@ -201,6 +201,7 @@ static struct irq_chip m32700ut_lanpld_irq_type = #define lcdpldirq2port(x) (unsigned long)((int)M32700UT_LCD_ICUCR1 + \ (((x) - 1) * sizeof(unsigned short))) +#ifdef CONFIG_USB static pld_icu_data_t lcdpld_icu_data[M32700UT_NUM_LCD_PLD_IRQ]; static void disable_m32700ut_lcdpld_irq(unsigned int irq) @@ -253,6 +254,7 @@ static struct irq_chip m32700ut_lcdpld_irq_type = .irq_mask = mask_m32700ut_lcdpld, .irq_unmask = unmask_m32700ut_lcdpld, }; +#endif void __init init_IRQ(void) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9fd77f8794a0..0ebfbc8f0449 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, - struct spinlock *old_pmd_ptl) + struct spinlock *old_pmd_ptl, + struct vm_area_struct *vma) { if (radix_enabled()) return false; @@ -1020,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, */ return true; } + + +#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit +static inline bool arch_needs_pgtable_deposit(void) +{ + if (radix_enabled()) + return false; + return true; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 99e1397b71da..609557569f65 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -28,6 +28,7 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change extern void tlb_flush(struct mmu_gather *tlb); @@ -46,6 +47,21 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ + if (!tlb->page_size) + tlb->page_size = page_size; + else if (tlb->page_size != page_size) { + tlb_flush_mmu(tlb); + /* + * update the page size after flush for the new + * mmu_gather. + */ + tlb->page_size = page_size; + } +} + #ifdef CONFIG_SMP static inline int mm_is_core_local(struct mm_struct *mm) { diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index a51c188b81f3..0cb6bd8bfccf 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr) int hot_add_scn_to_nid(unsigned long scn_addr) { struct device_node *memory = NULL; - int nid, found = 0; + int nid; if (!numa_enabled || (min_common_depth < 0)) return first_online_node; @@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr) if (nid < 0 || !node_online(nid)) nid = first_online_node; - if (NODE_DATA(nid)->node_spanned_pages) - return nid; - - for_each_online_node(nid) { - if (NODE_DATA(nid)->node_spanned_pages) { - found = 1; - break; - } - } - - BUG_ON(!found); return nid; } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 15711de10403..853b2a3d8dee 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -104,12 +104,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { @@ -162,5 +156,13 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) #define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) #define tlb_migrate_finish(mm) do { } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} #endif /* _S390_TLB_H */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3ba622702ce4..ec1f0dedb948 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); - radix_tree_replace_slot(slot, rmap); + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 025cdb1032f6..46e0d635e36f 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -65,6 +65,9 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) tlb->end = address + PAGE_SIZE; } +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, @@ -115,18 +118,18 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { return tlb_remove_page(tlb, page); } +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) #define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) #define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 821ff0acfe17..600a2e9bfee2 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -116,12 +116,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { @@ -141,6 +135,15 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) #define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index f09df2ff1bcc..d4a15831ac58 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) paravirt_free_ldt(ldt->entries, ldt->size); if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); + vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); kfree(ldt); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9c337b0e8ba7..4cfba947d774 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + x86_report_nx(); /* after early param, so could get panic from serial */ diff --git a/block/blk-settings.c b/block/blk-settings.c index f679ae122843..65f16cf4f850 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9cc8d7c5439a..ea374e820775 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; + q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(q->queue_lock); return ret; diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index c89d5d231a0e..c9b5cac03b36 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1015,6 +1015,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, const char *type = of_get_flat_dt_prop(node, "device_type", NULL); const __be32 *reg, *endp; int l; + bool hotpluggable; /* We are scanning "memory" nodes only */ if (type == NULL) { @@ -1034,6 +1035,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, return 0; endp = reg + (l / sizeof(__be32)); + hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL); pr_debug("memory scan node %s, reg size %d,\n", uname, l); @@ -1049,6 +1051,13 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, (unsigned long long)size); early_init_dt_add_memory_arch(base, size); + + if (!hotpluggable) + continue; + + if (early_init_dt_mark_hotplug_memory_arch(base, size)) + pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n", + base, base + size); } return 0; @@ -1146,6 +1155,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) memblock_add(base, size); } +int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size) +{ + return memblock_mark_hotplug(base, size); +} + int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { @@ -1168,6 +1182,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) WARN_ON(1); } +int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size) +{ + return -ENOSYS; +} + int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c index eb126b98ed8a..e50bbf826188 100644 --- a/drivers/pcmcia/m32r_pcc.c +++ b/drivers/pcmcia/m32r_pcc.c @@ -296,10 +296,11 @@ static int __init is_alive(u_short sock) return 0; } -static void add_pcc_socket(ulong base, int irq, ulong mapaddr, - unsigned int ioaddr) +static int add_pcc_socket(ulong base, int irq, ulong mapaddr, + unsigned int ioaddr) { pcc_socket_t *t = &socket[pcc_sockets]; + int err; /* add sockets */ t->ioaddr = ioaddr; @@ -328,11 +329,16 @@ static void add_pcc_socket(ulong base, int irq, ulong mapaddr, t->socket.irq_mask = 0; t->socket.pci_irq = 2 + pcc_sockets; /* XXX */ - request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt); + err = request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt); + if (err) { + if (t->base > 0) + release_region(t->base, 0x20); + return err; + } pcc_sockets++; - return; + return 0; } @@ -683,26 +689,29 @@ static int __init init_m32r_pcc(void) return ret; ret = platform_device_register(&pcc_device); - if (ret){ - platform_driver_unregister(&pcc_driver); - return ret; - } + if (ret) + goto unreg_driv; printk(KERN_INFO "m32r PCC probe:\n"); pcc_sockets = 0; - add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE, 0x1000); + ret = add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE, + 0x1000); + if (ret) + goto unreg_dev; #ifdef CONFIG_M32RPCC_SLOT2 - add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE, 0x2000); + ret = add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE, + 0x2000); + if (ret) + goto unreg_dev; #endif if (pcc_sockets == 0) { printk("socket is not found.\n"); - platform_device_unregister(&pcc_device); - platform_driver_unregister(&pcc_driver); - return -ENODEV; + ret = -ENODEV; + goto unreg_dev; } /* Set up interrupt handler(s) */ @@ -728,6 +737,12 @@ static int __init init_m32r_pcc(void) } return 0; + +unreg_dev: + platform_device_unregister(&pcc_device); +unreg_driv: + platform_driver_unregister(&pcc_driver); + return ret; } /* init_m32r_pcc */ static void __exit exit_m32r_pcc(void) diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c index e7899624aa0b..35bbe288ddb4 100644 --- a/drivers/sh/intc/virq.c +++ b/drivers/sh/intc/virq.c @@ -254,7 +254,7 @@ restart: radix_tree_tag_clear(&d->tree, entry->enum_id, INTC_TAG_VIRQ_NEEDS_ALLOC); - radix_tree_replace_slot((void **)entries[i], + radix_tree_replace_slot(&d->tree, (void **)entries[i], &intc_irq_xlate[irq]); } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2472af2798c7..e6c1bd443806 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); + if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) + goto end_coredump; + vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz)); if (!vma_filesz) goto end_coredump; @@ -2311,7 +2313,7 @@ end_coredump: cleanup: free_note_info(&info); kfree(shdr4extnum); - kfree(vma_filesz); + vfree(vma_filesz); kfree(phdr4note); kfree(elf); out: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 74ed5aae6cea..180f910339f4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -202,27 +202,31 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; - char lvl[4]; + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1]; struct va_format vaf; va_list args; - const char *type = logtypes[4]; + const char *type = NULL; int kern_level; struct ratelimit_state *ratelimit; va_start(args, fmt); - kern_level = printk_get_level(fmt); - if (kern_level) { + while ((kern_level = printk_get_level(fmt)) != 0) { size_t size = printk_skip_level(fmt) - fmt; - memcpy(lvl, fmt, size); - lvl[size] = '\0'; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } fmt += size; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } else { + } + + if (!type) { *lvl = '\0'; - /* Default to debug output */ - ratelimit = &printk_limits[7]; + type = logtypes[4]; + ratelimit = &printk_limits[4]; } vaf.fmt = fmt; @@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -643,12 +643,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, } mapping->nrexceptional++; } else { + struct radix_tree_node *node; void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - radix_tree_replace_slot(slot, new_entry); + __radix_tree_replace(page_tree, node, slot, + new_entry, NULL, NULL); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05713a5da083..ef600591d96f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb, work); - inode = wb_inode(wb->b_more_io.prev); - spin_lock(&inode->i_lock); - spin_unlock(&wb->list_lock); - /* This function drops i_lock... */ - inode_sleep_on_writeback(inode); - spin_lock(&wb->list_lock); - } + trace_writeback_wait(wb, work); + inode = wb_inode(wb->b_more_io.prev); + spin_lock(&inode->i_lock); + spin_unlock(&wb->list_lock); + /* This function drops i_lock... */ + inode_sleep_on_writeback(inode); + spin_lock(&wb->list_lock); } spin_unlock(&wb->list_lock); blk_finish_plug(&plug); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..9a88984f9f6f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1950,8 +1950,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, } int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, void *fsdata) { int i, ret; unsigned from, to, start = pos & (PAGE_SIZE - 1); @@ -2064,7 +2063,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, int ret; struct inode *inode = mapping->host; - ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata); up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_inode_unlock(inode, 1); @@ -2241,7 +2240,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, dwc->dw_zero_count++; } - ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); BUG_ON(ret != len); ret = 0; unlock: diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index b1c9f28a57b1..8614ff069d99 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle, struct buffer_head *bh)); int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + loff_t pos, unsigned len, unsigned copied, void *fsdata); typedef enum { OCFS2_WRITE_BUFFER = 0, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 636abcbd4650..9158c9825094 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg, hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; memset(hb_block, 0, reg->hr_block_bytes); /* TODO: time stuff */ - cputime = CURRENT_TIME.tv_sec; + cputime = ktime_get_real_seconds(); if (!cputime) cputime = 1; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3f828a187049..a464c8088170 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1609,8 +1609,6 @@ way_up_top: __dlm_insert_mle(dlm, mle); response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "mle was found\n"); - set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->master == dlm->node_num) { mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); @@ -1625,8 +1623,7 @@ way_up_top: response = DLM_MASTER_RESP_NO; } else response = DLM_MASTER_RESP_MAYBE; - if (set_maybe) - set_bit(request->node_idx, tmpmle->maybe_map); + set_bit(request->node_idx, tmpmle->maybe_map); spin_unlock(&tmpmle->spinlock); } spin_unlock(&dlm->master_lock); @@ -1644,12 +1641,6 @@ send_response: * dlm_assert_master_worker() isn't called, we drop it here. */ if (dispatch_assert) { - if (response != DLM_MASTER_RESP_YES) - mlog(ML_ERROR, "invalid response %d\n", response); - if (!res) { - mlog(ML_ERROR, "bad lockres while trying to assert!\n"); - BUG(); - } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); spin_lock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index dd5cb8bcefd1..74407c6dd592 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); break; - default: - BUG(); } mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c56a7679df93..382401d3e88f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_commit; } - di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a244f14c6b87..d5e5fa7f0743 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) */ seqno++; os->os_count++; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: @@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) struct ocfs2_orphan_scan *os; os = &osb->osb_orphan_scan; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 71545ad4628c..429088786e93 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); BUG_ON(ret != len); ret = VM_FAULT_LOCKED; out: diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8d887c75765c..3b0a10d9b36f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_extent_list *fel; u16 feat; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct timespec64 ts; *new_fe_bh = NULL; @@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); + ktime_get_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = - cpu_to_le64(CURRENT_TIME.tv_sec); + cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = - cpu_to_le32(CURRENT_TIME.tv_nsec); + cpu_to_le32(ts.tv_nsec); fe->i_dtime = 0; /* diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index e63af7ddfe68..7e5958b0be6b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,7 +224,7 @@ struct ocfs2_orphan_scan { struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; - struct timespec os_scantime; /* time this node ran the scan */ + time64_t os_scantime; /* time this node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19238512a324..738b4ea8e990 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -478,7 +478,6 @@ again: if (ret) { mlog_errno(ret); ocfs2_unlock_refcount_tree(osb, tree, rw); - ocfs2_refcount_tree_put(tree); goto out; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f56fe39fab04..c894d945b084 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) out += snprintf(buf + out, len - out, "Disabled\n"); else out += snprintf(buf + out, len - out, "%lu seconds ago\n", - (get_seconds() - os->os_scantime.tv_sec)); + (unsigned long)(ktime_get_seconds() - os->os_scantime)); out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", "Slots", "Num", "RecoGen"); diff --git a/fs/proc/array.c b/fs/proc/array.c index 81818adb8e9e..51a4213afa2e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header, if (sigismember(set, i+2)) x |= 2; if (sigismember(set, i+3)) x |= 4; if (sigismember(set, i+4)) x |= 8; - seq_printf(m, "%x", x); + seq_putc(m, hex_asc[x]); } while (i >= 4); seq_putc(m, '\n'); @@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { + seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP - seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode); - seq_putc(m, '\n'); + seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif + seq_putc(m, '\n'); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/fs/proc/base.c b/fs/proc/base.c index ca651ac00660..9b99df4893a4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -104,9 +104,12 @@ * in /proc for a task before it execs a suid executable. */ +static u8 nlink_tid; +static u8 nlink_tgid; + struct pid_entry { const char *name; - int len; + unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; @@ -139,13 +142,13 @@ struct pid_entry { * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. */ -static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, +static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; - count = 0; + count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; @@ -1967,7 +1970,7 @@ out: struct map_files_info { fmode_t mode; - unsigned long len; + unsigned int len; unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; @@ -2412,14 +2415,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. */ - last = &ents[nents - 1]; - for (p = ents; p <= last; p++) { + last = &ents[nents]; + for (p = ents; p < last; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) break; } - if (p > last) + if (p >= last) goto out; error = proc_pident_instantiate(dir, dentry, task, p); @@ -2444,7 +2447,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx, if (ctx->pos >= nents + 2) goto out; - for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; @@ -3068,8 +3071,7 @@ static int proc_pid_instantiate(struct inode *dir, inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff))); + set_nlink(inode, nlink_tgid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3361,8 +3363,7 @@ static int proc_task_instantiate(struct inode *dir, inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff))); + set_nlink(inode, nlink_tid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3552,3 +3553,9 @@ static const struct file_operations proc_task_operations = { .iterate_shared = proc_task_readdir, .llseek = generic_file_llseek, }; + +void __init set_proc_pid_nlink(void) +{ + nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e69ebe648a34..783bc19644d1 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { + /* + * close() (proc_reg_release()) can't delete an entry and proceed: + * ->release hook needs to be available at the right moment. + * + * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: + * "struct file" needs to be available at the right moment. + * + * Therefore, first process to enter this function does ->release() and + * signals its completion to the other process which does nothing. + */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); @@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); } else { struct file *file; - pdeo->closing = 1; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_fops->release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); - list_del_init(&pdeo->lh); + /* After ->release. */ + list_del(&pdeo->lh); if (pdeo->c) complete(pdeo->c); kfree(pdeo); @@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de) if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); + /* ->pde_openers list can't grow from now on. */ + spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; @@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct pde_opener *pdeo; /* - * What for, you ask? Well, we can have open, rmmod, remove_proc_entry - * sequence. ->release won't be called because ->proc_fops will be - * cleared. Depending on complexity of ->release, consequences vary. + * Ensure that + * 1) PDE's ->release hook will be called no matter what + * either normally by close()/->release, or forcefully by + * rmmod/remove_proc_entry. + * + * 2) rmmod isn't blocked by opening file in /proc and sitting on + * the descriptor (including "rmmod foo </proc/foo" scenario). * - * We can't wait for mercy when close will be done for real, it's - * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release - * by hand in remove_proc_entry(). For this, save opener's credentials - * for later. + * Save every "struct file" with custom ->release hook. */ - pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; @@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (rv == 0 && release) { /* To know what to release. */ pdeo->file = file; - /* Strictly for "too late" ->release in proc_reg_release(). */ + pdeo->closing = false; + pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5378441ec1b7..bbba5d22aada 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -203,7 +203,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name); struct pde_opener { struct file *file; struct list_head lh; - int closing; + bool closing; struct completion *c; }; extern const struct inode_operations proc_link_inode_operations; @@ -211,6 +211,7 @@ extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern void proc_init_inodecache(void); +void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/root.c b/fs/proc/root.c index 8d3e484055a6..4bd0373576b5 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -122,6 +122,7 @@ void __init proc_root_init(void) int err; proc_init_inodecache(); + set_proc_pid_nlink(); err = register_filesystem(&proc_fs_type); if (err) return; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 35b92d81692f..958f32545064 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); + cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 41b95d82a185..18af2bcefe6a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -652,18 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif -#ifndef pmd_move_must_withdraw -static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, - spinlock_t *old_pmd_ptl) -{ - /* - * With split pmd lock we also need to move preallocated - * PTE page table if new_pmd is on different PMD page table. - */ - return new_pmd_ptl != old_pmd_ptl; -} +#ifndef arch_needs_pgtable_deposit +#define arch_needs_pgtable_deposit() (false) #endif - /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index c6d667187608..7eed8cf3130a 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -107,11 +107,6 @@ struct mmu_gather { struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; unsigned int batch_count; - /* - * __tlb_adjust_range will track the new addr here, - * that that we can adjust the range after the flush - */ - unsigned long addr; int page_size; }; @@ -125,16 +120,11 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); static inline void __tlb_adjust_range(struct mmu_gather *tlb, - unsigned long address) + unsigned long address, + unsigned int range_size) { tlb->start = min(tlb->start, address); - tlb->end = max(tlb->end, address + PAGE_SIZE); - /* - * Track the last address with which we adjusted the range. This - * will be used later to adjust again after a mmu_flush due to - * failed __tlb_remove_page - */ - tlb->addr = address; + tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) @@ -150,15 +140,11 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, page_size)) { + if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); - tlb->page_size = page_size; - __tlb_adjust_range(tlb, tlb->addr); - __tlb_remove_page_size(tlb, page, page_size); - } } -static bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return __tlb_remove_page_size(tlb, page, PAGE_SIZE); } @@ -172,14 +158,21 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return tlb_remove_page_size(tlb, page, PAGE_SIZE); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) +#ifndef tlb_remove_check_page_size_change +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) { - /* active->nr should be zero when we call this */ - VM_BUG_ON_PAGE(tlb->active->nr, page); - tlb->page_size = PAGE_SIZE; - __tlb_adjust_range(tlb, tlb->addr); - return __tlb_remove_page(tlb, page); + /* + * We don't care about page size change, just update + * mmu_gather page size here so that debug checks + * doesn't throw false warning. + */ +#ifdef CONFIG_DEBUG_VM + tlb->page_size = page_size; +#endif } +#endif /* * In the case of tlb vma handling, we can optimise these away in the @@ -215,10 +208,16 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address); \ + __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + do { \ + __tlb_adjust_range(tlb, address, huge_page_size(h)); \ + __tlb_remove_tlb_entry(tlb, ptep, address); \ + } while (0) + /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. @@ -227,29 +226,47 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif -#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ - do { \ - __tlb_adjust_range(tlb, address); \ - __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ +#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ + do { \ + __tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE); \ + __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) +/* + * For things like page tables caches (ie caching addresses "inside" the + * page tables, like x86 does), for legacy reasons, flushing an + * individual page had better flush the page table caches behind it. This + * is definitely how x86 works, for example. And if you have an + * architected non-legacy page table cache (which I'm not aware of + * anybody actually doing), you're going to have some architecturally + * explicit flushing for that, likely *separate* from a regular TLB entry + * flush, and thus you'd need more than just some range expansion.. + * + * So if we ever find an architecture + * that would want something that odd, I think it is up to that + * architecture to do its own odd thing, not cause pain for others + * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com + * + * For now w.r.t page table cache, mark the range_size as PAGE_SIZE + */ + #define pte_free_tlb(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address); \ + __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #ifndef __ARCH_HAS_4LEVEL_HACK #define pud_free_tlb(tlb, pudp, address) \ do { \ - __tlb_adjust_range(tlb, address); \ + __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #define pmd_free_tlb(tlb, pmdp, address) \ do { \ - __tlb_adjust_range(tlb, address); \ + __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c357f27d5483..0b5b1af35e5e 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -136,12 +136,13 @@ struct bdi_writeback { struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ + unsigned long io_pages; /* max allowed IO size */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ char *name; + unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; diff --git a/include/linux/cma.h b/include/linux/cma.h index 29f9e774ab76..6f0a91b37f68 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -1,6 +1,9 @@ #ifndef __CMA_H__ #define __CMA_H__ +#include <linux/init.h> +#include <linux/types.h> + /* * There is always at least global CMA area and a few optional * areas configured in kernel .config. diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 928e5ca0caee..0444b1336268 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -21,7 +21,7 @@ * clobbered. The issue is as follows: while the inline asm might * access any memory it wants, the compiler could have fit all of * @ptr into memory registers instead, and since @ptr never escaped - * from that, it proofed that the inline asm wasn't touching any of + * from that, it proved that the inline asm wasn't touching any of * it. This version works well with both compilers, i.e. we're telling * the compiler that the inline asm absolutely may see the contents * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e35e6de633b9..1f782aa1d8e6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -189,6 +189,8 @@ static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) +static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, bool freeze, struct page *page) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c1c3e63d52c1..4fec8b775895 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -175,7 +175,7 @@ __printf(2, 3) struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...); -struct kthread_worker * +__printf(3, 4) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5e5b2969d931..5f4d8281832b 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -7,6 +7,7 @@ #include <linux/mmzone.h> +#include <linux/dax.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> @@ -177,6 +178,13 @@ static inline bool vma_migratable(struct vm_area_struct *vma) if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; + /* + * DAX device mappings require predictable access latency, so avoid + * incurring periodic faults. + */ + if (vma_is_dax(vma)) + return false; + #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION if (vma->vm_flags & VM_HUGETLB) return false; diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 4341f32516d8..271b3fdf0070 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -71,6 +71,7 @@ extern int early_init_dt_scan_chosen_stdout(void); extern void early_init_fdt_scan_reserved_mem(void); extern void early_init_fdt_reserve_self(void); extern void early_init_dt_add_memory_arch(u64 base, u64 size); +extern int early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size); extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool no_map); extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align); diff --git a/include/linux/printk.h b/include/linux/printk.h index eac1af8502bb..3472cc6b7a60 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -10,6 +10,8 @@ extern const char linux_banner[]; extern const char linux_proc_banner[]; +#define PRINTK_MAX_SINGLE_HEADER_LEN 2 + static inline int printk_get_level(const char *buffer) { if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { @@ -31,6 +33,14 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +static inline const char *printk_skip_headers(const char *buffer) +{ + while (printk_get_level(buffer)) + buffer = printk_skip_level(buffer); + + return buffer; +} + #define CONSOLE_EXT_LOG_MAX 8192 /* printk's without a loglevel use this.. */ @@ -40,10 +50,15 @@ static inline const char *printk_skip_level(const char *buffer) #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ #define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ #define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ -#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */ #define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ #define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ +/* + * Default used to be hard-coded at 7, we're now allowing it to be set from + * kernel config. + */ +#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT + extern int console_printk[]; #define console_loglevel (console_printk[0]) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index af3581b8a451..744486057e9e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -80,14 +80,11 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* Internally used bits of node->count */ -#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) -#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) - struct radix_tree_node { - unsigned char shift; /* Bits remaining in each slot */ - unsigned char offset; /* Slot offset in parent */ - unsigned int count; + unsigned char shift; /* Bits remaining in each slot */ + unsigned char offset; /* Slot offset in parent */ + unsigned char count; /* Total entry count */ + unsigned char exceptional; /* Exceptional entry count */ union { struct { /* Used when ascending tree */ @@ -248,20 +245,6 @@ static inline int radix_tree_exception(void *arg) return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } -/** - * radix_tree_replace_slot - replace item in a slot - * @pslot: pointer to slot, returned by radix_tree_lookup_slot - * @item: new item to store in the slot. - * - * For use with radix_tree_lookup_slot(). Caller must hold tree write locked - * across slot lookup and replacement. - */ -static inline void radix_tree_replace_slot(void **pslot, void *item) -{ - BUG_ON(radix_tree_is_internal_node(item)); - rcu_assign_pointer(*pslot, item); -} - int __radix_tree_create(struct radix_tree_root *root, unsigned long index, unsigned order, struct radix_tree_node **nodep, void ***slotp); @@ -276,7 +259,14 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); -bool __radix_tree_delete_node(struct radix_tree_root *root, +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + radix_tree_update_node_t update_node, void *private); +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item); +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b46bb5620a76..15321fb1df6b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -137,11 +137,19 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) * anon_vma helper functions. */ void anon_vma_init(void); /* create anon_vma_cachep */ -int anon_vma_prepare(struct vm_area_struct *); +int __anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + static inline void anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 7551d3e2ab70..0e90f2973719 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -540,7 +540,11 @@ static inline int get_dumpable(struct mm_struct *mm) /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ -#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +/* + * This one-shot flag is dropped due to necessity of changing exe once again + * on NFS restore + */ +//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ diff --git a/include/linux/swap.h b/include/linux/swap.h index a56523cefb9b..09b212d37f1d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -246,39 +246,7 @@ struct swap_info_struct { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -extern struct list_lru workingset_shadow_nodes; - -static inline unsigned int workingset_node_pages(struct radix_tree_node *node) -{ - return node->count & RADIX_TREE_COUNT_MASK; -} - -static inline void workingset_node_pages_inc(struct radix_tree_node *node) -{ - node->count++; -} - -static inline void workingset_node_pages_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_pages(node)); - node->count--; -} - -static inline unsigned int workingset_node_shadows(struct radix_tree_node *node) -{ - return node->count >> RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_inc(struct radix_tree_node *node) -{ - node->count += 1U << RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_shadows(node)); - node->count -= 1U << RADIX_TREE_COUNT_SHIFT; -} +void workingset_update_node(struct radix_tree_node *node, void *private); /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3d9d786a943c..d68edffbf142 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -82,6 +82,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller); extern void vfree(const void *addr); +extern void vfree_atomic(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); diff --git a/init/do_mounts.c b/init/do_mounts.c index dea5de95c2dd..c2de5104aad2 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -588,7 +588,7 @@ void __init prepare_namespace(void) saved_root_name); while (driver_probe_done() != 0 || (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) - msleep(100); + msleep(5); async_synchronize_full(); } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index fc1ef736253c..98c9011eac78 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -697,7 +697,7 @@ kdb_printit: * Write to all consoles. */ retlen = strlen(kdb_buffer); - cp = (char *) printk_skip_level(kdb_buffer); + cp = (char *) printk_skip_headers(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { diff --git a/kernel/fork.c b/kernel/fork.c index 5957cf8b4c4b..7377f414f3ce 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -229,7 +229,7 @@ static inline void free_thread_stack(struct task_struct *tsk) } local_irq_restore(flags); - vfree(tsk->stack); + vfree_atomic(tsk->stack); return; } #endif diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2b59c82cc3e1..40c07e4fa116 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -106,7 +106,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { - sysctl_hung_task_warnings--; + if (sysctl_hung_task_warnings > 0) + sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", t->comm, t->pid, timeout); pr_err(" %s %s %.*s\n", diff --git a/kernel/kthread.c b/kernel/kthread.c index 956495f0efaf..2318fba86277 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -261,7 +261,8 @@ static void create_kthread(struct kthread_create_info *create) } } -static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), +static __printf(4, 0) +struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) @@ -635,7 +636,7 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); -static struct kthread_worker * +static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 16bab471c7e2..f011aaef583c 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) { + /* The trailing '\0' is not counted into len. */ + if (len >= sizeof(s->buffer) - 1) { atomic_inc(&nmi_message_lost); return 0; } @@ -79,7 +80,7 @@ again: if (!len) smp_rmb(); - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. @@ -113,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len) } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, - int start, int end) +/* printk part of the temporary buffer line by line */ +static int printk_nmi_flush_buffer(const char *start, size_t len) { - const char *buf = s->buffer + start; + const char *c, *end; + bool header; + + c = start; + end = start + len; + header = true; + + /* Print line by line. */ + while (c < end) { + if (*c == '\n') { + printk_nmi_flush_line(start, c - start + 1); + start = ++c; + header = true; + continue; + } + + /* Handle continuous lines or missing new line. */ + if ((c + 1 < end) && printk_get_level(c)) { + if (header) { + c = printk_skip_level(c); + continue; + } + + printk_nmi_flush_line(start, c - start); + start = c++; + header = true; + continue; + } + + header = false; + c++; + } - printk_nmi_flush_line(buf, (end - start) + 1); + /* Check if there was a partial line. Ignore pure header. */ + if (start < end && !header) { + static const char newline[] = KERN_CONT "\n"; + + printk_nmi_flush_line(start, end - start); + printk_nmi_flush_line(newline, strlen(newline)); + } + + return len; } /* @@ -135,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work) __RAW_SPIN_LOCK_INITIALIZER(read_lock); struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); unsigned long flags; - size_t len, size; - int i, last_i; + size_t len; + int i; /* * The lock has two functions. First, one reader has to flush all @@ -154,12 +190,14 @@ more: /* * This is just a paranoid check that nobody has manipulated * the buffer an unexpected way. If we printed something then - * @len must only increase. + * @len must only increase. Also it should never overflow the + * buffer size. */ - if (i && i >= len) { + if ((i && i >= len) || len > sizeof(s->buffer)) { const char *msg = "printk_nmi_flush: internal error\n"; printk_nmi_flush_line(msg, strlen(msg)); + len = 0; } if (!len) @@ -167,22 +205,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - - size = min(len, sizeof(s->buffer)); - last_i = i; - - /* Print line by line. */ - for (; i < size; i++) { - if (s->buffer[i] == '\n') { - printk_nmi_flush_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < size) { - printk_nmi_flush_seq_line(s, last_i, size - 1); - printk_nmi_flush_line("\n", strlen("\n")); - } + i += printk_nmi_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate diff --git a/kernel/sys.c b/kernel/sys.c index 78c9fb7dd680..9758892a2d09 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1697,16 +1697,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) fput(exe_file); } - /* - * The symlink can be changed only once, just to disallow arbitrary - * transitions malicious software might bring in. This means one - * could make a snapshot over all processes running and monitor - * /proc/pid/exe changes to notice unusual activity if needed. - */ - err = -EPERM; - if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit; - err = 0; /* set the new file, lockless */ get_file(exe.file); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9bb7d825ba14..e40a0715f422 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -15,6 +15,21 @@ config PRINTK_TIME The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt +config CONSOLE_LOGLEVEL_DEFAULT + int "Default console loglevel (1-15)" + range 1 15 + default "7" + help + Default loglevel to determine what will be printed on the console. + + Setting a default here is equivalent to passing in loglevel=<x> in + the kernel bootargs. loglevel=<x> continues to override whatever + value is specified here as well. + + Note: This does not affect the log level of un-prefixed prink() + usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT + option. + config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 @@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT that are auditing their logs closely may want to set it to a lower priority. + Note: This does not affect what message level gets printed on the console + by default. To change that, use loglevel=<x> in the kernel bootargs, + or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value. + config BOOT_PRINTK_DELAY bool "Delay each boot printk message by N milliseconds" depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY @@ -1986,7 +2005,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED config STRICT_DEVMEM bool "Filter access to /dev/mem" - depends on MMU + depends on MMU && DEVMEM depends on ARCH_HAS_DEVMEM_IS_ALLOWED default y if TILE || PPC ---help--- diff --git a/lib/idr.c b/lib/idr.c index 6098336df267..52d2979a05e8 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get); * and go back to the ida_pre_get() call. If the ida is full, it will * return %-ENOSPC. * + * Note that callers must ensure that concurrent access to @ida is not possible. + * See ida_simple_get() for a varaint which takes care of locking. + * * @p_id returns a value in the range @starting_id ... %0x7fffffff. */ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) @@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy); * Allocates an id in the range start <= id < end, or returns -ENOSPC. * On memory allocation failure, returns -ENOMEM. * + * Compared to ida_get_new_above() this function does its own locking, and + * should be used unless there are special requirements. + * * Use ida_simple_remove() to get rid of an id. */ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, @@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get); * ida_simple_remove - remove an allocated id. * @ida: the (initialized) ida. * @id: the id returned by ida_simple_get. + * + * Use to release an id allocated with ida_simple_get(). + * + * Compared to ida_remove() this function does its own locking, and should be + * used unless there are special requirements. */ void ida_simple_remove(struct ida *ida, unsigned int id) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 4b8bb3618b83..2e8c6f7aa56e 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->parent); + node->shift, node->count, node->exceptional, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) tag_clear(node, i, 0); node->slots[0] = NULL; - node->count = 0; kmem_cache_free(radix_tree_node_cachep, node); } @@ -522,8 +521,13 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_internal_node(slot)) + if (radix_tree_is_internal_node(slot)) { entry_to_node(slot)->parent = node; + } else { + /* Moving an exceptional root->rnode to a node */ + if (radix_tree_exceptional_entry(slot)) + node->exceptional = 1; + } node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -534,6 +538,104 @@ out: } /** + * radix_tree_shrink - shrink radix tree to minimum height + * @root radix tree root + */ +static inline void radix_tree_shrink(struct radix_tree_root *root, + radix_tree_update_node_t update_node, + void *private) +{ + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; + + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. + */ + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) + break; + if (!radix_tree_is_internal_node(child) && node->shift) + break; + + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it + * (node->slots[0]), it will be safe to dereference the new + * one (root->rnode) as far as dependent read barriers go. + */ + root->rnode = child; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page has 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. + */ + node->count = 0; + if (!radix_tree_is_internal_node(child)) { + node->slots[0] = RADIX_TREE_RETRY; + if (update_node) + update_node(node, private); + } + + radix_tree_node_free(node); + } +} + +static void delete_node(struct radix_tree_root *root, + struct radix_tree_node *node, + radix_tree_update_node_t update_node, void *private) +{ + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == entry_to_node(root->rnode)) + radix_tree_shrink(root, update_node, private); + return; + } + + parent = node->parent; + if (parent) { + parent->slots[node->offset] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->rnode = NULL; + } + + radix_tree_node_free(node); + + node = parent; + } while (node); +} + +/** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key @@ -649,6 +751,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, if (node) { unsigned offset = get_slot_offset(node, slot); node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -746,6 +850,85 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +static void replace_slot(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + bool warn_typeswitch) +{ + void *old = rcu_dereference_raw(*slot); + int count, exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + + count = !!item - !!old; + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(warn_typeswitch && (count || exceptional)); + + if (node) { + node->count += count; + node->exceptional += exceptional; + } + + rcu_assign_pointer(*slot, item); +} + +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + radix_tree_update_node_t update_node, void *private) +{ + /* + * This function supports replacing exceptional entries and + * deleting entries, but that needs accounting against the + * node unless the slot is root->rnode. + */ + replace_slot(root, node, slot, item, + !node && slot != (void **)&root->rnode); + + if (!node) + return; + + if (update_node) + update_node(node, private); + + delete_node(root, node, update_node, private); +} + +/** + * radix_tree_replace_slot - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked + * across slot lookup and replacement. + * + * NOTE: This cannot be used to switch between non-entries (empty slots), + * regular entries, and exceptional entries, as that requires accounting + * inside the radix tree node. When switching from one type of entry or + * deleting, use __radix_tree_lookup() and __radix_tree_replace(). + */ +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item) +{ + replace_slot(root, NULL, slot, item, true); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root @@ -1394,75 +1577,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) #endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** - * radix_tree_shrink - shrink radix tree to minimum height - * @root radix tree root - */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) -{ - bool shrunk = false; - - for (;;) { - struct radix_tree_node *node = root->rnode; - struct radix_tree_node *child; - - if (!radix_tree_is_internal_node(node)) - break; - node = entry_to_node(node); - - /* - * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. - */ - if (node->count != 1) - break; - child = node->slots[0]; - if (!child) - break; - if (!radix_tree_is_internal_node(child) && node->shift) - break; - - if (radix_tree_is_internal_node(child)) - entry_to_node(child)->parent = NULL; - - /* - * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another: if it - * was safe to dereference the old pointer to it - * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. - */ - root->rnode = child; - - /* - * We have a dilemma here. The node's slot[0] must not be - * NULLed in case there are concurrent lookups expecting to - * find the item. However if this was a bottom-level node, - * then it may be subject to the slot pointer being visible - * to callers dereferencing it. If item corresponding to - * slot[0] is subsequently deleted, these callers would expect - * their slot to become empty sooner or later. - * - * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page has 0 refcount it means it - * was concurrently deleted from pagecache so try the deref - * again. Fortunately there is already a requirement for logic - * to retry the entire slot lookup -- the indirect pointer - * problem (replacing direct root node with an indirect pointer - * also results in a stale slot). So tag the slot as indirect - * to force callers to retry. - */ - if (!radix_tree_is_internal_node(child)) - node->slots[0] = RADIX_TREE_RETRY; - - radix_tree_node_free(node); - shrunk = true; - } - - return shrunk; -} - -/** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root * @node: node containing @index @@ -1470,39 +1584,11 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the * node and shrinking the tree. - * - * Returns %true if @node was freed, %false otherwise. */ -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - bool deleted = false; - - do { - struct radix_tree_node *parent; - - if (node->count) { - if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); - return deleted; - } - - parent = node->parent; - if (parent) { - parent->slots[node->offset] = NULL; - parent->count--; - } else { - root_tag_clear_all(root); - root->rnode = NULL; - } - - radix_tree_node_free(node); - deleted = true; - - node = parent; - } while (node); - - return deleted; + delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1559,10 +1645,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - node->slots[offset] = NULL; - node->count--; - - __radix_tree_delete_node(root, node); + __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; } diff --git a/lib/rbtree.c b/lib/rbtree.c index eb8a19fee110..1f8b112a7c35 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * * (p) (p) * / \ / \ - * N S --> N Sl + * N S --> N sl * / \ \ - * sl Sr s + * sl Sr S * \ * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr */ tmp1 = tmp2->rb_right; WRITE_ONCE(sibling->rb_left, tmp1); @@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, } break; } - /* Case 3 - right rotate at sibling */ + /* Case 3 - left rotate at sibling */ tmp1 = tmp2->rb_left; WRITE_ONCE(sibling->rb_right, tmp1); WRITE_ONCE(tmp2->rb_left, sibling); @@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, tmp1 = sibling; sibling = tmp2; } - /* Case 4 - left rotate at parent + color flips */ + /* Case 4 - right rotate at parent + color flips */ tmp2 = sibling->rb_right; WRITE_ONCE(parent->rb_left, tmp2); WRITE_ONCE(sibling->rb_right, parent); diff --git a/mm/Kconfig b/mm/Kconfig index 86e3e0e74d20..9b8fccb969dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MOVABLE_NODE bool "Enable to assign a node which has only movable memory" depends on HAVE_MEMBLOCK depends on NO_BOOTMEM - depends on X86_64 + depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG depends on NUMA default n help @@ -447,13 +447,9 @@ choice benefit. endchoice -# -# We don't deposit page tables on file THP mapping, -# but Power makes use of them to address MMU quirk. -# config TRANSPARENT_HUGE_PAGECACHE def_bool y - depends on TRANSPARENT_HUGEPAGE && !PPC + depends on TRANSPARENT_HUGEPAGE # # UP and nommu archs use km based percpu allocator diff --git a/mm/compaction.c b/mm/compaction.c index 0d37192d9423..223464227299 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } -/* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) -{ - struct page *page; - unsigned int count[2] = { 0, }; - - if (list_empty(&cc->migratepages)) - return; - - list_for_each_entry(page, &cc->migratepages, lru) - count[!!page_is_file_cache(page)]++; - - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); -} - /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct zone *zone) { @@ -866,6 +850,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); isolate_success: list_add(&page->lru, &cc->migratepages); @@ -902,7 +888,6 @@ isolate_fail: spin_unlock_irqrestore(zone_lru_lock(zone), flags); locked = false; } - acct_isolated(zone, cc); putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; cc->last_migrated_pfn = 0; @@ -988,7 +973,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; } - acct_isolated(cc->zone, cc); return pfn; } @@ -1258,10 +1242,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); - if (!low_pfn || cc->contended) { - acct_isolated(zone, cc); + if (!low_pfn || cc->contended) return ISOLATE_ABORT; - } /* * Either we isolated something and proceed with migration. Or @@ -1271,7 +1253,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, break; } - acct_isolated(zone, cc); /* Record where migration scanner will be restarted. */ cc->migrate_pfn = low_pfn; diff --git a/mm/debug.c b/mm/debug.c index 9feb699c5d25..db1cd26d8752 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason) pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, + sizeof(unsigned long), page, + sizeof(struct page), false); + if (reason) pr_alert("page dumped because: %s\n", reason); diff --git a/mm/filemap.c b/mm/filemap.c index 50b52fe51937..5b4dd03130da 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -132,44 +132,29 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!dax_mapping(mapping)) { if (shadowp) *shadowp = p; - if (node) - workingset_node_shadows_dec(node); } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); /* Wakeup waiters for exceptional entry lock */ dax_wake_mapping_entry_waiter(mapping, page->index, false); } } - radix_tree_replace_slot(slot, page); + __radix_tree_replace(&mapping->page_tree, node, slot, page, + workingset_update_node, mapping); mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } return 0; } static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); + int i, nr; + + /* hugetlb pages are represented by one entry in the radix tree */ + nr = PageHuge(page) ? 1 : hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); @@ -182,44 +167,11 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - - if (!node) { - VM_BUG_ON_PAGE(nr != 1, page); - /* - * We need a node to properly account shadow - * entries. Don't plant any without. XXX - */ - shadow = NULL; - } - - radix_tree_replace_slot(slot, shadow); + VM_BUG_ON_PAGE(!node && nr != 1, page); - if (!node) - break; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) - continue; - - /* - * Track node that only contains shadow entries. DAX mappings - * contain no shadow entries and may contain other exceptional - * entries so skip those. - * - * Avoid acquiring the list_lru lock if already tracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, - &node->private_list); - } + radix_tree_clear_tags(&mapping->page_tree, node, slot); + __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + workingset_update_node, mapping); } if (shadow) { @@ -632,7 +632,8 @@ next_page: return i; } -bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +static bool vma_permits_fault(struct vm_area_struct *vma, + unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); @@ -857,14 +858,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages_locked); /* - * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to - * pass additional gup_flags as last parameter (like FOLL_HWPOISON). + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for + * tsk, mm to be specified. * * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the - * caller if required (just like with __get_user_pages). "FOLL_GET", - * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed - * according to the parameters "pages", "write", "force" - * respectively. + * caller if required (just like with __get_user_pages). "FOLL_GET" + * is set implicitly if "pages" is non-NULL. */ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -894,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * get_user_pages_unlocked(tsk, mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so - * get_user_pages_fast should be used instead, if the two parameters - * "tsk" and "mm" are respectively equal to current and current->mm, - * or if "force" shall be set to 1 (get_user_pages_fast misses the - * "force" parameter). + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8e35cc66d32..cee42cf05477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj, } static struct kobj_attribute use_zero_page_attr = __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); + +static ssize_t hpage_pmd_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE); +} +static struct kobj_attribute hpage_pmd_size_attr = + __ATTR_RO(hpage_pmd_size); + #ifdef CONFIG_DEBUG_VM static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, + &hpage_pmd_size_attr.attr, #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &shmem_enabled_attr.attr, #endif @@ -1323,6 +1333,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; bool ret = false; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; @@ -1378,12 +1390,23 @@ out_unlocked: return ret; } +static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t pgtable; + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pte_free(mm, pgtable); + atomic_long_dec(&mm->nr_ptes); +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { pmd_t orig_pmd; spinlock_t *ptl; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; @@ -1399,12 +1422,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); @@ -1417,6 +1440,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, atomic_long_dec(&tlb->mm->nr_ptes); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -1425,6 +1450,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, + spinlock_t *old_pmd_ptl, + struct vm_area_struct *vma) +{ + /* + * With split pmd lock we also need to move preallocated + * PTE page table if new_pmd is on different PMD page table. + * + * We also don't deposit and withdraw tables for file pages. + */ + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); +} +#endif + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) @@ -1462,8 +1502,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); - if (pmd_move_must_withdraw(new_ptl, old_ptl) && - vma_is_anonymous(vma)) { + if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); @@ -1589,6 +1628,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * We are going to unmap this huge page. So + * just go ahead and zap it + */ + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 418bf01a50ed..3edb759c5c7d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3286,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + /* + * This is a hugetlb vma, all the pte entries should point + * to huge page. + */ + tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; @@ -3336,7 +3341,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } pte = huge_ptep_get_and_clear(mm, address, ptep); - tlb_remove_tlb_entry(tlb, ptep, address); + tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); @@ -3450,15 +3455,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page, spinlock_t *ptl) + unsigned long address, pte_t *ptep, + struct page *pagecache_page, spinlock_t *ptl) { + pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + pte = huge_ptep_get(ptep); old_page = pte_page(pte); retry_avoidcopy: @@ -3711,8 +3718,7 @@ retry: vma_end_reservation(h, vma, address); } - ptl = huge_pte_lockptr(h, mm, ptep); - spin_lock(ptl); + ptl = huge_pte_lock(h, mm, ptep); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -3733,7 +3739,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); } spin_unlock(ptl); @@ -3888,8 +3894,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, + pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); @@ -4330,8 +4336,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); - spin_lock(ptl); + ptl = huge_pte_lock(hstate_vma(vma), mm, spte); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index baabaad4a4aa..dae929c02bbb 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) qlist_init(from); } -static void qlist_move(struct qlist_head *from, struct qlist_node *last, - struct qlist_head *to, size_t size) -{ - if (unlikely(last == from->tail)) { - qlist_move_all(from, to); - return; - } - if (qlist_empty(to)) - to->head = from->head; - else - to->tail->next = from->head; - to->tail = last; - from->head = last->next; - last->next = NULL; - from->bytes -= size; - to->bytes += size; -} - +#define QUARANTINE_PERCPU_SIZE (1 << 20) +#define QUARANTINE_BATCHES \ + (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS) /* * The object quarantine consists of per-cpu queues and a global queue, @@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last, */ static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); -static struct qlist_head global_quarantine; +/* Round-robin FIFO array of batches. */ +static struct qlist_head global_quarantine[QUARANTINE_BATCHES]; +static int quarantine_head; +static int quarantine_tail; +/* Total size of all objects in global_quarantine across all batches. */ +static unsigned long quarantine_size; static DEFINE_SPINLOCK(quarantine_lock); /* Maximum size of the global queue. */ -static unsigned long quarantine_size; +static unsigned long quarantine_max_size; + +/* + * Target size of a batch in global_quarantine. + * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM. + */ +static unsigned long quarantine_batch_size; /* * The fraction of physical memory the quarantine is allowed to occupy. @@ -124,9 +120,6 @@ static unsigned long quarantine_size; */ #define QUARANTINE_FRACTION 32 -#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) -#define QUARANTINE_PERCPU_SIZE (1 << 20) - static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) { return virt_to_head_page(qlink)->slab_cache; @@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(!qlist_empty(&temp))) { spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_all(&temp, &global_quarantine); + WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); + qlist_move_all(&temp, &global_quarantine[quarantine_tail]); + if (global_quarantine[quarantine_tail].bytes >= + READ_ONCE(quarantine_batch_size)) { + int new_tail; + + new_tail = quarantine_tail + 1; + if (new_tail == QUARANTINE_BATCHES) + new_tail = 0; + if (new_tail != quarantine_head) + quarantine_tail = new_tail; + } spin_unlock_irqrestore(&quarantine_lock, flags); } } void quarantine_reduce(void) { - size_t new_quarantine_size, percpu_quarantines; + size_t total_size, new_quarantine_size, percpu_quarantines; unsigned long flags; struct qlist_head to_free = QLIST_INIT; - size_t size_to_free = 0; - struct qlist_node *last; - if (likely(READ_ONCE(global_quarantine.bytes) <= - READ_ONCE(quarantine_size))) + if (likely(READ_ONCE(quarantine_size) <= + READ_ONCE(quarantine_max_size))) return; spin_lock_irqsave(&quarantine_lock, flags); @@ -214,24 +216,23 @@ void quarantine_reduce(void) * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); - new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? - 0 : new_quarantine_size - percpu_quarantines; - WRITE_ONCE(quarantine_size, new_quarantine_size); - - last = global_quarantine.head; - while (last) { - struct kmem_cache *cache = qlink_to_cache(last); - - size_to_free += cache->size; - if (!last->next || size_to_free > - global_quarantine.bytes - QUARANTINE_LOW_SIZE) - break; - last = last->next; + new_quarantine_size = (total_size < percpu_quarantines) ? + 0 : total_size - percpu_quarantines; + WRITE_ONCE(quarantine_max_size, new_quarantine_size); + /* Aim at consuming at most 1/2 of slots in quarantine. */ + WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE, + 2 * total_size / QUARANTINE_BATCHES)); + + if (likely(quarantine_size > quarantine_max_size)) { + qlist_move_all(&global_quarantine[quarantine_head], &to_free); + WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes); + quarantine_head++; + if (quarantine_head == QUARANTINE_BATCHES) + quarantine_head = 0; } - qlist_move(&global_quarantine, last, &to_free, size_to_free); spin_unlock_irqrestore(&quarantine_lock, flags); @@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg) void quarantine_remove_cache(struct kmem_cache *cache) { - unsigned long flags; + unsigned long flags, i; struct qlist_head to_free = QLIST_INIT; on_each_cpu(per_cpu_remove_cache, cache, 1); spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_cache(&global_quarantine, &to_free, cache); + for (i = 0; i < QUARANTINE_BATCHES; i++) + qlist_move_cache(&global_quarantine[i], &to_free, cache); spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 073325aedc68..b82b3e215157 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -136,6 +136,8 @@ static void kasan_end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); + if (panic_on_warn) + panic("panic_on_warn set ...\n"); kasan_enable_current(); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 87e1a7ca3846..09460955e818 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; unsigned long addr; pmd_t *pmd, _pmd; + bool deposited = false; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); + /* + * now deposit the pgtable for arch that need it + * otherwise free it. + */ + if (arch_needs_pgtable_deposit()) { + /* + * The deposit should be visibile only after + * collapse is seen by others. + */ + smp_wmb(); + pgtable_trans_huge_deposit(vma->vm_mm, pmd, + pmd_pgtable(_pmd)); + deposited = true; + } spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + if (!deposited) { + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } } } i_mmap_unlock_write(mapping); @@ -1403,6 +1420,9 @@ static void collapse_shmem(struct mm_struct *mm, spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_lookup_slot(&mapping->page_tree, index); + VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, + &mapping->tree_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1423,9 +1443,10 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(slot, + radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); + slot = radix_tree_iter_next(&iter); index++; continue; out_lru: @@ -1521,9 +1542,11 @@ tree_unlocked: if (!page || iter.index < page->index) { if (!nr_none) break; - /* Put holes back where they were */ - radix_tree_replace_slot(slot, NULL); nr_none--; + /* Put holes back where they were */ + radix_tree_delete(&mapping->page_tree, + iter.index); + slot = radix_tree_iter_next(&iter); continue; } @@ -1532,11 +1555,13 @@ tree_unlocked: /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, + slot, page); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d1380ed93fdf..da3436953022 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -19,7 +19,7 @@ * * * For more information on the algorithm and kmemleak usage, please see - * Documentation/kmemleak.txt. + * Documentation/dev-tools/kmemleak.rst. * * Notes on locking * ---------------- diff --git a/mm/madvise.c b/mm/madvise.c index 93fb63e88b5e..0e3828eae9f8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c2043509fb5..175ec51c346d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2145,6 +2145,8 @@ struct memcg_kmem_cache_create_work { struct work_struct work; }; +static struct workqueue_struct *memcg_kmem_cache_create_wq; + static void memcg_kmem_cache_create_func(struct work_struct *w) { struct memcg_kmem_cache_create_work *cw = @@ -2176,7 +2178,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - schedule_work(&cw->work); + queue_work(memcg_kmem_cache_create_wq, &cw->work); } static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, @@ -5774,6 +5776,17 @@ static int __init mem_cgroup_init(void) { int cpu, node; +#ifndef CONFIG_SLOB + /* + * Kmem cache creation is mostly done with the slab_mutex held, + * so use a special workqueue to avoid stalling all worker + * threads in case lots of cgroups are created simultaneously. + */ + memcg_kmem_cache_create_wq = + alloc_ordered_workqueue("memcg_kmem_cache_create", 0); + BUG_ON(!memcg_kmem_cache_create_wq); +#endif + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); diff --git a/mm/memory.c b/mm/memory.c index 33f45edf8272..32e9b7aec366 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); - - if (!tlb->page_size) - tlb->page_size = page_size; - else { - if (page_size != tlb->page_size) - return true; - } + VM_WARN_ON(tlb->page_size != page_size); batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. + */ + batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; @@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ } VM_BUG_ON_PAGE(batch->nr > batch->max, page); - batch->pages[batch->nr++] = page; return false; } @@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb, end -= PMD_SIZE; if (addr > end - 1) return; - + /* + * We add page table cache pages with PAGE_SIZE, + * (see pte_free_tlb()), flush the tlb if we need + */ + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; - struct page *pending_page = NULL; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1172,7 +1174,6 @@ again: print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; - pending_page = page; addr += PAGE_SIZE; break; } @@ -1213,11 +1214,6 @@ again: if (force_flush) { force_flush = 0; tlb_flush_mmu_free(tlb); - if (pending_page) { - /* remove the page with new size */ - __tlb_remove_pte_page(tlb, pending_page); - pending_page = NULL; - } if (addr != end) goto again; } @@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -2939,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } +static void deposit_prealloc_pte(struct fault_env *fe) +{ + struct vm_area_struct *vma = fe->vma; + + pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + /* + * We are going to consume the prealloc table, + * count that as nr_ptes. + */ + atomic_long_inc(&vma->vm_mm->nr_ptes); + fe->prealloc_pte = 0; +} + static int do_set_pmd(struct fault_env *fe, struct page *page) { struct vm_area_struct *vma = fe->vma; @@ -2953,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = VM_FAULT_FALLBACK; page = compound_head(page); + /* + * Archs like ppc64 need additonal space to store information + * related to pte entry. Use the preallocated table for that. + */ + if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { + fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); + if (!fe->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); if (unlikely(!pmd_none(*fe->pmd))) goto out; @@ -2966,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); page_add_file_rmap(page, true); + /* + * deposit and withdraw with pmd lock held + */ + if (arch_needs_pgtable_deposit()) + deposit_prealloc_pte(fe); set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); @@ -2975,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = 0; count_vm_event(THP_FILE_MAPPED); out: + /* + * If we are going to fallback to pte mapping, do a + * withdraw with pmd lock held. + */ + if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) + fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + fe->pmd); spin_unlock(fe->ptl); return ret; } @@ -3014,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, ret = do_set_pmd(fe, page); if (ret != VM_FAULT_FALLBACK) - return ret; + goto fault_handled; } if (!fe->pte) { ret = pte_alloc_one_map(fe); if (ret) - return ret; + goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) - return VM_FAULT_NOPAGE; + if (unlikely(!pte_none(*fe->pte))) { + ret = VM_FAULT_NOPAGE; + goto fault_handled; + } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3045,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, fe->address, fe->pte); + ret = 0; - return 0; +fault_handled: + /* preallocated pagetable is unused: free it */ + if (fe->prealloc_pte) { + pte_free(fe->vma->vm_mm, fe->prealloc_pte); + fe->prealloc_pte = 0; + } + return ret; } static unsigned long fault_around_bytes __read_mostly = @@ -3145,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); - /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; - } /* Huge page is mapped? Page fault is solved */ if (pmd_trans_huge(*fe->pmd)) { ret = VM_FAULT_NOPAGE; @@ -3454,7 +3490,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - split_huge_pmd(fe->vma, fe->pmd, fe->address); + __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); return VM_FAULT_FALLBACK; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cad4b9125695..e43142c15631 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) static int __init cmdline_parse_movable_node(char *p) { #ifdef CONFIG_MOVABLE_NODE - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - memblock_set_bottom_up(true); movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0b859af06b87..6d3639e1f254 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } } else if (mode == MPOL_LOCAL) { - if (!nodes_empty(*nodes)) + if (!nodes_empty(*nodes) || + (flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) @@ -496,7 +498,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else { get_page(page); spin_unlock(ptl); @@ -1679,25 +1681,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, int nd) { - switch (policy->mode) { - case MPOL_PREFERRED: - if (!(policy->flags & MPOL_F_LOCAL)) - nd = policy->v.preferred_node; - break; - case MPOL_BIND: + if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; + else { /* - * Normally, MPOL_BIND allocations are node-local within the - * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node isn't part of the mask, we use the zonelist for - * the first node in the mask instead. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - if (unlikely(gfp & __GFP_THISNODE) && - unlikely(!node_isset(nd, policy->v.nodes))) - nd = first_node(policy->v.nodes); - break; - default: - BUG(); + WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + return node_zonelist(nd, gfp); } diff --git a/mm/migrate.c b/mm/migrate.c index 99250aee1ac1..0ed24b1fa77b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); /* * We isolated non-lru movable page so here we can use * __PageMovable because LRU page's mapping cannot have @@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l) put_page(page); } else { putback_lru_page(page); + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } } } @@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); @@ -1121,8 +1121,15 @@ out: * restored. */ list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + + /* + * Compaction can migrate also non-LRU pages which are + * not accounted to NR_ISOLATED_*. They can be recognized + * as __PageMovable + */ + if (likely(!__PageMovable(page))) + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } /* diff --git a/mm/mprotect.c b/mm/mprotect.c index 11936526b08b..cc2459c57f60 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; + int target_node = NUMA_NO_NODE; pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); if (!pte) return 0; + /* Get target node for single threaded private VMAs */ + if (prot_numa && !(vma->vm_flags & VM_SHARED) && + atomic_read(&vma->vm_mm->mm_users) == 1) + target_node = numa_node_id(); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. + */ + if (target_node == page_to_nid(page)) + continue; } ptent = ptep_modify_prot_start(mm, addr, pte); @@ -163,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); if (pmd_trans_unstable(pmd)) continue; } else { @@ -484,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return do_mprotect_pkey(start, len, prot, -1); } +#ifdef CONFIG_ARCH_HAS_PKEYS + SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, unsigned long, prot, int, pkey) { @@ -534,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) */ return ret; } + +#endif /* CONFIG_ARCH_HAS_PKEYS */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3dcc54da5637..f64e7bcb43b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2058,8 +2058,12 @@ out_unlock: * potentially hurts the reliability of high-order allocations when under * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. + * + * If @force is true, try to unreserve a pageblock even though highatomic + * pageblock is exhausted. */ -static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + bool force) { struct zonelist *zonelist = ac->zonelist; unsigned long flags; @@ -2067,11 +2071,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) struct zone *zone; struct page *page; int order; + bool ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, ac->nodemask) { - /* Preserve at least one pageblock */ - if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + /* + * Preserve at least one pageblock unless memory pressure + * is really high. + */ + if (!force && zone->nr_reserved_highatomic <= + pageblock_nr_pages) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2085,13 +2094,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) continue; /* - * It should never happen but changes to locking could - * inadvertently allow a per-cpu drain to add pages - * to MIGRATE_HIGHATOMIC while unreserving so be safe - * and watch for underflows. + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock + * in this loop althoug we changed the pageblock type + * from highatomic to ac->migratetype. So we should + * adjust the count once. */ - zone->nr_reserved_highatomic -= min(pageblock_nr_pages, - zone->nr_reserved_highatomic); + if (get_pageblock_migratetype(page) == + MIGRATE_HIGHATOMIC) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + zone->nr_reserved_highatomic -= min( + pageblock_nr_pages, + zone->nr_reserved_highatomic); + } /* * Convert to ac->migratetype and avoid the normal @@ -2103,12 +2124,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - move_freepages_block(zone, page, ac->migratetype); - spin_unlock_irqrestore(&zone->lock, flags); - return; + ret = move_freepages_block(zone, page, ac->migratetype); + if (ret) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } } spin_unlock_irqrestore(&zone->lock, flags); } + + return false; } /* Remove an element from the buddy allocator from the fallback list */ @@ -2133,7 +2158,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal) + if (can_steal && + get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2192,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, bool cold) { - int i; + int i, alloced = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2217,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; + alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } + + /* + * i pages were removed from the buddy list even if some leak due + * to check_pcp_refill failing so adjust NR_FREE_PAGES based + * on i. Do not confuse with 'alloced' which is the number of + * pages added to the pcp list. + */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return i; + return alloced; } #ifdef CONFIG_NUMA @@ -2534,7 +2568,8 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) + && mt != MIGRATE_HIGHATOMIC) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -3305,7 +3340,7 @@ retry: * Shrink them them and try again */ if (!page && !drained) { - unreserve_highatomic_pageblock(ac); + unreserve_highatomic_pageblock(ac, false); drain_all_pages(NULL); drained = true; goto retry; @@ -3422,8 +3457,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (*no_progress_loops > MAX_RECLAIM_RETRIES) - return false; + if (*no_progress_loops > MAX_RECLAIM_RETRIES) { + /* Before OOM, exhaust highatomic_reserve */ + return unreserve_highatomic_pageblock(ac, true); + } /* * Keep reclaiming pages while there is a chance this will lead diff --git a/mm/percpu.c b/mm/percpu.c index 255714302394..f696385bcc44 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2093,6 +2093,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, size_t pages_size; struct page **pages; int unit, i, j, rc; + int upa; + int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -2100,7 +2102,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size, if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + upa = ai->alloc_size/ai->unit_size; + nr_g0_units = roundup(num_possible_cpus(), upa); + if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) { + pcpu_free_alloc_info(ai); + return -EINVAL; + } unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -2111,21 +2118,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* allocate pages */ j = 0; - for (unit = 0; unit < num_possible_cpus(); unit++) + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { - unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", - psize_str, cpu); + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_free(ptr); pages[j++] = virt_to_page(ptr); } + } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; diff --git a/mm/readahead.c b/mm/readahead.c index c8a955b1297e..c4ca70239233 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -207,12 +207,21 @@ out: * memory at once. */ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long max_pages; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + nr_to_read = min(nr_to_read, max_pages); while (nr_to_read) { int err; @@ -369,10 +378,18 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = ra->ra_pages; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + unsigned long max_pages = ra->ra_pages; pgoff_t prev_offset; /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + if (req_size > max_pages && bdi->io_pages > max_pages) + max_pages = min(req_size, bdi->io_pages); + + /* * start of file */ if (!offset) @@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping, if ((offset == (ra->start + ra->size - ra->async_size) || offset == (ra->start + ra->size))) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max); + start = page_cache_next_hole(mapping, offset + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max) + if (!start || start - offset > max_pages) return 0; ra->start = start; ra->size = start - offset; /* old async_size */ ra->size += req_size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping, /* * oversize read */ - if (req_size > max) + if (req_size > max_pages) goto initial_readahead; /* @@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping, * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max)) + if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) goto readit; /* @@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping, initial_readahead: ra->start = offset; - ra->size = get_init_ra_size(req_size, max); + ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: @@ -454,7 +471,7 @@ readit: * the resulted next readahead window into the current one. */ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max); + ra->async_size = get_next_ra_size(ra, max_pages); ra->size += ra->async_size; } diff --git a/mm/rmap.c b/mm/rmap.c index 1ef36404e7b2..91619fd70939 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, } /** - * anon_vma_prepare - attach an anon_vma to a memory region + * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * - * The common case will be that we already have one, but if + * The common case will be that we already have one, which + * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we @@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * * This must be called with the mmap_sem held for reading. */ -int anon_vma_prepare(struct vm_area_struct *vma) +int __anon_vma_prepare(struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; might_sleep(); - if (unlikely(!anon_vma)) { - struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated; - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto out_enomem; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto out_enomem; + + anon_vma = find_mergeable_anon_vma(vma); + allocated = NULL; + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (unlikely(!anon_vma)) + goto out_enomem_free_avc; + allocated = anon_vma; + } - anon_vma = find_mergeable_anon_vma(vma); + anon_vma_lock_write(anon_vma); + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { + vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + /* vma reference or self-parent link for new root */ + anon_vma->degree++; allocated = NULL; - if (!anon_vma) { - anon_vma = anon_vma_alloc(); - if (unlikely(!anon_vma)) - goto out_enomem_free_avc; - allocated = anon_vma; - } + avc = NULL; + } + spin_unlock(&mm->page_table_lock); + anon_vma_unlock_write(anon_vma); - anon_vma_lock_write(anon_vma); - /* page_table_lock to protect against threads */ - spin_lock(&mm->page_table_lock); - if (likely(!vma->anon_vma)) { - vma->anon_vma = anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; - allocated = NULL; - avc = NULL; - } - spin_unlock(&mm->page_table_lock); - anon_vma_unlock_write(anon_vma); + if (unlikely(allocated)) + put_anon_vma(allocated); + if (unlikely(avc)) + anon_vma_chain_free(avc); - if (unlikely(allocated)) - put_anon_vma(allocated); - if (unlikely(avc)) - anon_vma_chain_free(avc); - } return 0; out_enomem_free_avc: diff --git a/mm/shmem.c b/mm/shmem.c index 9d32e1cb9f38..abd7403aba41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -300,18 +300,19 @@ void shmem_uncharge(struct inode *inode, long pages) static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { + struct radix_tree_node *node; void **pslot; void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (!pslot) + item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + if (!item) return -ENOENT; - item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - radix_tree_replace_slot(pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, + replacement, NULL, NULL); return 0; } @@ -370,6 +371,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, int shmem_huge __read_mostly; +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) { if (!strcmp(str, "never")) @@ -407,6 +409,7 @@ static const char *shmem_format_huge(int huge) return "bad_val"; } } +#endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) @@ -1539,7 +1542,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, int *fault_type) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info; + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct mem_cgroup *memcg; @@ -1589,7 +1592,6 @@ repeat: * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); charge_mm = fault_mm ? : current->mm; @@ -1837,7 +1839,6 @@ unlock: put_page(page); } if (error == -ENOSPC && !once++) { - info = SHMEM_I(inode); spin_lock_irq(&info->lock); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); diff --git a/mm/slab.c b/mm/slab.c index 0b0550ca85b4..87b29e76cafd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); INIT_LIST_HEAD(&parent->slabs_free); + parent->total_slabs = 0; + parent->free_slabs = 0; parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; - parent->num_slabs = 0; } #define MAKE_LIST(cachep, listp, slab, nodeid) \ @@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { #if DEBUG struct kmem_cache_node *n; - struct page *page; unsigned long flags; int node; static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -1381,32 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { - unsigned long active_objs = 0, num_objs = 0, free_objects = 0; - unsigned long active_slabs = 0, num_slabs = 0; - unsigned long num_slabs_partial = 0, num_slabs_free = 0; - unsigned long num_slabs_full; + unsigned long total_slabs, free_slabs, free_objs; spin_lock_irqsave(&n->list_lock, flags); - num_slabs = n->num_slabs; - list_for_each_entry(page, &n->slabs_partial, lru) { - active_objs += page->active; - num_slabs_partial++; - } - list_for_each_entry(page, &n->slabs_free, lru) - num_slabs_free++; - - free_objects += n->free_objects; + total_slabs = n->total_slabs; + free_slabs = n->free_slabs; + free_objs = n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - num_objs = num_slabs * cachep->num; - active_slabs = num_slabs - num_slabs_free; - num_slabs_full = num_slabs - - (num_slabs_partial + num_slabs_free); - active_objs += (num_slabs_full * cachep->num); - - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", - node, active_slabs, num_slabs, active_objs, num_objs, - free_objects); + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", + node, total_slabs - free_slabs, total_slabs, + (total_slabs * cachep->num) - free_objs, + total_slabs * cachep->num); } #endif } @@ -2318,7 +2304,8 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); - n->num_slabs--; + n->free_slabs--; + n->total_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2332,7 +2319,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; int node; @@ -2352,7 +2339,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) int __kmem_cache_shutdown(struct kmem_cache *cachep) { - return __kmem_cache_shrink(cachep, false); + return __kmem_cache_shrink(cachep); } void __kmem_cache_release(struct kmem_cache *cachep) @@ -2753,12 +2740,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); - if (!page->active) + n->total_slabs++; + if (!page->active) { list_add_tail(&page->lru, &(n->slabs_free)); - else + n->free_slabs++; + } else fixup_slab_list(cachep, n, page, &list); - n->num_slabs++; STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; spin_unlock(&n->list_lock); @@ -2903,9 +2891,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, /* Move pfmemalloc slab to the end of list to speed up next search */ list_del(&page->lru); - if (!page->active) + if (!page->active) { list_add_tail(&page->lru, &n->slabs_free); - else + n->free_slabs++; + } else list_add_tail(&page->lru, &n->slabs_partial); list_for_each_entry(page, &n->slabs_partial, lru) { @@ -2913,9 +2902,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, return page; } + n->free_touched = 1; list_for_each_entry(page, &n->slabs_free, lru) { - if (!PageSlabPfmemalloc(page)) + if (!PageSlabPfmemalloc(page)) { + n->free_slabs--; return page; + } } return NULL; @@ -2925,16 +2917,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; - page = list_first_entry_or_null(&n->slabs_partial, - struct page, lru); + assert_spin_locked(&n->list_lock); + page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); if (!page) { n->free_touched = 1; - page = list_first_entry_or_null(&n->slabs_free, - struct page, lru); + page = list_first_entry_or_null(&n->slabs_free, struct page, + lru); + if (page) + n->free_slabs--; } if (sk_memalloc_socks()) - return get_valid_first_slab(n, page, pfmemalloc); + page = get_valid_first_slab(n, page, pfmemalloc); return page; } @@ -3434,9 +3428,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp, STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ - if (page->active == 0) + if (page->active == 0) { list_add(&page->lru, &n->slabs_free); - else { + n->free_slabs++; + } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. @@ -3450,7 +3445,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); - n->num_slabs--; + n->free_slabs--; + n->total_slabs--; } } @@ -4102,64 +4098,33 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct page *page; - unsigned long active_objs; - unsigned long num_objs; - unsigned long active_slabs = 0; - unsigned long num_slabs, free_objects = 0, shared_avail = 0; - unsigned long num_slabs_partial = 0, num_slabs_free = 0; - unsigned long num_slabs_full = 0; - const char *name; - char *error = NULL; + unsigned long active_objs, num_objs, active_slabs; + unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; + unsigned long free_slabs = 0; int node; struct kmem_cache_node *n; - active_objs = 0; - num_slabs = 0; for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); spin_lock_irq(&n->list_lock); - num_slabs += n->num_slabs; + total_slabs += n->total_slabs; + free_slabs += n->free_slabs; + free_objs += n->free_objects; - list_for_each_entry(page, &n->slabs_partial, lru) { - if (page->active == cachep->num && !error) - error = "slabs_partial accounting error"; - if (!page->active && !error) - error = "slabs_partial accounting error"; - active_objs += page->active; - num_slabs_partial++; - } - - list_for_each_entry(page, &n->slabs_free, lru) { - if (page->active && !error) - error = "slabs_free accounting error"; - num_slabs_free++; - } - - free_objects += n->free_objects; if (n->shared) shared_avail += n->shared->avail; spin_unlock_irq(&n->list_lock); } - num_objs = num_slabs * cachep->num; - active_slabs = num_slabs - num_slabs_free; - num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free); - active_objs += (num_slabs_full * cachep->num); - - if (num_objs - active_objs != free_objects && !error) - error = "free_objects accounting error"; - - name = cachep->name; - if (error) - pr_err("slab: cache %s error: %s\n", name, error); + num_objs = total_slabs * cachep->num; + active_slabs = total_slabs - free_slabs; + active_objs = num_objs - free_objs; sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; sinfo->active_slabs = active_slabs; - sinfo->num_slabs = num_slabs; + sinfo->num_slabs = total_slabs; sinfo->shared_avail = shared_avail; sinfo->limit = cachep->limit; sinfo->batchcount = cachep->batchcount; diff --git a/mm/slab.h b/mm/slab.h index bc05fdc3edce..de6579dc362c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -142,11 +142,26 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_CACHE_FLAGS (0) #endif +/* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_NOTRACK | \ + SLAB_ACCOUNT) + int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *, bool); +int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; @@ -432,7 +447,8 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; - unsigned long num_slabs; + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 329b03843863..ae323841adb1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } + /* Refuse requests with allocator specific flags */ + if (flags & ~SLAB_FLAGS_PERMITTED) { + err = -EINVAL; + goto out_unlock; + } + /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this @@ -573,6 +579,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_cpus(); get_online_mems(); +#ifdef CONFIG_SLUB + /* + * In case of SLUB, we need to disable empty slab caching to + * avoid pinning the offline memory cgroup by freeable kmem + * pages charged to it. SLAB doesn't need this, as it + * periodically purges unused slabs. + */ + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL; + if (c) { + c->cpu_partial = 0; + c->min_partial = 0; + } + } + mutex_unlock(&slab_mutex); + /* + * kmem_cache->cpu_partial is checked locklessly (see + * put_cpu_partial()). Make sure the change is visible. + */ + synchronize_sched(); +#endif + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { if (!is_root_cache(s)) @@ -584,7 +613,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) if (!c) continue; - __kmem_cache_shrink(c, true); + __kmem_cache_shrink(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -755,7 +784,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep, false); + ret = __kmem_cache_shrink(cachep); put_online_mems(); put_online_cpus(); return ret; diff --git a/mm/slob.c b/mm/slob.c index 5ec158054ffe..eac04d4357ec 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c) { } -int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 2b3e740609e9..067598a00849 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3076,7 +3076,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) struct detached_freelist df; size = build_detached_freelist(s, size, p, &df); - if (unlikely(!df.page)) + if (!df.page) continue; slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); @@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3895,21 +3895,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) unsigned long flags; int ret = 0; - if (deactivate) { - /* - * Disable empty slabs caching. Used to avoid pinning offline - * memory cgroups by kmem pages that can be freed. - */ - s->cpu_partial = 0; - s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), - * so we have to make sure the change is visible. - */ - synchronize_sched(); - } - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); @@ -3966,7 +3951,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s, false); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; diff --git a/mm/swapfile.c b/mm/swapfile.c index f30438970cd1..1c6e0321205d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1234,6 +1234,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { + cond_resched(); next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; @@ -1313,6 +1314,7 @@ static int unuse_mm(struct mm_struct *mm, for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) break; + cond_resched(); } up_read(&mm->mmap_sem); return (ret < 0)? ret: 0; @@ -1350,15 +1352,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } - if (frontswap) { - if (frontswap_test(si, i)) - break; - else - continue; - } count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) - break; + if (!frontswap || frontswap_test(si, i)) + break; + if ((i % LATENCY_LIMIT) == 0) + cond_resched(); } return i; } diff --git a/mm/truncate.c b/mm/truncate.c index 8d8c62d89e6d..fd97f1dbce29 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(slot, NULL); + __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + workingset_update_node, mapping); mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f2481cb4e6b2..a5584384eabc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -365,7 +365,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + might_sleep(); va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); @@ -601,6 +601,13 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* + * Serialize vmap purging. There is no actual criticial section protected + * by this look, but we want to avoid concurrent calls for performance + * reasons and to make the pcpu_get_vm_areas more deterministic. + */ +static DEFINE_MUTEX(vmap_purge_lock); + /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -615,59 +622,40 @@ void set_iounmap_nonlazy(void) /* * Purges all lazily-freed vmap areas. - * - * If sync is 0 then don't purge if there is already a purge in progress. - * If force_flush is 1, then flush kernel TLBs between *start and *end even - * if we found no lazy vmap areas to unmap (callers can use this to optimise - * their own TLB flushing). - * Returns with *start = min(*start, lowest purged address) - * *end = max(*end, highest purged address) */ -static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, - int sync, int force_flush) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { - static DEFINE_SPINLOCK(purge_lock); struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - int nr = 0; + bool do_free = false; - /* - * If sync is 0 but force_flush is 1, we'll go sync anyway but callers - * should not expect such behaviour. This just simplifies locking for - * the case that isn't actually used at the moment anyway. - */ - if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) - return; - } else - spin_lock(&purge_lock); - - if (sync) - purge_fragmented_blocks_allcpus(); + lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); llist_for_each_entry(va, valist, purge_list) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + if (va->va_start < start) + start = va->va_start; + if (va->va_end > end) + end = va->va_end; + do_free = true; } - if (nr) - atomic_sub(nr, &vmap_lazy_nr); + if (!do_free) + return false; - if (nr || force_flush) - flush_tlb_kernel_range(*start, *end); + flush_tlb_kernel_range(start, end); - if (nr) { - spin_lock(&vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_lock(&vmap_area_lock); + llist_for_each_entry_safe(va, n_va, valist, purge_list) { + int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + + __free_vmap_area(va); + atomic_sub(nr, &vmap_lazy_nr); + cond_resched_lock(&vmap_area_lock); } - spin_unlock(&purge_lock); + spin_unlock(&vmap_area_lock); + return true; } /* @@ -676,9 +664,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, */ static void try_purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 0, 0); + if (mutex_trylock(&vmap_purge_lock)) { + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + } } /* @@ -686,9 +675,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 1, 0); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); } /* @@ -711,22 +701,13 @@ static void free_vmap_area_noflush(struct vmap_area *va) } /* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. - */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) -{ - unmap_vmap_area(va); - free_vmap_area_noflush(va); -} - -/* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - free_unmap_vmap_area_noflush(va); + unmap_vmap_area(va); + free_vmap_area_noflush(va); } static struct vmap_area *find_vmap_area(unsigned long addr) @@ -740,16 +721,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) return va; } -static void free_unmap_vmap_area_addr(unsigned long addr) -{ - struct vmap_area *va; - - va = find_vmap_area(addr); - BUG_ON(!va); - free_unmap_vmap_area(va); -} - - /*** Per cpu kva allocator ***/ /* @@ -1070,6 +1041,8 @@ void vm_unmap_aliases(void) if (unlikely(!vmap_initialized)) return; + might_sleep(); + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -1094,7 +1067,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - __purge_vmap_area_lazy(&start, &end, 1, flush); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + if (!__purge_vmap_area_lazy(start, end) && flush) + flush_tlb_kernel_range(start, end); + mutex_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); @@ -1107,7 +1084,9 @@ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)mem; + struct vmap_area *va; + might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); @@ -1116,10 +1095,14 @@ void vm_unmap_ram(const void *mem, unsigned int count) debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) + if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); - else - free_unmap_vmap_area_addr(addr); + return; + } + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1455,6 +1438,8 @@ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; + might_sleep(); + va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; @@ -1510,7 +1495,39 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - + +static inline void __vfree_deferred(const void *addr) +{ + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * nother cpu's list. schedule_work() should be fine with this too. + */ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} + +/** + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address + * + * This one is just like vfree() but can be called in any atomic context + * except NMIs. + */ +void vfree_atomic(const void *addr) +{ + BUG_ON(in_nmi()); + + kmemleak_free(addr); + + if (!addr) + return; + __vfree_deferred(addr); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1533,11 +1550,9 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) { - struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); - } else + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -2574,32 +2589,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_area_lock) { - loff_t n = *pos; - struct vmap_area *va; - spin_lock(&vmap_area_lock); - va = list_first_entry(&vmap_area_list, typeof(*va), list); - while (n > 0 && &va->list != &vmap_area_list) { - n--; - va = list_next_entry(va, list); - } - if (!n && &va->list != &vmap_area_list) - return va; - - return NULL; - + return seq_list_start(&vmap_area_list, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct vmap_area *va = p, *next; - - ++*pos; - next = list_next_entry(va, list); - if (&next->list != &vmap_area_list) - return next; - - return NULL; + return seq_list_next(p, &vmap_area_list, pos); } static void s_stop(struct seq_file *m, void *p) @@ -2634,9 +2630,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static int s_show(struct seq_file *m, void *p) { - struct vmap_area *va = p; + struct vmap_area *va; struct vm_struct *v; + va = list_entry(p, struct vmap_area, list); + /* * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c8f28a6d89f..6aa5b01d3e75 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -291,6 +291,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0) @@ -312,7 +313,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; - } + next_deferred = nr; + } else + next_deferred = total_scan; /* * We need to avoid excessive windup on filesystem shrinkers @@ -369,17 +372,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; + scanned += nr_to_scan; cond_resched(); } + if (next_deferred >= scanned) + next_deferred -= scanned; + else + next_deferred = 0; /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, + if (next_deferred > 0) + new_nr = atomic_long_add_return(next_deferred, &shrinker->nr_deferred[nid]); else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); diff --git a/mm/workingset.c b/mm/workingset.c index fb1f9183d89a..241fa5d6b3b2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> @@ -334,48 +335,81 @@ out: * point where they would still be useful. */ -struct list_lru workingset_shadow_nodes; +static struct list_lru shadow_nodes; + +void workingset_update_node(struct radix_tree_node *node, void *private) +{ + struct address_space *mapping = private; + + /* Only regular page cache has shadow entries */ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. + * + * Avoid acquiring the list_lru lock when the nodes are + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by &mapping->tree_lock. + */ + if (node->count && node->count == node->exceptional) { + if (list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&shadow_nodes, &node->private_list); + } + } else { + if (!list_empty(&node->private_list)) + list_lru_del(&shadow_nodes, &node->private_list); + } +} static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long shadow_nodes; unsigned long max_nodes; - unsigned long pages; + unsigned long nodes; + unsigned long cache; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); + nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); - if (sc->memcg) { - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - /* - * Active cache pages are limited to 50% of memory, and shadow - * entries that represent a refault distance bigger than that - * do not have any effect. Limit the number of shadow nodes - * such that shadow entries do not exceed the number of active - * cache pages, assuming a worst-case node population density - * of 1/8th on average. + * Approximate a reasonable limit for the radix tree nodes + * containing shadow entries. We don't need to keep more + * shadow entries than possible pages on the active list, + * since refault distances bigger than that are dismissed. + * + * The size of the active list converges toward 100% of + * overall page cache as memory grows, with only a tiny + * inactive list. Assume the total cache size for that. + * + * Nodes might be sparsely populated, with only one shadow + * entry in the extreme case. Obviously, we cannot keep one + * node for every eligible shadow entry, so compromise on a + * worst-case density of 1/8th. Below that, not all eligible + * refaults can be detected anymore. * * On 64-bit with 7 radix_tree_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume - * ~2% of available memory: + * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE */ - max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + if (sc->memcg) { + cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + } else { + cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); + } + max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); - if (shadow_nodes <= max_nodes) + if (nodes <= max_nodes) return 0; - - return shadow_nodes - max_nodes; + return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, @@ -418,23 +452,30 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - BUG_ON(!workingset_node_shadows(node)); - BUG_ON(workingset_node_pages(node)); - + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; + if (WARN_ON_ONCE(node->count != node->exceptional)) + goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { - BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) + goto out_invalid; + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; + if (WARN_ON_ONCE(!mapping->nrexceptional)) + goto out_invalid; node->slots[i] = NULL; - workingset_node_shadows_dec(node); - BUG_ON(!mapping->nrexceptional); + node->exceptional--; + node->count--; mapping->nrexceptional--; } } - BUG_ON(workingset_node_shadows(node)); + if (WARN_ON_ONCE(node->exceptional)) + goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - if (!__radix_tree_delete_node(&mapping->page_tree, node)) - BUG(); + __radix_tree_delete_node(&mapping->page_tree, node); +out_invalid: spin_unlock(&mapping->tree_lock); ret = LRU_REMOVED_RETRY; out: @@ -452,8 +493,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, - shadow_lru_isolate, NULL); + ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); return ret; } @@ -492,7 +532,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); @@ -500,7 +540,7 @@ static int __init workingset_init(void) goto err_list_lru; return 0; err_list_lru: - list_lru_destroy(&workingset_shadow_nodes); + list_lru_destroy(&shadow_nodes); err: return ret; } diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 37323b0df374..9576775a86f6 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -28,4 +28,6 @@ else CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL) endif endif + +CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope) endif diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index d9ff038c1b28..a27677146410 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -16,19 +16,22 @@ if len(sys.argv) != 3: sys.stderr.write("usage: %s file1 file2\n" % sys.argv[0]) sys.exit(-1) +re_NUMBER = re.compile(r'\.[0-9]+') + def getsizes(file): sym = {} - for l in os.popen("nm --size-sort " + file).readlines(): - size, type, name = l[:-1].split() - if type in "tTdDbBrR": - # strip generated symbols - if name.startswith("__mod_"): continue - if name.startswith("SyS_"): continue - if name.startswith("compat_SyS_"): continue - if name == "linux_banner": continue - # statics and some other optimizations adds random .NUMBER - name = re.sub(r'\.[0-9]+', '', name) - sym[name] = sym.get(name, 0) + int(size, 16) + with os.popen("nm --size-sort " + file) as f: + for line in f: + size, type, name = line.split() + if type in "tTdDbBrR": + # strip generated symbols + if name.startswith("__mod_"): continue + if name.startswith("SyS_"): continue + if name.startswith("compat_SyS_"): continue + if name == "linux_banner": continue + # statics and some other optimizations adds random .NUMBER + name = re_NUMBER.sub('', name) + sym[name] = sym.get(name, 0) + int(size, 16) return sym old = getsizes(sys.argv[1]) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 23f462f64a3f..ac5656ef2aec 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -761,7 +761,7 @@ sub seed_camelcase_file { sub is_maintained_obsolete { my ($filename) = @_; - return 0 if (!(-e "$root/scripts/get_maintainer.pl")); + return 0 if (!$tree || !(-e "$root/scripts/get_maintainer.pl")); my $status = `perl $root/scripts/get_maintainer.pl --status --nom --nol --nogit --nogit-fallback -f $filename 2>&1`; @@ -2589,6 +2589,7 @@ sub process { $line =~ /^rename (?:from|to) [\w\/\.\-]+\s*$/ || ($line =~ /\{\s*([\w\/\.\-]*)\s*\=\>\s*([\w\/\.\-]*)\s*\}/ && (defined($1) || defined($2))))) { + $is_patch = 1; $reported_maintainer_file = 1; WARN("FILE_PATH_CHANGES", "added, moved or deleted file(s), does MAINTAINERS need updating?\n" . $herecurr); @@ -2601,20 +2602,6 @@ sub process { $herecurr) if (!$emitted_corrupt++); } -# Check for absolute kernel paths. - if ($tree) { - while ($line =~ m{(?:^|\s)(/\S*)}g) { - my $file = $1; - - if ($file =~ m{^(.*?)(?::\d+)+:?$} && - check_absolute_file($1, $herecurr)) { - # - } else { - check_absolute_file($file, $herecurr); - } - } - } - # UTF-8 regex found at http://www.w3.org/International/questions/qa-forms-utf-8.en.php if (($realfile =~ /^$/ || $line =~ /^\+/) && $rawline !~ m/^$UTF8*$/) { @@ -2652,6 +2639,20 @@ sub process { "8-bit UTF-8 used in possible commit log\n" . $herecurr); } +# Check for absolute kernel paths in commit message + if ($tree && $in_commit_log) { + while ($line =~ m{(?:^|\s)(/\S*)}g) { + my $file = $1; + + if ($file =~ m{^(.*?)(?::\d+)+:?$} && + check_absolute_file($1, $herecurr)) { + # + } else { + check_absolute_file($file, $herecurr); + } + } + } + # Check for various typo / spelling mistakes if (defined($misspellings) && ($in_commit_log || $line =~ /^(?:\+|Subject:)/i)) { @@ -2805,7 +2806,7 @@ sub process { } # check we are in a valid source file if not then ignore this hunk - next if ($realfile !~ /\.(h|c|s|S|pl|sh|dtsi|dts)$/); + next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/); # line length limit (with some exclusions) # @@ -3440,6 +3441,18 @@ sub process { #ignore lines not being added next if ($line =~ /^[^\+]/); +# check for dereferences that span multiple lines + if ($prevline =~ /^\+.*$Lval\s*(?:\.|->)\s*$/ && + $line =~ /^\+\s*(?!\#\s*(?!define\s+|if))\s*$Lval/) { + $prevline =~ /($Lval\s*(?:\.|->))\s*$/; + my $ref = $1; + $line =~ /^.\s*($Lval)/; + $ref .= $1; + $ref =~ s/\s//g; + WARN("MULTILINE_DEREFERENCE", + "Avoid multiple line dereference - prefer '$ref'\n" . $hereprev); + } + # check for declarations of signed or unsigned without int while ($line =~ m{\b($Declare)\s*(?!char\b|short\b|int\b|long\b)\s*($Ident)?\s*[=,;\[\)\(]}g) { my $type = $1; @@ -5548,8 +5561,9 @@ sub process { "Using weak declarations can have unintended link defects\n" . $herecurr); } -# check for c99 types like uint8_t used outside of uapi/ +# check for c99 types like uint8_t used outside of uapi/ and tools/ if ($realfile !~ m@\binclude/uapi/@ && + $realfile !~ m@\btools/@ && $line =~ /\b($Declare)\s*$Ident\s*[=;,\[]/) { my $type = $1; if ($type =~ /\b($typeC99Typedefs)\b/) { @@ -5925,7 +5939,7 @@ sub process { } if (!$has_break && $has_statement) { WARN("MISSING_BREAK", - "Possible switch case/default not preceeded by break or fallthrough comment\n" . $herecurr); + "Possible switch case/default not preceded by break or fallthrough comment\n" . $herecurr); } } diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index aed4511f0304..633f2dd3de27 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -49,6 +49,7 @@ my $scm = 0; my $web = 0; my $subsystem = 0; my $status = 0; +my $letters = ""; my $keywords = 1; my $sections = 0; my $file_emails = 0; @@ -241,6 +242,7 @@ if (!GetOptions( 'status!' => \$status, 'scm!' => \$scm, 'web!' => \$web, + 'letters=s' => \$letters, 'pattern-depth=i' => \$pattern_depth, 'k|keywords!' => \$keywords, 'sections!' => \$sections, @@ -271,7 +273,8 @@ $output_multiline = 0 if ($output_separator ne ", "); $output_rolestats = 1 if ($interactive); $output_roles = 1 if ($output_rolestats); -if ($sections) { +if ($sections || $letters ne "") { + $sections = 1; $email = 0; $email_list = 0; $scm = 0; @@ -682,8 +685,10 @@ sub get_maintainers { $line =~ s/\\\./\./g; ##Convert \. to . $line =~ s/\.\*/\*/g; ##Convert .* to * } - $line =~ s/^([A-Z]):/$1:\t/g; - print("$line\n"); + my $count = $line =~ s/^([A-Z]):/$1:\t/g; + if ($letters eq "" || (!$count || $letters =~ /$1/i)) { + print("$line\n"); + } } print("\n"); } @@ -814,6 +819,7 @@ Other options: --pattern-depth => Number of pattern directory traversals (default: 0 (all)) --keywords => scan patch for keywords (default: $keywords) --sections => print all of the subsystem sections with pattern matches + --letters => print all matching 'letter' types from all matching sections --mailmap => use .mailmap file (default: $email_use_mailmap) --version => show version --help => show this help information diff --git a/scripts/tags.sh b/scripts/tags.sh index a2ff3388e5ea..df5fa777d300 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -304,11 +304,26 @@ if [ "${ARCH}" = "um" ]; then elif [ "${SRCARCH}" = "arm" -a "${SUBARCH}" != "" ]; then subarchdir=$(find ${tree}arch/$SRCARCH/ -name "mach-*" -type d -o \ -name "plat-*" -type d); + mach_suffix=$SUBARCH + plat_suffix=$SUBARCH + + # Special cases when $plat_suffix != $mach_suffix + case $mach_suffix in + "omap1" | "omap2") + plat_suffix="omap" + ;; + esac + + if [ ! -d ${tree}arch/$SRCARCH/mach-$mach_suffix ]; then + echo "Warning: arch/arm/mach-$mach_suffix/ not found." >&2 + echo " Fix your \$SUBARCH appropriately" >&2 + fi + for i in $subarchdir; do case "$i" in - *"mach-"${SUBARCH}) + *"mach-"${mach_suffix}) ;; - *"plat-"${SUBARCH}) + *"plat-"${plat_suffix}) ;; *) subarchprune="$subarchprune \ diff --git a/sound/core/misc.c b/sound/core/misc.c index f2e8226c88fb..21b228046e88 100644 --- a/sound/core/misc.c +++ b/sound/core/misc.c @@ -71,6 +71,7 @@ void __snd_printk(unsigned int level, const char *path, int line, int kern_level; struct va_format vaf; char verbose_fmt[] = KERN_DEFAULT "ALSA %s:%d %pV"; + bool level_found = false; #endif #ifdef CONFIG_SND_DEBUG @@ -83,15 +84,22 @@ void __snd_printk(unsigned int level, const char *path, int line, vaf.fmt = format; vaf.va = &args; - kern_level = printk_get_level(format); - if (kern_level) { - const char *end_of_header = printk_skip_level(format); - memcpy(verbose_fmt, format, end_of_header - format); + while ((kern_level = printk_get_level(vaf.fmt)) != 0) { + const char *end_of_header = printk_skip_level(vaf.fmt); + + /* Ignore KERN_CONT. We print filename:line for each piece. */ + if (kern_level >= '0' && kern_level <= '7') { + memcpy(verbose_fmt, vaf.fmt, end_of_header - vaf.fmt); + level_found = true; + } + vaf.fmt = end_of_header; - } else if (level) + } + + if (!level_found && level) memcpy(verbose_fmt, KERN_DEBUG, sizeof(KERN_DEBUG) - 1); - printk(verbose_fmt, sanity_file_name(path), line, &vaf); + printk(verbose_fmt, sanity_file_name(path), line, &vaf); #else vprintk(format, args); #endif diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 05d7bc488971..d1be94667a30 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -146,7 +146,7 @@ static void multiorder_check(unsigned long index, int order) slot = radix_tree_lookup_slot(&tree, index); free(*slot); - radix_tree_replace_slot(slot, item2); + radix_tree_replace_slot(&tree, slot, item2); for (i = min; i < max; i++) { struct item *item = item_lookup(&tree, i); assert(item != 0); |